From a93f4d8ff9011d75f29b846441da47234e7e23fe Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:16:19 +0100 Subject: [PATCH 01/89] Add multi-series output paths and refactor processors Introduce multi-series output layout and central path helpers, plus a large refactor of processor and character modules. Added get_base_output_dir/get_output_path and per-setting get_output_dir methods so CLI commands resolve default output directories by series name; CLI commands were updated to accept --name/series and to use these per-series paths. Moved many processing modules under preprocessor/processors and reorganized character and video code into subpackages (preprocessor/characters/search, preprocessor/characters/face, preprocessor/video/helpers, preprocessor/video/subprocessors). Pipeline steps and orchestrator now use series-aware paths; README updated with multi-series examples and migration notes. Also added core path/processor registry/factory stubs to support the new layout. 
--- preprocessor/README.md | 51 +- preprocessor/characters/face/__init__.py | 0 .../{ => face}/face_detection_utils.py | 0 preprocessor/characters/{ => face}/utils.py | 0 preprocessor/characters/reference/__init__.py | 0 .../{ => reference}/reference_downloader.py | 13 +- .../{ => reference}/reference_processor.py | 5 +- preprocessor/characters/search/__init__.py | 0 .../{ => search}/base_image_search.py | 0 .../{ => search}/duckduckgo_search.py | 2 +- .../{ => search}/google_image_search.py | 2 +- preprocessor/cli/commands/analyze_text.py | 2 +- preprocessor/cli/commands/detect_scenes.py | 10 +- preprocessor/cli/commands/export_frames.py | 11 +- preprocessor/cli/commands/fix_unicode.py | 5 +- .../cli/commands/generate_archives.py | 19 +- .../commands/generate_elastic_documents.py | 10 +- .../cli/commands/generate_embeddings.py | 17 +- preprocessor/cli/commands/image_hashing.py | 11 +- .../cli/commands/import_transcriptions.py | 7 +- preprocessor/cli/commands/index.py | 14 +- .../commands/process_character_references.py | 11 +- preprocessor/cli/commands/run_all.py | 29 +- preprocessor/cli/commands/separate_sounds.py | 5 +- preprocessor/cli/commands/transcode.py | 10 +- preprocessor/cli/commands/transcribe.py | 7 +- .../cli/commands/transcribe_elevenlabs.py | 5 +- preprocessor/cli/pipeline/orchestrator.py | 6 +- preprocessor/cli/pipeline/steps.py | 108 +-- preprocessor/config/config.py | 78 +- preprocessor/core/base_processor.py | 96 ++ preprocessor/core/episode_file_finder.py | 5 +- preprocessor/core/episode_manager.py | 35 +- preprocessor/core/output_path_builder.py | 72 -- preprocessor/core/path_manager.py | 51 ++ preprocessor/core/processor_factory.py | 64 ++ preprocessor/core/processor_registry.py | 48 + .../embeddings/embedding_generator.py | 42 +- .../embeddings/episode_name_embedder.py | 42 +- preprocessor/processors/__init__.py | 0 .../archive_generator.py | 25 +- .../character_detector.py} | 26 +- .../elastic_document_generator.py | 95 +- 
.../elasticsearch_indexer.py} | 20 +- .../processors/embedding_generator.py | 821 ++++++++++++++++++ .../{video => processors}/frame_exporter.py | 20 +- .../image_hash_processor.py | 20 +- .../{video => processors}/scene_detector.py | 15 +- .../text_analyzer.py | 14 +- .../transcription_generator.py} | 53 +- .../transcription_importer.py} | 14 +- .../video_transcoder.py} | 17 +- preprocessor/scraping/base_scraper.py | 2 +- preprocessor/scraping/character_scraper.py | 3 + preprocessor/scraping/episode_scraper.py | 3 + preprocessor/text_analysis/__init__.py | 3 +- preprocessor/transcription/elevenlabs.py | 8 +- .../generators/multi_format_generator.py | 38 +- .../processors/episode_info_processor.py | 3 +- .../processors/sound_separator.py | 7 +- .../transcription/processors/unicode_fixer.py | 7 +- preprocessor/utils/detection_io.py | 23 +- preprocessor/utils/image_hash_utils.py | 25 +- preprocessor/validation/episode_stats.py | 42 +- preprocessor/validation/global_validator.py | 3 +- preprocessor/validation/validator.py | 7 +- preprocessor/video/helpers/__init__.py | 0 .../{ => helpers}/base_video_processor.py | 0 .../video/{ => helpers}/frame_processor.py | 7 +- preprocessor/video/subprocessors/__init__.py | 0 .../emotion_detection_subprocessor.py | 9 +- .../face_clustering_subprocessor.py | 11 +- .../frame_subprocessors.py | 41 +- 73 files changed, 1821 insertions(+), 454 deletions(-) create mode 100644 preprocessor/characters/face/__init__.py rename preprocessor/characters/{ => face}/face_detection_utils.py (100%) rename preprocessor/characters/{ => face}/utils.py (100%) create mode 100644 preprocessor/characters/reference/__init__.py rename preprocessor/characters/{ => reference}/reference_downloader.py (96%) rename preprocessor/characters/{ => reference}/reference_processor.py (99%) create mode 100644 preprocessor/characters/search/__init__.py rename preprocessor/characters/{ => search}/base_image_search.py (100%) rename preprocessor/characters/{ => 
search}/duckduckgo_search.py (83%) rename preprocessor/characters/{ => search}/google_image_search.py (92%) delete mode 100644 preprocessor/core/output_path_builder.py create mode 100644 preprocessor/core/path_manager.py create mode 100644 preprocessor/core/processor_factory.py create mode 100644 preprocessor/core/processor_registry.py create mode 100644 preprocessor/processors/__init__.py rename preprocessor/{indexing => processors}/archive_generator.py (89%) rename preprocessor/{characters/detector.py => processors/character_detector.py} (82%) rename preprocessor/{indexing => processors}/elastic_document_generator.py (90%) rename preprocessor/{indexing/elasticsearch.py => processors/elasticsearch_indexer.py} (95%) create mode 100644 preprocessor/processors/embedding_generator.py rename preprocessor/{video => processors}/frame_exporter.py (94%) rename preprocessor/{hashing => processors}/image_hash_processor.py (90%) rename preprocessor/{video => processors}/scene_detector.py (93%) rename preprocessor/{text_analysis => processors}/text_analyzer.py (90%) rename preprocessor/{transcription/generator.py => processors/transcription_generator.py} (82%) rename preprocessor/{transcription/importer.py => processors/transcription_importer.py} (93%) rename preprocessor/{video/transcoder.py => processors/video_transcoder.py} (94%) create mode 100644 preprocessor/video/helpers/__init__.py rename preprocessor/video/{ => helpers}/base_video_processor.py (100%) rename preprocessor/video/{ => helpers}/frame_processor.py (95%) create mode 100644 preprocessor/video/subprocessors/__init__.py rename preprocessor/video/{ => subprocessors}/emotion_detection_subprocessor.py (90%) rename preprocessor/video/{ => subprocessors}/face_clustering_subprocessor.py (94%) rename preprocessor/video/{ => subprocessors}/frame_subprocessors.py (91%) diff --git a/preprocessor/README.md b/preprocessor/README.md index fe3306875..82d2ab8c3 100644 --- a/preprocessor/README.md +++ b/preprocessor/README.md 
@@ -15,15 +15,15 @@ cp /twoje/wideo/*.mp4 input_data/videos/ docker compose build # Pełny pipeline z scrapingiem -./run-preprocessor.sh run-all /input_data/videos \ +./run-preprocessor.sh run-all /input_data/ranczo \ --scrape-urls https://example.com/wiki/Seria \ --character-urls https://example.com/wiki/Postacie \ - --series-name nazwa_serii + --series-name ranczo # Z gotowymi metadanymi -./run-preprocessor.sh run-all /input_data/videos \ - --episodes-info-json /input_data/episodes.json \ - --series-name nazwa_serii +./run-preprocessor.sh run-all /input_data/kiepscy \ + --episodes-info-json /input_data/kiepscy_episodes.json \ + --series-name kiepscy # Pomiń transkodowanie i transkrypcję (użyj istniejących) ./run-preprocessor.sh run-all /input_data/videos \ @@ -129,10 +129,49 @@ SCRAPING PROCESSING INDEXING --- -## Struktura output +## Multi-Series Support + +Pipeline wspiera przetwarzanie wielu seriali jednocześnie. Każdy serial ma dedykowany folder: +**Input struktura:** +``` +input_data/ +├── ranczo/ +│ ├── S01/ +│ ├── S02/ +│ └── S03/ +└── kiepscy/ + ├── S01/ + └── S02/ +``` + +**Output struktura:** ``` output_data/ +├── ranczo/ +│ ├── transcoded_videos/ +│ ├── transcriptions/ +│ ├── ranczo_episodes.json +│ ├── ranczo_characters.json +│ └── ... +└── kiepscy/ + ├── transcoded_videos/ + ├── kiepscy_episodes.json + └── ... 
+``` + +**Migracja ze starej struktury:** +```bash +mkdir -p input_data/{series_name} +mv input_data/S* input_data/{series_name}/ +``` + +--- + +## Struktura output (per serial) + +``` +output_data/{series_name}/ ├── transcoded_videos/ # MP4 h264_nvenc (720p) ├── transcriptions/ # raw/ • clean/ • sound_events/ ├── scene_timestamps/ # JSON z timestampami scen diff --git a/preprocessor/characters/face/__init__.py b/preprocessor/characters/face/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/characters/face_detection_utils.py b/preprocessor/characters/face/face_detection_utils.py similarity index 100% rename from preprocessor/characters/face_detection_utils.py rename to preprocessor/characters/face/face_detection_utils.py diff --git a/preprocessor/characters/utils.py b/preprocessor/characters/face/utils.py similarity index 100% rename from preprocessor/characters/utils.py rename to preprocessor/characters/face/utils.py diff --git a/preprocessor/characters/reference/__init__.py b/preprocessor/characters/reference/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/characters/reference_downloader.py b/preprocessor/characters/reference/reference_downloader.py similarity index 96% rename from preprocessor/characters/reference_downloader.py rename to preprocessor/characters/reference/reference_downloader.py index 5108363dd..6eabc14be 100644 --- a/preprocessor/characters/reference_downloader.py +++ b/preprocessor/characters/reference/reference_downloader.py @@ -21,10 +21,10 @@ sync_playwright, ) -from preprocessor.characters.base_image_search import BaseImageSearch -from preprocessor.characters.duckduckgo_search import DuckDuckGoImageSearch -from preprocessor.characters.google_image_search import GoogleImageSearch -from preprocessor.characters.utils import init_face_detection +from preprocessor.characters.search.base_image_search import BaseImageSearch +from 
preprocessor.characters.search.duckduckgo_search import DuckDuckGoImageSearch +from preprocessor.characters.search.google_image_search import GoogleImageSearch +from preprocessor.characters.face.utils import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor from preprocessor.utils.console import ( @@ -44,7 +44,7 @@ def __init__(self, args: Dict[str, Any]): self.characters_json: Path = self._args["characters_json"] self.series_name: str = self._args["series_name"] - self.output_dir: Path = self._args.get("output_dir", settings.character.output_dir) + self.output_dir: Path = self._args.get("output_dir", settings.character.get_output_dir(self.series_name)) self.images_per_character: int = self._args.get( "images_per_character", settings.character.reference_images_per_character, @@ -68,6 +68,9 @@ def __create_search_engine(self) -> BaseImageSearch: def _validate_args(self, args: Dict[str, Any]) -> None: if "characters_json" not in args: raise ValueError("characters_json is required") + + def get_output_subdir(self) -> str: + return "character_references" if "series_name" not in args: raise ValueError("series_name is required") diff --git a/preprocessor/characters/reference_processor.py b/preprocessor/characters/reference/reference_processor.py similarity index 99% rename from preprocessor/characters/reference_processor.py rename to preprocessor/characters/reference/reference_processor.py index c3f8ea4d7..154a0342b 100644 --- a/preprocessor/characters/reference_processor.py +++ b/preprocessor/characters/reference/reference_processor.py @@ -15,7 +15,7 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.characters.utils import init_face_detection +from preprocessor.characters.face.utils import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, @@ -69,6 +69,9 @@ def _validate_args(self, 
args: Dict[str, Any]) -> None: if key not in args: raise ValueError(f"Missing required argument: {key}") + def get_output_subdir(self) -> str: + return "character_references" + def _load_resources(self) -> bool: self.face_app = init_face_detection() return True diff --git a/preprocessor/characters/search/__init__.py b/preprocessor/characters/search/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/characters/base_image_search.py b/preprocessor/characters/search/base_image_search.py similarity index 100% rename from preprocessor/characters/base_image_search.py rename to preprocessor/characters/search/base_image_search.py diff --git a/preprocessor/characters/duckduckgo_search.py b/preprocessor/characters/search/duckduckgo_search.py similarity index 83% rename from preprocessor/characters/duckduckgo_search.py rename to preprocessor/characters/search/duckduckgo_search.py index f70264593..c01819a32 100644 --- a/preprocessor/characters/duckduckgo_search.py +++ b/preprocessor/characters/search/duckduckgo_search.py @@ -5,7 +5,7 @@ from ddgs import DDGS -from preprocessor.characters.base_image_search import BaseImageSearch +from preprocessor.characters.search.base_image_search import BaseImageSearch class DuckDuckGoImageSearch(BaseImageSearch): diff --git a/preprocessor/characters/google_image_search.py b/preprocessor/characters/search/google_image_search.py similarity index 92% rename from preprocessor/characters/google_image_search.py rename to preprocessor/characters/search/google_image_search.py index 878a8f60e..f64bddd0b 100644 --- a/preprocessor/characters/google_image_search.py +++ b/preprocessor/characters/search/google_image_search.py @@ -5,7 +5,7 @@ from serpapi import GoogleSearch -from preprocessor.characters.base_image_search import BaseImageSearch +from preprocessor.characters.search.base_image_search import BaseImageSearch class GoogleImageSearch(BaseImageSearch): diff --git a/preprocessor/cli/commands/analyze_text.py 
b/preprocessor/cli/commands/analyze_text.py index 78e9b330c..cda8240be 100644 --- a/preprocessor/cli/commands/analyze_text.py +++ b/preprocessor/cli/commands/analyze_text.py @@ -3,7 +3,7 @@ import click -from preprocessor.text_analysis.text_analyzer import TextAnalyzer +from preprocessor.processors.text_analyzer import TextAnalyzer @click.command(context_settings={"show_default": True}) diff --git a/preprocessor/cli/commands/detect_scenes.py b/preprocessor/cli/commands/detect_scenes.py index 9a35ddbd2..2dae67025 100644 --- a/preprocessor/cli/commands/detect_scenes.py +++ b/preprocessor/cli/commands/detect_scenes.py @@ -5,7 +5,7 @@ from preprocessor.cli_utils.resource_scope import ResourceScope from preprocessor.config.config import settings -from preprocessor.video.scene_detector import SceneDetector +from preprocessor.processors.scene_detector import SceneDetector @click.command(name="detect-scenes", context_settings={"show_default": True}) @@ -13,7 +13,7 @@ @click.option( "--output-dir", type=click.Path(path_type=Path), - default=str(settings.scene_detection.output_dir), + default=None, help="Output directory for scene JSON files", ) @click.option( @@ -28,8 +28,12 @@ default=settings.scene_detection.min_scene_len, help="Minimum scene length in frames", ) -def detect_scenes(videos: Path, output_dir: Path, threshold: float, min_scene_len: int): +@click.option("--name", required=True, help="Series name") +def detect_scenes(videos: Path, output_dir: Path, threshold: float, min_scene_len: int, name: str): """Detect scene changes in videos using TransNetV2.""" + if output_dir is None: + output_dir = settings.scene_detection.get_output_dir(name) + with ResourceScope(): detector = SceneDetector( { diff --git a/preprocessor/cli/commands/export_frames.py b/preprocessor/cli/commands/export_frames.py index d5c38edbe..bf1794cd5 100644 --- a/preprocessor/cli/commands/export_frames.py +++ b/preprocessor/cli/commands/export_frames.py @@ -5,8 +5,8 @@ from preprocessor.cli.utils 
import create_state_manager from preprocessor.config.config import settings +from preprocessor.processors.frame_exporter import FrameExporter from preprocessor.utils.resolution import Resolution -from preprocessor.video.frame_exporter import FrameExporter @click.command(context_settings={"show_default": True}) @@ -20,13 +20,13 @@ @click.option( "--scene-timestamps-dir", type=click.Path(exists=True, path_type=Path), - default=str(settings.scene_detection.output_dir), + default=None, help="Directory with scene timestamps", ) @click.option( "--output-frames", type=click.Path(path_type=Path), - default=str(settings.frame_export.output_dir), + default=None, help="Output directory for exported frames", ) @click.option( @@ -47,6 +47,11 @@ def export_frames( no_state: bool, ): """Export keyframes at target resolution based on configured keyframe strategy.""" + if scene_timestamps_dir is None: + scene_timestamps_dir = settings.scene_detection.get_output_dir(name) + if output_frames is None: + output_frames = settings.frame_export.get_output_dir(name) + state_manager = create_state_manager(name, no_state) res = Resolution.from_str(resolution) diff --git a/preprocessor/cli/commands/fix_unicode.py b/preprocessor/cli/commands/fix_unicode.py index f7ad56227..74b2ffde9 100644 --- a/preprocessor/cli/commands/fix_unicode.py +++ b/preprocessor/cli/commands/fix_unicode.py @@ -12,7 +12,7 @@ @click.option( "--transcription-jsons", type=click.Path(exists=True, path_type=Path), - default=str(settings.transcription.output_dir), + default=None, help="Directory with transcription JSON files", ) @click.option( @@ -32,6 +32,9 @@ def fix_unicode( name: str, ): """Fix unicode escape sequences in transcription files.""" + if transcription_jsons is None: + transcription_jsons = settings.transcription.get_output_dir(name) + args = { "transcription_jsons": transcription_jsons, "episodes_info_json": episodes_info_json, diff --git a/preprocessor/cli/commands/generate_archives.py 
b/preprocessor/cli/commands/generate_archives.py index 9d59381db..c5b5ac008 100644 --- a/preprocessor/cli/commands/generate_archives.py +++ b/preprocessor/cli/commands/generate_archives.py @@ -8,24 +8,24 @@ name_option, ) from preprocessor.config.config import ( - BASE_OUTPUT_DIR, + get_base_output_dir, settings, ) -from preprocessor.indexing.archive_generator import ArchiveGenerator +from preprocessor.processors.archive_generator import ArchiveGenerator @click.command(name="generate-archives", context_settings={"show_default": True}) @click.option( "--elastic-documents-dir", type=click.Path(exists=True, file_okay=False, path_type=Path), - default=BASE_OUTPUT_DIR / settings.output_subdirs.elastic_documents, - help="Directory with Elasticsearch documents", + default=None, + help="Directory with Elasticsearch documents (defaults to {series_name}/elastic_documents)", ) @click.option( "--output-dir", type=click.Path(path_type=Path), - default=BASE_OUTPUT_DIR / settings.output_subdirs.archives, - help="Output directory for ZIP archives", + default=None, + help="Output directory for ZIP archives (defaults to {series_name}/archives)", ) @click.option( "--season", @@ -59,6 +59,13 @@ def generate_archives( name: str, episodes_info_json: Path, ) -> None: + base_output = get_base_output_dir(name) + + if elastic_documents_dir is None: + elastic_documents_dir = base_output / settings.output_subdirs.elastic_documents + if output_dir is None: + output_dir = base_output / settings.output_subdirs.archives + args = { "elastic_documents_dir": elastic_documents_dir, "output_dir": output_dir, diff --git a/preprocessor/cli/commands/generate_elastic_documents.py b/preprocessor/cli/commands/generate_elastic_documents.py index 3a7113e68..4d8deb648 100644 --- a/preprocessor/cli/commands/generate_elastic_documents.py +++ b/preprocessor/cli/commands/generate_elastic_documents.py @@ -7,7 +7,11 @@ episodes_info_option, name_option, ) -from preprocessor.indexing.elastic_document_generator import 
ElasticDocumentGenerator +from preprocessor.config.config import ( + get_output_path, + settings, +) +from preprocessor.processors.elastic_document_generator import ElasticDocumentGenerator @click.command(name="generate-elastic-documents", context_settings={"show_default": True}) @@ -40,7 +44,7 @@ @click.option( "--output-dir", type=click.Path(path_type=Path), - default="/app/output_data/elastic_documents", + default=None, help="Output directory", ) @name_option() @@ -55,6 +59,8 @@ def generate_elastic_documents( name: str, episodes_info_json: Path, ) -> None: + if output_dir is None: + output_dir = get_output_path(settings.output_subdirs.elastic_documents, name) args = { "transcription_jsons": transcription_jsons, "embeddings_dir": embeddings_dir, diff --git a/preprocessor/cli/commands/generate_embeddings.py b/preprocessor/cli/commands/generate_embeddings.py index 9833377ba..cc2d15eca 100644 --- a/preprocessor/cli/commands/generate_embeddings.py +++ b/preprocessor/cli/commands/generate_embeddings.py @@ -5,7 +5,7 @@ from preprocessor.cli_utils.resource_scope import ResourceScope from preprocessor.config.config import settings -from preprocessor.embeddings.embedding_generator import EmbeddingGenerator +from preprocessor.processors.embedding_generator import EmbeddingGenerator @click.command(name="generate-embeddings", context_settings={"show_default": True}) @@ -18,19 +18,19 @@ @click.option( "--frames-dir", type=click.Path(exists=True, file_okay=False, path_type=Path), - default=str(settings.frame_export.output_dir), + default=None, help="Directory with exported frames", ) @click.option( "--output-dir", type=click.Path(path_type=Path), - default=str(settings.embedding.default_output_dir), + default=None, help="Output directory", ) @click.option( "--image-hashes-dir", type=click.Path(path_type=Path), - default=str(settings.image_hash.output_dir), + default=None, help="Directory with image hashes", ) @click.option( @@ -93,6 +93,7 @@ 
default=settings.text_chunking.text_chunk_overlap, help="Number of overlapping sentences between chunks (only for --sentence-chunking)", ) +@click.option("--name", required=True, help="Series name") def generate_embeddings( # pylint: disable=too-many-arguments transcription_jsons: Path, frames_dir: Path, @@ -109,8 +110,16 @@ def generate_embeddings( # pylint: disable=too-many-arguments batch_size: int, sentences_per_chunk: int, chunk_overlap: int, + name: str, ): """Generate text and video embeddings from transcriptions and exported frames.""" + if frames_dir is None: + frames_dir = settings.frame_export.get_output_dir(name) + if output_dir is None: + output_dir = settings.embedding.get_output_dir(name) + if image_hashes_dir is None: + image_hashes_dir = settings.image_hash.get_output_dir(name) + with ResourceScope(): generator = EmbeddingGenerator( { diff --git a/preprocessor/cli/commands/image_hashing.py b/preprocessor/cli/commands/image_hashing.py index 1b80a8251..26ee7267b 100644 --- a/preprocessor/cli/commands/image_hashing.py +++ b/preprocessor/cli/commands/image_hashing.py @@ -5,14 +5,14 @@ from preprocessor.cli.utils import create_state_manager from preprocessor.config.config import settings -from preprocessor.hashing.image_hash_processor import ImageHashProcessor +from preprocessor.processors.image_hash_processor import ImageHashProcessor @click.command(context_settings={"show_default": True}) @click.option( "--frames-dir", type=click.Path(exists=True, file_okay=False, path_type=Path), - default=str(settings.frame_export.output_dir), + default=None, help="Directory with exported frames", ) @click.option( @@ -24,7 +24,7 @@ @click.option( "--output-dir", type=click.Path(path_type=Path), - default=str(settings.image_hash.output_dir), + default=None, help="Output directory for image hashes", ) @click.option( @@ -44,6 +44,11 @@ def image_hashing( no_state: bool, ): """Generate perceptual hashes for exported frames.""" + if frames_dir is None: + frames_dir = 
settings.frame_export.get_output_dir(name) + if output_dir is None: + output_dir = settings.image_hash.get_output_dir(name) + state_manager = create_state_manager(name, no_state) hasher = ImageHashProcessor( diff --git a/preprocessor/cli/commands/import_transcriptions.py b/preprocessor/cli/commands/import_transcriptions.py index 99af5534d..6ee89871a 100644 --- a/preprocessor/cli/commands/import_transcriptions.py +++ b/preprocessor/cli/commands/import_transcriptions.py @@ -5,7 +5,7 @@ from preprocessor.cli.utils import create_state_manager from preprocessor.config.config import settings -from preprocessor.transcription.importer import TranscriptionImporter +from preprocessor.processors.transcription_importer import TranscriptionImporter from preprocessor.utils.console import console @@ -19,7 +19,7 @@ @click.option( "--output-dir", type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), + default=None, help="Output directory for converted transcriptions", ) @click.option( @@ -44,6 +44,9 @@ def import_transcriptions( no_state: bool, ): """Import and convert transcriptions from external sources.""" + if output_dir is None: + output_dir = settings.transcription.get_output_dir(name) + state_manager = create_state_manager(name, no_state) importer = TranscriptionImporter( diff --git a/preprocessor/cli/commands/index.py b/preprocessor/cli/commands/index.py index 29aea19d1..c4aeb258b 100644 --- a/preprocessor/cli/commands/index.py +++ b/preprocessor/cli/commands/index.py @@ -3,22 +3,28 @@ import click -from preprocessor.config.config import settings -from preprocessor.indexing.elasticsearch import ElasticSearchIndexer +from preprocessor.config.config import ( + get_output_path, + settings, +) +from preprocessor.processors.elasticsearch_indexer import ElasticSearchIndexer @click.command() -@click.option("--name", required=True, help="Elasticsearch index name") +@click.option("--name", required=True, help="Elasticsearch index name (also used as 
series name for path resolution)") @click.option( "--elastic-documents-dir", type=click.Path(exists=True, path_type=Path), - default=str(settings.elastic_documents.output_dir) if hasattr(settings, 'elastic_documents') else "/app/output_data/elastic_documents", + default=None, help="Directory with generated elastic documents", ) @click.option("--dry-run", is_flag=True, help="Validate without sending to Elasticsearch") @click.option("--append", is_flag=True, help="Append to existing indices instead of recreating") def index(name: str, elastic_documents_dir: Path, dry_run: bool, append: bool): """Index documents into Elasticsearch (creates 3 indices: segments, text_embeddings, video_frames).""" + if elastic_documents_dir is None: + elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents, name) + indexer = ElasticSearchIndexer({ "name": name, "elastic_documents_dir": elastic_documents_dir, diff --git a/preprocessor/cli/commands/process_character_references.py b/preprocessor/cli/commands/process_character_references.py index 4b9d1a432..627d9ea16 100644 --- a/preprocessor/cli/commands/process_character_references.py +++ b/preprocessor/cli/commands/process_character_references.py @@ -3,7 +3,7 @@ import click -from preprocessor.characters.reference_processor import CharacterReferenceProcessor +from preprocessor.characters.reference.reference_processor import CharacterReferenceProcessor from preprocessor.cli.utils import create_state_manager from preprocessor.config.config import settings @@ -12,13 +12,13 @@ @click.option( "--characters-dir", type=click.Path(exists=True, path_type=Path), - default=str(settings.character.output_dir), + default=None, help="Directory with character reference images", ) @click.option( "--output-dir", type=click.Path(path_type=Path), - default=str(settings.character.processed_references_dir), + default=None, help="Output directory for processed references", ) @click.option( @@ -43,6 +43,11 @@ def 
process_character_references( no_state: bool, ): """Process character reference images to identify and extract common faces.""" + if characters_dir is None: + characters_dir = settings.character.get_output_dir(name) + if output_dir is None: + output_dir = settings.character.get_processed_references_dir(name) + state_manager = create_state_manager(name, no_state) processor = CharacterReferenceProcessor( diff --git a/preprocessor/cli/commands/run_all.py b/preprocessor/cli/commands/run_all.py index 76cda6071..4a2648f82 100644 --- a/preprocessor/cli/commands/run_all.py +++ b/preprocessor/cli/commands/run_all.py @@ -24,7 +24,10 @@ run_validation_step, ) from preprocessor.cli.utils import create_state_manager -from preprocessor.config.config import settings +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.utils.console import console from preprocessor.utils.resolution import Resolution @@ -44,14 +47,14 @@ @click.option( "--transcription-jsons", type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), - help="Output directory for transcription JSONs", + default=None, + help="Output directory for transcription JSONs (defaults to {series_name}/transcriptions)", ) @click.option( "--scene-timestamps-dir", type=click.Path(path_type=Path), - default=str(settings.scene_detection.output_dir), - help="Output directory for scene timestamps", + default=None, + help="Output directory for scene timestamps (defaults to {series_name}/scene_timestamps)", ) @click.option("--series-name", required=True, help="Series name") @click.option( @@ -179,12 +182,16 @@ def run_all( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat ): """Run complete video processing pipeline.""" if transcoded_videos is None: # pylint: disable=duplicate-code - transcoded_videos = settings.transcode.output_dir + transcoded_videos = settings.transcode.get_output_dir(series_name) if codec is None: codec = settings.transcode.codec 
+ if transcription_jsons is None: + transcription_jsons = settings.transcription.get_output_dir(series_name) + if scene_timestamps_dir is None: + scene_timestamps_dir = settings.scene_detection.get_output_dir(series_name) if not episodes_info_json: - default_episodes_json = Path("/app/output_data") / f"{series_name}_episodes.json" + default_episodes_json = get_base_output_dir(series_name) / f"{series_name}_episodes.json" if default_episodes_json.exists(): episodes_info_json = default_episodes_json console.print(f"[cyan]Using existing episodes JSON: {episodes_info_json}[/cyan]") @@ -197,7 +204,7 @@ def run_all( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat sys.exit(1) characters_json = None - default_characters_json = Path("/app/output_data") / f"{series_name}_characters.json" + default_characters_json = get_base_output_dir(series_name) / f"{series_name}_characters.json" if default_characters_json.exists(): characters_json = default_characters_json @@ -206,7 +213,7 @@ def run_all( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat characters_json = default_characters_json console.print(f"[cyan]Will scrape characters to: {characters_json}[/cyan]") else: - characters_json = settings.character.characters_list_file + characters_json = settings.character.get_characters_list_file(series_name) if characters_json and Path(characters_json).exists(): console.print(f"[cyan]Using default characters JSON: {characters_json}[/cyan]") else: @@ -223,7 +230,7 @@ def run_all( # pylint: disable=too-many-arguments,too-many-locals,too-many-stat "transcoded_videos": transcoded_videos, "transcription_jsons": transcription_jsons, "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": settings.frame_export.output_dir, + "output_frames": settings.frame_export.get_output_dir(series_name), "name": series_name, "resolution": resolution, "codec": codec, @@ -252,7 +259,7 @@ def run_all( # pylint: 
disable=too-many-arguments,too-many-locals,too-many-stat "skip_full_episode": skip_full_episode, } - metadata_output_dir = Path("/app/output_data/processing_metadata") + metadata_output_dir = get_base_output_dir(series_name) / "processing_metadata" orchestrator = PipelineOrchestrator( state_manager=state_manager, diff --git a/preprocessor/cli/commands/separate_sounds.py b/preprocessor/cli/commands/separate_sounds.py index cc7f7bc88..1358f0e96 100644 --- a/preprocessor/cli/commands/separate_sounds.py +++ b/preprocessor/cli/commands/separate_sounds.py @@ -12,7 +12,7 @@ @click.option( "--transcription-dir", type=click.Path(exists=True, path_type=Path), - default=str(settings.transcription.output_dir), + default=None, help="Directory with transcription JSON files", ) @click.option( @@ -32,6 +32,9 @@ def separate_sounds( series_name: str, ): """Separate sound events from dialogues in transcription files.""" + if transcription_dir is None: + transcription_dir = settings.transcription.get_output_dir(series_name) + args = { "transcription_dir": transcription_dir, "episodes_info_json": episodes_info_json, diff --git a/preprocessor/cli/commands/transcode.py b/preprocessor/cli/commands/transcode.py index 38dc1e760..4b50c3e07 100644 --- a/preprocessor/cli/commands/transcode.py +++ b/preprocessor/cli/commands/transcode.py @@ -9,8 +9,8 @@ TranscodeConfig, settings, ) +from preprocessor.processors.video_transcoder import VideoTranscoder from preprocessor.utils.resolution import Resolution -from preprocessor.video.transcoder import VideoTranscoder @click.command(context_settings={"show_default": True}) @@ -18,7 +18,7 @@ @click.option( "--transcoded-videos", type=click.Path(path_type=Path), - default=str(settings.transcode.output_dir), + default=None, help="Output directory for transcoded videos", ) @click.option( @@ -55,7 +55,11 @@ def transcode( ): """Transcode videos to target resolution with FFmpeg.""" if transcoded_videos is None: # pylint: disable=duplicate-code - 
transcoded_videos = settings.transcode.output_dir + if name: + transcoded_videos = settings.transcode.get_output_dir(name) + else: + from preprocessor.config.config import BASE_OUTPUT_DIR # pylint: disable=import-outside-toplevel + transcoded_videos = BASE_OUTPUT_DIR / "transcoded_videos" if codec is None: codec = settings.transcode.codec if gop_size is None: diff --git a/preprocessor/cli/commands/transcribe.py b/preprocessor/cli/commands/transcribe.py index dec170f73..a156333c3 100644 --- a/preprocessor/cli/commands/transcribe.py +++ b/preprocessor/cli/commands/transcribe.py @@ -9,7 +9,7 @@ TranscriptionConfig, settings, ) -from preprocessor.transcription.generator import TranscriptionGenerator +from preprocessor.processors.transcription_generator import TranscriptionGenerator # pylint: disable=duplicate-code @@ -26,7 +26,7 @@ @click.option( "--transcription-jsons", type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), + default=None, help="Output directory for transcription JSONs", ) @click.option( @@ -59,6 +59,9 @@ def transcribe( name: str, ): """Generate transcriptions using Whisper.""" + if transcription_jsons is None: + transcription_jsons = settings.transcription.get_output_dir(name) + config = TranscriptionConfig( videos=videos, episodes_info_json=episodes_info_json, diff --git a/preprocessor/cli/commands/transcribe_elevenlabs.py b/preprocessor/cli/commands/transcribe_elevenlabs.py index 4814b4273..ea2535a76 100644 --- a/preprocessor/cli/commands/transcribe_elevenlabs.py +++ b/preprocessor/cli/commands/transcribe_elevenlabs.py @@ -14,7 +14,7 @@ @click.option( "--output-dir", type=click.Path(path_type=Path), - default=str(settings.transcription.output_dir), + default=None, help="Output directory for transcriptions", ) @click.option( @@ -56,6 +56,9 @@ def transcribe_elevenlabs( no_state: bool, ): """Transcribe videos using ElevenLabs API.""" + if output_dir is None: + output_dir = settings.transcription.get_output_dir(name) + 
state_manager = create_state_manager(name, no_state) transcriber = ElevenLabsTranscriber( diff --git a/preprocessor/cli/pipeline/orchestrator.py b/preprocessor/cli/pipeline/orchestrator.py index 500def747..72cfa6ba1 100644 --- a/preprocessor/cli/pipeline/orchestrator.py +++ b/preprocessor/cli/pipeline/orchestrator.py @@ -117,7 +117,7 @@ def __collect_additional_statistics(self) -> Dict[str, Any]: # pylint: disable= total_size = sum(f.stat().st_size for f in video_files if f.is_file()) stats["transcoded_videos_total_size_mb"] = round(total_size / (1024 * 1024), 2) - output_frames_dir = Path(settings.frame_export.output_dir) + output_frames_dir = Path(settings.frame_export.get_output_dir(self.series_name)) if output_frames_dir.exists(): frame_metadata_files = list(output_frames_dir.rglob("*_frame_metadata.json")) stats["processed_episodes_count"] = len(frame_metadata_files) @@ -131,14 +131,14 @@ def __collect_additional_statistics(self) -> Dict[str, Any]: # pylint: disable= pass stats["total_frames_extracted"] = total_frames - embeddings_dir = Path(settings.embedding.default_output_dir) + embeddings_dir = Path(settings.embedding.get_output_dir(self.series_name)) if embeddings_dir.exists(): text_embedding_files = list(embeddings_dir.rglob("*_embeddings_text.json")) video_embedding_files = list(embeddings_dir.rglob("*_embeddings_video.json")) stats["text_embedding_files_count"] = len(text_embedding_files) stats["video_embedding_files_count"] = len(video_embedding_files) - image_hashes_dir = Path(settings.image_hash.output_dir) + image_hashes_dir = Path(settings.image_hash.get_output_dir(self.series_name)) if image_hashes_dir.exists(): hash_files = list(image_hashes_dir.rglob("*_image_hashes.json")) stats["image_hash_files_count"] = len(hash_files) diff --git a/preprocessor/cli/pipeline/steps.py b/preprocessor/cli/pipeline/steps.py index f304ba09b..4030e33b7 100644 --- a/preprocessor/cli/pipeline/steps.py +++ b/preprocessor/cli/pipeline/steps.py @@ -1,12 +1,15 @@ from 
pathlib import Path -from preprocessor.config.config import settings +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS from preprocessor.utils.console import console -from preprocessor.video.emotion_detection_subprocessor import EmotionDetectionSubProcessor -from preprocessor.video.face_clustering_subprocessor import FaceClusteringSubProcessor -from preprocessor.video.frame_processor import FrameProcessor -from preprocessor.video.frame_subprocessors import ( +from preprocessor.video.helpers.frame_processor import FrameProcessor +from preprocessor.video.subprocessors.emotion_detection_subprocessor import EmotionDetectionSubProcessor +from preprocessor.video.subprocessors.face_clustering_subprocessor import FaceClusteringSubProcessor +from preprocessor.video.subprocessors.frame_subprocessors import ( CharacterDetectionSubProcessor, CharacterDetectionVisualizationSubProcessor, ImageHashSubProcessor, @@ -82,7 +85,9 @@ def run_character_scrape_step(character_urls, characters_json, name, parser_mode def run_character_reference_download_step(name, characters_json, search_mode="normal", **_kwargs): - from preprocessor.characters.reference_downloader import CharacterReferenceDownloader # pylint: disable=import-outside-toplevel + from preprocessor.characters.reference.reference_downloader import ( + CharacterReferenceDownloader, # pylint: disable=import-outside-toplevel + ) if not characters_json.exists(): console.print("[yellow]No characters.json found, skipping reference download[/yellow]") @@ -92,7 +97,7 @@ def run_character_reference_download_step(name, characters_json, search_mode="no { "characters_json": characters_json, "series_name": name, - "output_dir": settings.character.output_dir, + "output_dir": settings.character.get_output_dir(name), "images_per_character": settings.character.reference_images_per_character, "search_mode": search_mode, }, @@ -101,9 +106,9 @@ def 
run_character_reference_download_step(name, characters_json, search_mode="no def run_character_reference_processing_step(name, state_manager, interactive_character_processing=False, debug_visualizations=False, **_kwargs): - from preprocessor.characters.reference_processor import CharacterReferenceProcessor # pylint: disable=import-outside-toplevel + from preprocessor.characters.reference.reference_processor import CharacterReferenceProcessor # pylint: disable=import-outside-toplevel - characters_dir = settings.character.output_dir + characters_dir = settings.character.get_output_dir(name) if not characters_dir.exists() or not list(characters_dir.iterdir()): console.print("[yellow]No character references found, skipping processing[/yellow]") return 0 @@ -111,7 +116,7 @@ def run_character_reference_processing_step(name, state_manager, interactive_cha processor = CharacterReferenceProcessor( { "characters_dir": characters_dir, - "output_dir": settings.character.processed_references_dir, + "output_dir": settings.character.get_processed_references_dir(name), "similarity_threshold": settings.character.reference_matching_threshold, "interactive": interactive_character_processing, "series_name": name, @@ -127,13 +132,13 @@ def run_character_reference_processing_step(name, state_manager, interactive_cha def run_character_detection_step(**kwargs): - from preprocessor.characters.detector import CharacterDetector # pylint: disable=import-outside-toplevel + from preprocessor.processors.character_detector import CharacterDetector # pylint: disable=import-outside-toplevel - frames_dir = kwargs.get("output_frames", settings.frame_export.output_dir) - characters_dir = settings.character.output_dir - output_dir = settings.character.detections_dir - episodes_info_json = kwargs.get("episodes_info_json") name = kwargs.get("name") + frames_dir = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) + characters_dir = settings.character.get_output_dir(name) + output_dir 
= settings.character.get_detections_dir(name) + episodes_info_json = kwargs.get("episodes_info_json") state_manager = kwargs.get("state_manager") detector = CharacterDetector( @@ -151,8 +156,8 @@ def run_character_detection_step(**kwargs): def run_transcode_step(videos, episodes_info_json, name, resolution, codec, state_manager, **kwargs): from preprocessor.config.config import TranscodeConfig # pylint: disable=import-outside-toplevel + from preprocessor.processors.video_transcoder import VideoTranscoder # pylint: disable=import-outside-toplevel from preprocessor.utils.resolution import Resolution # pylint: disable=import-outside-toplevel - from preprocessor.video.transcoder import VideoTranscoder # pylint: disable=import-outside-toplevel transcoded_videos = kwargs.get("transcoded_videos") @@ -206,7 +211,7 @@ def run_transcribe_step(videos, episodes_info_json, name, model, language, devic return transcriber.work() from preprocessor.config.config import TranscriptionConfig # pylint: disable=import-outside-toplevel - from preprocessor.transcription.generator import TranscriptionGenerator # pylint: disable=import-outside-toplevel + from preprocessor.processors.transcription_generator import TranscriptionGenerator # pylint: disable=import-outside-toplevel console.print("[cyan]Using normal transcription mode (Whisper)[/cyan]") @@ -244,7 +249,7 @@ def run_sound_separation_step(name, episodes_info_json, transcription_jsons, sta def run_scene_step(device, **kwargs): - from preprocessor.video.scene_detector import SceneDetector # pylint: disable=import-outside-toplevel + from preprocessor.processors.scene_detector import SceneDetector # pylint: disable=import-outside-toplevel videos = kwargs.get("videos") scene_timestamps_dir = kwargs.get("scene_timestamps_dir") @@ -268,13 +273,13 @@ def run_scene_step(device, **kwargs): def run_frame_export_step(state_manager, **kwargs): - from preprocessor.video.frame_exporter import FrameExporter # pylint: disable=import-outside-toplevel 
+ from preprocessor.processors.frame_exporter import FrameExporter # pylint: disable=import-outside-toplevel videos = kwargs.get("videos") scene_timestamps_dir = kwargs.get("scene_timestamps_dir") name = kwargs.get("name") episodes_info_json = kwargs.get("episodes_info_json") - output_frames = kwargs.get("output_frames", settings.frame_export.output_dir) + output_frames = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) exporter = FrameExporter( { @@ -291,16 +296,16 @@ def run_frame_export_step(state_manager, **kwargs): def run_image_hashing_step(device, state_manager, **kwargs): - from preprocessor.hashing.image_hash_processor import ImageHashProcessor # pylint: disable=import-outside-toplevel + from preprocessor.processors.image_hash_processor import ImageHashProcessor # pylint: disable=import-outside-toplevel name = kwargs.get("name") episodes_info_json = kwargs.get("episodes_info_json") - frames_dir = kwargs.get("output_frames", settings.frame_export.output_dir) + frames_dir = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) hasher = ImageHashProcessor( { "frames_dir": frames_dir, - "output_dir": settings.image_hash.output_dir, + "output_dir": settings.image_hash.get_output_dir(name), "batch_size": settings.embedding.batch_size, "device": device, "series_name": name, @@ -314,20 +319,20 @@ def run_image_hashing_step(device, state_manager, **kwargs): def run_embedding_step(device, state_manager, **kwargs): - from preprocessor.embeddings.embedding_generator import EmbeddingGenerator # pylint: disable=import-outside-toplevel + from preprocessor.processors.embedding_generator import EmbeddingGenerator # pylint: disable=import-outside-toplevel transcription_jsons = kwargs.get("transcription_jsons") name = kwargs.get("name") episodes_info_json = kwargs.get("episodes_info_json") - frames_dir = kwargs.get("output_frames", settings.frame_export.output_dir) + frames_dir = kwargs.get("output_frames", 
settings.frame_export.get_output_dir(name)) skip_full_episode = kwargs.get("skip_full_episode", False) embedding_generator = EmbeddingGenerator( { "transcription_jsons": transcription_jsons, "frames_dir": frames_dir, - "output_dir": settings.embedding.default_output_dir, - "image_hashes_dir": settings.image_hash.output_dir, + "output_dir": settings.embedding.get_output_dir(name), + "image_hashes_dir": settings.image_hash.get_output_dir(name), "model": settings.embedding_model.model_name, "segments_per_embedding": settings.text_chunking.segments_per_embedding, "generate_text": True, @@ -346,18 +351,17 @@ def run_embedding_step(device, state_manager, **kwargs): def run_elastic_documents_step(**kwargs): - from preprocessor.config.config import ( # pylint: disable=import-outside-toplevel - BASE_OUTPUT_DIR, - get_output_path, - ) - from preprocessor.indexing.elastic_document_generator import ElasticDocumentGenerator # pylint: disable=import-outside-toplevel + from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel + from preprocessor.processors.elastic_document_generator import ElasticDocumentGenerator # pylint: disable=import-outside-toplevel - transcription_jsons = BASE_OUTPUT_DIR / settings.output_subdirs.transcriptions - embeddings_dir = BASE_OUTPUT_DIR / settings.output_subdirs.embeddings - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") or (BASE_OUTPUT_DIR / settings.output_subdirs.scenes) - character_detections_dir = BASE_OUTPUT_DIR / settings.output_subdirs.character_detections - object_detections_dir = BASE_OUTPUT_DIR / settings.output_subdirs.object_detections name = kwargs.get("name") + base_output = get_base_output_dir(name) + + transcription_jsons = base_output / settings.output_subdirs.transcriptions + embeddings_dir = base_output / settings.output_subdirs.embeddings + scene_timestamps_dir = kwargs.get("scene_timestamps_dir") or (base_output / settings.output_subdirs.scenes) + character_detections_dir = 
base_output / settings.output_subdirs.character_detections + object_detections_dir = base_output / settings.output_subdirs.object_detections episodes_info_json = kwargs.get("episodes_info_json") generator = ElasticDocumentGenerator( @@ -377,7 +381,7 @@ def run_elastic_documents_step(**kwargs): def run_index_step(name, dry_run, state_manager, **kwargs): from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel - from preprocessor.indexing.elasticsearch import ElasticSearchIndexer # pylint: disable=import-outside-toplevel + from preprocessor.processors.elasticsearch_indexer import ElasticSearchIndexer # pylint: disable=import-outside-toplevel episodes_info_json = kwargs.get("episodes_info_json") elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents) @@ -411,7 +415,7 @@ def run_frame_processing_step( # pylint: disable=too-many-locals,too-many-argum ): name = kwargs.get("name") episodes_info_json = kwargs.get("episodes_info_json") - output_frames = kwargs.get("output_frames", settings.frame_export.output_dir) + output_frames = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) processor = FrameProcessor( { @@ -445,7 +449,7 @@ def run_frame_processing_step( # pylint: disable=too-many-locals,too-many-argum if not skip_character_detection: char_detection_sub = CharacterDetectionSubProcessor( - characters_dir=Path(settings.character.output_dir), + characters_dir=Path(settings.character.get_output_dir(name)), use_gpu=True, threshold=settings.character.frame_detection_threshold, ) @@ -494,15 +498,16 @@ def run_frame_processing_step( # pylint: disable=too-many-locals,too-many-argum def run_validation_step(name, episodes_info_json, **kwargs): # pylint: disable=too-many-locals - from preprocessor.config.config import BASE_OUTPUT_DIR # pylint: disable=import-outside-toplevel from preprocessor.validation.global_validator import GlobalValidator # pylint: disable=import-outside-toplevel from 
preprocessor.validation.validator import Validator # pylint: disable=import-outside-toplevel + base_output = get_base_output_dir(name) + console.print("[bold cyan]Running global validation...[/bold cyan]") - global_validator = GlobalValidator(series_name=name, base_output_dir=BASE_OUTPUT_DIR) + global_validator = GlobalValidator(series_name=name, base_output_dir=base_output) global_result = global_validator.validate() - validation_reports_dir = BASE_OUTPUT_DIR / settings.output_subdirs.validation_reports + validation_reports_dir = base_output / settings.output_subdirs.validation_reports validation_reports_dir.mkdir(parents=True, exist_ok=True) from preprocessor.utils.file_utils import atomic_write_json # pylint: disable=import-outside-toplevel @@ -556,7 +561,7 @@ def run_validation_step(name, episodes_info_json, **kwargs): # pylint: disable= season=season, series_name=name, anomaly_threshold=20.0, - base_output_dir=BASE_OUTPUT_DIR, + base_output_dir=base_output, episodes_info_json=episodes_info_json, ) @@ -572,7 +577,7 @@ def run_validation_step(name, episodes_info_json, **kwargs): # pylint: disable= def run_text_analysis_step(name, episodes_info_json, language, state_manager, **_kwargs): - from preprocessor.text_analysis.text_analyzer import TextAnalyzer # pylint: disable=import-outside-toplevel + from preprocessor.processors.text_analyzer import TextAnalyzer # pylint: disable=import-outside-toplevel analyzer = TextAnalyzer( { @@ -586,15 +591,14 @@ def run_text_analysis_step(name, episodes_info_json, language, state_manager, ** def run_archive_generation_step(**kwargs): - from preprocessor.config.config import ( # pylint: disable=import-outside-toplevel - BASE_OUTPUT_DIR, - get_output_path, - ) - from preprocessor.indexing.archive_generator import ArchiveGenerator # pylint: disable=import-outside-toplevel + from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel + from preprocessor.processors.archive_generator import 
ArchiveGenerator # pylint: disable=import-outside-toplevel - elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents) - output_dir = BASE_OUTPUT_DIR / settings.output_subdirs.archives name = kwargs.get("name") + base_output = get_base_output_dir(name) + + elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents, name) + output_dir = base_output / settings.output_subdirs.archives episodes_info_json = kwargs.get("episodes_info_json") generator = ArchiveGenerator( diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 6fd98dc05..4679bd243 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -24,8 +24,15 @@ BASE_OUTPUT_DIR = Path("/app/output_data") if is_docker else Path("preprocessor/output_data") -def get_output_path(relative_path: str) -> Path: - return BASE_OUTPUT_DIR / relative_path +def get_base_output_dir(series_name: Optional[str] = None) -> Path: + base = Path("/app/output_data") if is_docker else Path("preprocessor/output_data") + if series_name: + return base / series_name.lower() + return base + + +def get_output_path(relative_path: str, series_name: Optional[str] = None) -> Path: + return get_base_output_dir(series_name) / relative_path # ============================================================================ @@ -90,13 +97,16 @@ def api_key(self) -> Optional[str]: @dataclass class TranscodeSettings: - output_dir: Path = BASE_OUTPUT_DIR / "transcoded_videos" codec: str = "h264_nvenc" target_file_size_mb: float = 50.0 target_duration_seconds: float = 100.0 audio_bitrate_kbps: int = 128 gop_size: float = 0.5 + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "transcoded_videos" + def calculate_video_bitrate_mbps(self) -> float: total_bitrate_mbps = (self.target_file_size_mb * 8) / self.target_duration_seconds audio_bitrate_mbps = self.audio_bitrate_kbps / 1000.0 @@ -117,7 +127,10 @@ def 
calculate_bufsize_mbps(self, multiplier: float = 2.0) -> float: class SceneDetectionSettings: threshold: float = 0.5 min_scene_len: int = 10 - output_dir: Path = BASE_OUTPUT_DIR / "scene_timestamps" + + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "scene_timestamps" @dataclass @@ -133,9 +146,12 @@ class KeyframeExtractionSettings: @dataclass class FrameExportSettings: - output_dir: Path = BASE_OUTPUT_DIR / "exported_frames" resolution: Resolution = Resolution.R1080P + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "exported_frames" + # ============================================================================ # TRANSCRIPTION & TEXT PROCESSING @@ -143,11 +159,14 @@ class FrameExportSettings: @dataclass class TranscriptionSettings: - output_dir: Path = BASE_OUTPUT_DIR / "transcriptions" model: str = "large-v3-turbo" language: str = "Polish" device: str = "cuda" + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "transcriptions" + @dataclass class WhisperSettings: @@ -203,13 +222,16 @@ class EmbeddingModelSettings: @dataclass class EmbeddingSettings: - default_output_dir: Path = BASE_OUTPUT_DIR / "embeddings" batch_size: int = 32 text_batch_size: int = 64 progress_sub_batch_size: int = 100 prefetch_chunks: int = 2 generate_full_episode_embedding: bool = True + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "embeddings" + # ============================================================================ # COMPUTER VISION @@ -223,11 +245,14 @@ class FaceRecognitionSettings: @dataclass class FaceClusteringSettings: - output_dir: Path = BASE_OUTPUT_DIR / "face_clusters" min_cluster_size: int = 5 min_samples: int = 3 save_noise: bool = True + @staticmethod + def get_output_dir(series_name: str) -> Path: + return 
get_base_output_dir(series_name) / "face_clusters" + @dataclass class EmotionDetectionSettings: @@ -241,26 +266,41 @@ def _from_env(cls) -> "EmotionDetectionSettings": @dataclass class CharacterSettings: - output_dir: Path = BASE_OUTPUT_DIR / "characters" reference_images_per_character: int = 3 - characters_list_file: Path = BASE_OUTPUT_DIR / "characters.json" - detections_dir: Path = BASE_OUTPUT_DIR / "character_detections" - processed_references_dir: Path = BASE_OUTPUT_DIR / "character_references_processed" normalized_face_size: Tuple[int, int] = (112, 112) face_detection_threshold: float = 0.2 reference_matching_threshold: float = 0.50 frame_detection_threshold: float = 0.55 + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "characters" + + @staticmethod + def get_characters_list_file(series_name: str) -> Path: + return get_base_output_dir(series_name) / "characters.json" -_OBJECT_DETECTIONS_DIR = BASE_OUTPUT_DIR / "object_detections" + @staticmethod + def get_detections_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "character_detections" + + @staticmethod + def get_processed_references_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "character_references_processed" @dataclass class ObjectDetectionSettings: model_name: str = "ustc-community/dfine-xlarge-obj2coco" conf_threshold: float = 0.30 - output_dir: Path = _OBJECT_DETECTIONS_DIR - visualized_output_dir: Path = _OBJECT_DETECTIONS_DIR / "visualizations" + + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "object_detections" + + @staticmethod + def get_visualized_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "object_detections" / "visualizations" # ============================================================================ @@ -269,7 +309,9 @@ class ObjectDetectionSettings: @dataclass class 
ImageHashSettings: - output_dir: Path = BASE_OUTPUT_DIR / "image_hashes" + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "image_hashes" @dataclass @@ -297,7 +339,9 @@ def serpapi_key(self) -> Optional[str]: @dataclass class ScraperSettings: - output_dir: Path = BASE_OUTPUT_DIR / "scraped_pages" + @staticmethod + def get_output_dir(series_name: str) -> Path: + return get_base_output_dir(series_name) / "scraped_pages" # ============================================================================ diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index fa8e5aa04..a3ab82bef 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -18,6 +18,7 @@ FILE_SUFFIXES, SUPPORTED_VIDEO_EXTENSIONS, ) +from preprocessor.core.file_naming import FileNamingConventions from preprocessor.core.state_manager import StateManager from preprocessor.utils.console import ( console, @@ -42,6 +43,11 @@ class OutputSpec: class BaseProcessor(ABC): SUPPORTED_VIDEO_EXTENSIONS = SUPPORTED_VIDEO_EXTENSIONS + REQUIRES: List[str] = [] + PRODUCES: List[str] = [] + PRIORITY: int = 100 + DESCRIPTION: str = "" + def __init__( self, args: Dict[str, Any], @@ -61,6 +67,12 @@ def __init__( self.state_manager: Optional[StateManager] = args.get("state_manager") self.series_name: str = args.get("series_name", "unknown") + from preprocessor.core.path_manager import PathManager # pylint: disable=import-outside-toplevel + self.path_manager: PathManager = args.get( + "path_manager", + PathManager(self.series_name), + ) + from preprocessor.utils.progress_tracker import ProgressTracker # pylint: disable=import-outside-toplevel self.progress = args.get("progress_tracker", ProgressTracker()) @@ -142,6 +154,10 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) "or override _execute() directly (legacy mode)", ) + @abstractmethod + def get_output_subdir(self) -> 
str: + pass + def __get_step_name(self) -> str: class_name = self.__class__.__name__ name = class_name.replace("Processor", "").replace("Generator", "").replace("Detector", "") @@ -284,6 +300,21 @@ def _create_video_processing_items( ) -> List[ProcessingItem]: from preprocessor.core.episode_manager import EpisodeManager # pylint: disable=import-outside-toplevel + series_name = self.series_name + + if not source_path.is_file(): + if source_path.name != series_name: + source_path = source_path / series_name + + if not source_path.exists(): + raise FileNotFoundError( + f"Input directory does not exist: {source_path}\n" + f"Expected structure: /input_data/{series_name}/S01/, /input_data/{series_name}/S02/, etc.\n\n" + f"Migration guide:\n" + f" mkdir -p /input_data/{series_name}\n" + f" mv /input_data/S* /input_data/{series_name}/", + ) + video_files = [] if source_path.is_file(): @@ -338,3 +369,68 @@ def _create_transcription_processing_item(self, transcription_file: Path) -> Pro "base_name": base_name, }, ) + + def _build_output_path( + self, + episode_info, + filename: str, + subdir: Optional[str] = None, + ) -> Path: + target_subdir = subdir if subdir is not None else self.get_output_subdir() + return self.path_manager.build_path(episode_info, target_subdir, filename) + + def _build_output_paths( + self, + episode_info, + filenames: List[str], + subdir: Optional[str] = None, + ) -> List[Path]: + return [ + self._build_output_path(episode_info, filename, subdir) + for filename in filenames + ] + + def _build_season_path( + self, + episode_info, + filename: str, + subdir: Optional[str] = None, + ) -> Path: + target_subdir = subdir if subdir is not None else self.get_output_subdir() + return self.path_manager.build_season_path(episode_info, target_subdir, filename) + + def _build_filename( + self, + episode_info, + extension: str = "json", + suffix: Optional[str] = None, + ) -> str: + if hasattr(self, 'episode_manager') and self.episode_manager: + return 
self.episode_manager.file_naming.build_filename( # pylint: disable=no-member + episode_info, + extension=extension, + suffix=suffix, + ) + + file_naming = FileNamingConventions(self.series_name) + return file_naming.build_filename( + episode_info, + extension=extension, + suffix=suffix, + ) + + def _build_single_output( + self, + item: ProcessingItem, + suffix: str, + extension: str = "json", + subdir: Optional[str] = None, + required: bool = True, + ) -> List[OutputSpec]: + episode_info = item.metadata.get("episode_info") + if not episode_info: + return [] + + filename = self._build_filename(episode_info, extension=extension, suffix=suffix) + path = self._build_output_path(episode_info, filename, subdir=subdir) + return [OutputSpec(path=path, required=required)] diff --git a/preprocessor/core/episode_file_finder.py b/preprocessor/core/episode_file_finder.py index 84556d66c..59762b919 100644 --- a/preprocessor/core/episode_file_finder.py +++ b/preprocessor/core/episode_file_finder.py @@ -11,7 +11,6 @@ from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.output_path_builder import OutputPathBuilder logger = logging.getLogger(__name__) @@ -29,7 +28,7 @@ def find_video_file(episode_info, search_dir: Path) -> Optional[Path]: return search_dir episode_code = episode_info.episode_code() - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) + season_dir_name = episode_info.season_code() search_dirs = [search_dir / season_dir_name, search_dir] for dir_path in search_dirs: @@ -52,7 +51,7 @@ def find_transcription_file( if not search_dir.exists(): return None - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) + season_dir_name = episode_info.season_code() season_dir = search_dir / season_dir_name if not season_dir.exists(): return None diff --git a/preprocessor/core/episode_manager.py b/preprocessor/core/episode_manager.py index 
e910e9b94..9d6ecb3fb 100644 --- a/preprocessor/core/episode_manager.py +++ b/preprocessor/core/episode_manager.py @@ -9,11 +9,9 @@ Optional, ) -from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION from preprocessor.core.episode_file_finder import EpisodeFileFinder from preprocessor.core.episode_parser import EpisodeInfoParser from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.output_path_builder import OutputPathBuilder from preprocessor.utils.constants import ( EpisodeMetadataKeys, EpisodesDataKeys, @@ -38,9 +36,25 @@ def episode_code(self) -> str: def season_dir_name(self) -> str: return f"S{self.season:02d}" + def season_code(self) -> str: + return f"S{self.season:02d}" + + def episode_num(self) -> str: + return f"E{self.relative_episode:02d}" + def is_special(self) -> bool: return self.season == 0 + @staticmethod + def create_minimal(season: int, episode: int, series_name: str) -> "EpisodeInfo": + return EpisodeInfo( + absolute_episode=0, + season=season, + relative_episode=episode, + title="", + series_name=series_name, + ) + class EpisodeManager: def __init__(self, episodes_info_json: Optional[Path], series_name: str): @@ -100,23 +114,6 @@ def get_episode_by_season_and_relative(self, season: int, relative_episode: int) series_name=self.series_name, ) - def build_output_path(self, episode_info: EpisodeInfo, base_dir: Path, extension: str = DEFAULT_VIDEO_EXTENSION) -> Path: - filename = self.file_naming.build_filename(episode_info, extension=extension.lstrip('.')) - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - season_dir = base_dir / season_dir_name - season_dir.mkdir(parents=True, exist_ok=True) - return season_dir / filename - - @staticmethod - def get_episode_subdir(episode_info: EpisodeInfo, subdir: str) -> Path: - return OutputPathBuilder.get_episode_dir(episode_info, subdir) - - @staticmethod - def build_episode_output_path(episode_info: EpisodeInfo, subdir: str, filename: str) -> Path: 
- return OutputPathBuilder.build_output_path(episode_info, subdir, filename) - - def build_video_path_for_elastic(self, episode_info: EpisodeInfo) -> str: - return OutputPathBuilder.build_elastic_video_path(episode_info, self.series_name) def find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool = True) -> Optional[Path]: return self.file_finder.find_transcription_file(episode_info, search_dir, prefer_segmented) diff --git a/preprocessor/core/output_path_builder.py b/preprocessor/core/output_path_builder.py deleted file mode 100644 index 6bfba6456..000000000 --- a/preprocessor/core/output_path_builder.py +++ /dev/null @@ -1,72 +0,0 @@ -from pathlib import Path - -from preprocessor.config.config import ( - BASE_OUTPUT_DIR, - settings, -) -from preprocessor.core.constants import ( - DEFAULT_VIDEO_EXTENSION, - FILE_EXTENSIONS, -) - - -class OutputPathBuilder: - @staticmethod - def get_episode_dir(episode_info, base_subdir: str) -> Path: - season_code = f"S{episode_info.season:02d}" - episode_code = f"E{episode_info.relative_episode:02d}" - return BASE_OUTPUT_DIR / base_subdir / season_code / episode_code - - @staticmethod - def get_season_dir(episode_info) -> str: - return f"S{episode_info.season:02d}" - - @staticmethod - def build_transcription_path(episode_info, filename: str, subdir: str = "raw") -> Path: - season_code = f"S{episode_info.season:02d}" - episode_code = f"E{episode_info.relative_episode:02d}" - path = BASE_OUTPUT_DIR / settings.output_subdirs.transcriptions / season_code / episode_code / subdir / filename - path.parent.mkdir(parents=True, exist_ok=True) - return path - - @staticmethod - def build_output_path(episode_info, subdir: str, filename: str) -> Path: - path = OutputPathBuilder.get_episode_dir(episode_info, subdir) / filename - path.parent.mkdir(parents=True, exist_ok=True) - return path - - @staticmethod - def build_video_path(episode_info, series_name: str, extension: str = DEFAULT_VIDEO_EXTENSION) 
-> Path: - filename = f"{series_name.lower()}_{episode_info.episode_code()}{extension}" - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - season_dir = BASE_OUTPUT_DIR / settings.output_subdirs.video / season_dir_name - season_dir.mkdir(parents=True, exist_ok=True) - return season_dir / filename - - @staticmethod - def build_elastic_video_path(episode_info, series_name: str) -> str: - filename = f"{series_name.lower()}_{episode_info.episode_code()}{FILE_EXTENSIONS['mp4']}" - season_dir_name = OutputPathBuilder.get_season_dir(episode_info) - path = Path("bot") / f"{series_name.upper()}-WIDEO" / season_dir_name / filename - return path.as_posix() - - @staticmethod - def build_embedding_path(episode_info, filename: str) -> Path: - return OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.embeddings, - filename, - ) - - @staticmethod - def build_scene_path(episode_info, filename: str) -> Path: - return OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.scenes, - filename, - ) - - @staticmethod - def build_elastic_document_path(episode_info, subdoc_type: str, filename: str) -> Path: - full_subdir = f"{settings.output_subdirs.elastic_documents}/{subdoc_type}" - return OutputPathBuilder.build_output_path(episode_info, full_subdir, filename) diff --git a/preprocessor/core/path_manager.py b/preprocessor/core/path_manager.py new file mode 100644 index 000000000..184cdd208 --- /dev/null +++ b/preprocessor/core/path_manager.py @@ -0,0 +1,51 @@ +from pathlib import Path + +from preprocessor.config.config import get_base_output_dir + + +class PathManager: + def __init__(self, series_name: str): + self._series_name = series_name.lower() + self._base_output_dir = get_base_output_dir(self._series_name) + + @property + def series_name(self) -> str: + return self._series_name + + @property + def base_output_dir(self) -> Path: + return self._base_output_dir + + def build_path( + self, + episode_info, + subdir: 
str, + filename: str, + ) -> Path: + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + + path = self._base_output_dir / subdir / season_code / episode_code / filename + path.parent.mkdir(parents=True, exist_ok=True) + + return path + + def build_season_path( + self, + episode_info, + subdir: str, + filename: str, + ) -> Path: + season_code = episode_info.season_code() + + path = self._base_output_dir / subdir / season_code / filename + path.parent.mkdir(parents=True, exist_ok=True) + + return path + + def get_episode_dir(self, episode_info, subdir: str) -> Path: + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + episode_dir = self._base_output_dir / subdir / season_code / episode_code + episode_dir.mkdir(parents=True, exist_ok=True) + return episode_dir diff --git a/preprocessor/core/processor_factory.py b/preprocessor/core/processor_factory.py new file mode 100644 index 000000000..f33f921f9 --- /dev/null +++ b/preprocessor/core/processor_factory.py @@ -0,0 +1,64 @@ +from typing import ( + Any, + Dict, + List, + Set, + Tuple, +) + +from preprocessor.core.processor_registry import ( + get_processor_class, + get_processor_info, + list_processors, +) + + +class ProcessorFactory: + @staticmethod + def create(processor_name: str, args: Dict[str, Any]): + processor_class = get_processor_class(processor_name) + return processor_class(args) + + @staticmethod + def list_available() -> List[str]: + return list_processors() + + @staticmethod + def get_info(processor_name: str) -> Dict[str, Any]: + return get_processor_info(processor_name) + + @staticmethod + def get_all_info() -> List[Dict[str, Any]]: + return [ + ProcessorFactory.get_info(name) + for name in ProcessorFactory.list_available() + ] + + @staticmethod + def build_dependency_graph() -> Dict[str, List[str]]: + graph = {} + for name in list_processors(): + info = get_processor_info(name) + graph[name] = info["requires"] + return graph + + 
@staticmethod + def validate_dependencies( + processor_name: str, + available_data: Set[str], + ) -> Tuple[bool, List[str]]: + info = get_processor_info(processor_name) + required = set(info["requires"]) + missing = required - available_data + return len(missing) == 0, sorted(missing) + + @staticmethod + def sort_by_priority(processors: List[str]) -> List[str]: + processor_info = { + name: get_processor_info(name) + for name in processors + } + return sorted( + processors, + key=lambda name: processor_info[name]["priority"], + ) diff --git a/preprocessor/core/processor_registry.py b/preprocessor/core/processor_registry.py new file mode 100644 index 000000000..171dc890e --- /dev/null +++ b/preprocessor/core/processor_registry.py @@ -0,0 +1,48 @@ +from typing import ( + Any, + Dict, + List, + Type, +) + +from preprocessor.core.base_processor import BaseProcessor + +PROCESSOR_REGISTRY: Dict[str, Type[BaseProcessor]] = {} + + +def register_processor(name: str): + def decorator(cls: Type[BaseProcessor]): + if name in PROCESSOR_REGISTRY: + raise ValueError(f"Processor '{name}' already registered!") + + PROCESSOR_REGISTRY[name] = cls + cls.PROCESSOR_NAME = name + + return cls + return decorator + + +def get_processor_class(name: str) -> Type[BaseProcessor]: + if name not in PROCESSOR_REGISTRY: + available = ", ".join(sorted(PROCESSOR_REGISTRY.keys())) + raise ValueError( + f"Unknown processor: '{name}'\n" + f"Available processors: {available}" + ) + return PROCESSOR_REGISTRY[name] + + +def list_processors() -> List[str]: + return sorted(PROCESSOR_REGISTRY.keys()) + + +def get_processor_info(name: str) -> Dict[str, Any]: + processor_class = get_processor_class(name) + return { + "name": name, + "class": processor_class.__name__, + "requires": getattr(processor_class, "REQUIRES", []), + "produces": getattr(processor_class, "PRODUCES", []), + "priority": getattr(processor_class, "PRIORITY", 100), + "description": getattr(processor_class, "DESCRIPTION", ""), + } diff --git 
a/preprocessor/embeddings/embedding_generator.py b/preprocessor/embeddings/embedding_generator.py index 81b2f00ef..0c985e281 100644 --- a/preprocessor/embeddings/embedding_generator.py +++ b/preprocessor/embeddings/embedding_generator.py @@ -21,7 +21,7 @@ ) from preprocessor.core.constants import FILE_SUFFIXES from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder @@ -35,8 +35,13 @@ # pylint: disable=duplicate-code - +@register_processor("generate_embeddings") class EmbeddingGenerator(BaseProcessor): # pylint: disable=too-many-instance-attributes + REQUIRES = ["transcriptions", "frames"] + PRODUCES = ["embeddings"] + PRIORITY = 50 + DESCRIPTION = "Generate multimodal embeddings" + def __init__(self, args: Dict[str, Any]): super().__init__( args=args, @@ -46,8 +51,8 @@ def __init__(self, args: Dict[str, Any]): ) self.transcription_jsons: Path = self._args["transcription_jsons"] - self.frames_dir: Path = self._args.get("frames_dir", settings.frame_export.output_dir) - self.output_dir: Path = self._args.get("output_dir", settings.embedding.default_output_dir) + self.frames_dir: Path = self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)) + self.output_dir: Path = self._args.get("output_dir", settings.embedding.get_output_dir(self.series_name)) self.model_name: str = self._args.get("model", settings.embedding_model.model_name) self.model_revision: str = self._args.get("model_revision", settings.embedding_model.model_revision) @@ -63,7 +68,9 @@ def __init__(self, args: Dict[str, Any]): self.generate_full_episode: bool = self._args.get("generate_full_episode", 
settings.embedding.generate_full_episode_embedding) self.generate_sound_events: bool = self._args.get("generate_sound_events", True) - self.image_hashes_dir: Path = Path(self._args.get("image_hashes_dir", settings.image_hash.output_dir)) + self.image_hashes_dir: Path = Path( + self._args.get("image_hashes_dir", settings.image_hash.get_output_dir(self.series_name)), + ) episodes_info_json = self._args.get("episodes_info_json") self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) @@ -79,6 +86,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available. This application requires GPU.") + def get_output_subdir(self) -> str: + return settings.output_subdirs.embeddings + def cleanup(self) -> None: console.print("[cyan]Unloading embedding model...[/cyan]") self.model = None @@ -154,12 +164,12 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="_embeddings_text", ) - text_output = OutputPathBuilder.build_embedding_path(episode_info, text_filename) + text_output = self._build_output_path(episode_info, text_filename) outputs.append(OutputSpec(path=text_output, required=True)) if self.generate_episode_names: episode_name_filename = f"{FILE_SUFFIXES['episode_name']}.json" - episode_name_output = OutputPathBuilder.build_embedding_path(episode_info, episode_name_filename) + episode_name_output = self._build_output_path(episode_info, episode_name_filename) outputs.append(OutputSpec(path=episode_name_output, required=True)) if self.generate_video: @@ -168,7 +178,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="_embeddings_video", ) - video_output = OutputPathBuilder.build_embedding_path(episode_info, video_filename) + video_output = self._build_output_path(episode_info, video_filename) outputs.append(OutputSpec(path=video_output, required=True)) if self.generate_full_episode: 
@@ -177,7 +187,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="_embeddings_full_episode", ) - full_episode_output = OutputPathBuilder.build_embedding_path(episode_info, full_episode_filename) + full_episode_output = self._build_output_path(episode_info, full_episode_filename) outputs.append(OutputSpec(path=full_episode_output, required=True)) if self.generate_sound_events: @@ -186,7 +196,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="_embeddings_sound_events", ) - sound_events_output = OutputPathBuilder.build_embedding_path(episode_info, sound_events_filename) + sound_events_output = self._build_output_path(episode_info, sound_events_filename) outputs.append(OutputSpec(path=sound_events_output, required=True)) return outputs @@ -642,7 +652,7 @@ def __load_frame_metadata(self, episode_info_dict: Dict[str, Any]) -> Optional[D if not episode_info_obj: return None - frames_episode_dir = self.episode_manager.get_episode_subdir(episode_info_obj, settings.output_subdirs.frames) + frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) metadata_file = frames_episode_dir / f"{self.episode_manager.series_name}_{episode_info_obj.episode_code()}_frame_metadata.json" if not metadata_file.exists(): @@ -653,7 +663,7 @@ def __load_frame_metadata(self, episode_info_dict: Dict[str, Any]) -> Optional[D return json.load(f) def __load_image_hashes(self, episode_info_dict: Dict[str, Any]) -> Dict[int, str]: - return load_image_hashes_for_episode(episode_info_dict, self.logger) + return load_image_hashes_for_episode(episode_info_dict, self.series_name, self.logger) def __generate_video_embeddings(self, episode_info_dict: Dict[str, Any], frame_metadata: Dict[str, Any]) -> List[Dict[str, Any]]: frame_requests = frame_metadata.get("frames", []) @@ -667,8 +677,8 @@ def __generate_video_embeddings(self, episode_info_dict: 
Dict[str, Any], frame_m if not episode_info_obj: return [] - frames_episode_dir = self.episode_manager.get_episode_subdir(episode_info_obj, settings.output_subdirs.frames) - episode_output_dir = self.episode_manager.get_episode_subdir(episode_info_obj, settings.output_subdirs.embeddings) + frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) + episode_output_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.embeddings) checkpoint_file = episode_output_dir / "embeddings_video_checkpoint.json" image_hashes = self.__load_image_hashes(episode_info_dict) @@ -688,8 +698,8 @@ def __generate_video_embeddings(self, episode_info_dict: Dict[str, Any], frame_m def __get_episode_output_dir(self, transcription_file: Path) -> Path: episode_info_from_file = self.episode_manager.parse_filename(transcription_file) if episode_info_from_file: - return self.episode_manager.get_episode_subdir(episode_info_from_file, settings.output_subdirs.embeddings) - return self.episode_manager.get_episode_subdir(None, settings.output_subdirs.embeddings) + return self.path_manager.get_episode_dir(episode_info_from_file, settings.output_subdirs.embeddings) + return self.path_manager.base_output_dir / settings.output_subdirs.embeddings def __save_embeddings( self, diff --git a/preprocessor/embeddings/episode_name_embedder.py b/preprocessor/embeddings/episode_name_embedder.py index 46256818c..43bd1447f 100644 --- a/preprocessor/embeddings/episode_name_embedder.py +++ b/preprocessor/embeddings/episode_name_embedder.py @@ -10,7 +10,11 @@ import numpy as np from preprocessor.config.config import settings -from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.core.episode_manager import ( + EpisodeInfo, + EpisodeManager, +) +from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.constants import EmbeddingKeys from 
preprocessor.utils.file_utils import atomic_write_json @@ -28,7 +32,7 @@ def __init__( self.model = model self.episode_manager = episode_manager self.series_name = series_name - self.output_dir = output_dir or settings.embedding.default_output_dir + self.output_dir = output_dir or settings.embedding.get_output_dir(series_name) self.logger = logger or logging.getLogger(__name__) def __generate_episode_name_embeddings( @@ -94,20 +98,19 @@ def __generate_title_embedding(self, title: str) -> Optional[np.ndarray]: @staticmethod def __save_episode_name_embedding( - season: int, + season: int, episode: int, embedding_data: Dict[str, Any], + series_name: str, ) -> Path: - from preprocessor.core.episode_manager import EpisodeInfo # pylint: disable=import-outside-toplevel - from preprocessor.core.output_path_builder import OutputPathBuilder # pylint: disable=import-outside-toplevel + path_manager = PathManager(series_name) + episode_info = EpisodeInfo.create_minimal(season, episode, series_name) - episode_info = EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", + output_file = path_manager.build_path( + episode_info, + settings.output_subdirs.embeddings, + "episode_name_embedding.json", ) - output_file = OutputPathBuilder.build_embedding_path(episode_info, "episode_name_embedding.json") atomic_write_json(output_file, embedding_data, indent=2, ensure_ascii=False) @@ -124,7 +127,7 @@ def generate_and_save_for_transcription( season = embedding_data[EmbeddingKeys.EPISODE_METADATA]["season"] episode = embedding_data[EmbeddingKeys.EPISODE_METADATA]["episode_number"] - output_file = self.__save_episode_name_embedding(season, episode, embedding_data) + output_file = self.__save_episode_name_embedding(season, episode, embedding_data, self.series_name) console.print( f"[green]Generated episode name embedding for {embedding_data[EmbeddingKeys.EPISODE_ID]}: {embedding_data[EmbeddingKeys.TITLE]}[/green]", ) @@ -135,21 +138,26 @@ def 
generate_and_save_for_transcription( def load_episode_name_embedding( season: int, episode: int, + series_name: str, output_dir: Optional[Path] = None, ) -> Optional[Dict[str, Any]]: - from preprocessor.core.episode_manager import EpisodeInfo # pylint: disable=import-outside-toplevel - from preprocessor.core.output_path_builder import OutputPathBuilder # pylint: disable=import-outside-toplevel - if output_dir is None: - output_dir = settings.embedding.default_output_dir + output_dir = settings.embedding.get_output_dir(series_name) + path_manager = PathManager(series_name) episode_info = EpisodeInfo( absolute_episode=0, season=season, relative_episode=episode, title="", + series_name=series_name, + ) + + embedding_file = path_manager.build_path( + episode_info, + settings.output_subdirs.embeddings, + "episode_name_embedding.json", ) - embedding_file = OutputPathBuilder.build_embedding_path(episode_info, "episode_name_embedding.json") if not embedding_file.exists(): return None diff --git a/preprocessor/processors/__init__.py b/preprocessor/processors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/indexing/archive_generator.py b/preprocessor/processors/archive_generator.py similarity index 89% rename from preprocessor/indexing/archive_generator.py rename to preprocessor/processors/archive_generator.py index 711c9c670..09e1b9797 100644 --- a/preprocessor/indexing/archive_generator.py +++ b/preprocessor/processors/archive_generator.py @@ -7,7 +7,10 @@ ) import zipfile -from preprocessor.config.config import settings +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.core.base_processor import ( BaseProcessor, OutputSpec, @@ -18,12 +21,19 @@ FILE_SUFFIXES, ) from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.core.processor_registry import register_processor from preprocessor.utils.console import console ELASTIC_SUBDIRS = 
settings.output_subdirs.elastic_document_subdirs +@register_processor("generate_archives") class ArchiveGenerator(BaseProcessor): + REQUIRES = ["elastic_documents"] + PRODUCES = ["archives"] + PRIORITY = 90 + DESCRIPTION = "Generate archive files" + FOLDER_TO_FILE_SUFFIX = { ELASTIC_SUBDIRS.text_segments: "text_segments", ELASTIC_SUBDIRS.text_embeddings: "text_embeddings", @@ -44,7 +54,7 @@ def __init__(self, args: Dict[str, Any]): ) self.elastic_documents_dir: Path = self._args["elastic_documents_dir"] - self.output_dir: Path = self._args.get("output_dir", Path("/app/output_data/archives")) + self.output_dir: Path = self._args.get("output_dir", get_base_output_dir(self.series_name) / "archives") self.allow_partial: bool = self._args.get("allow_partial", False) episodes_info_json = self._args.get("episodes_info_json") @@ -54,6 +64,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if "elastic_documents_dir" not in args: raise ValueError("elastic_documents_dir is required") + def get_output_subdir(self) -> str: + return settings.output_subdirs.archives + def _get_processing_items(self) -> List[ProcessingItem]: segments_dir = self.elastic_documents_dir / ELASTIC_SUBDIRS.text_segments if not segments_dir.exists(): @@ -90,8 +103,8 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: archive_name = f"{base_name}.zip" archive_path = ( self.output_dir - / f"S{episode_info.season:02d}" - / f"E{episode_info.relative_episode:02d}" + / episode_info.season_code() + / episode_info.episode_num() / archive_name ) @@ -132,8 +145,8 @@ def __collect_episode_files(self, episode_info, base_name: str) -> Dict[str, Pat file_path = ( self.elastic_documents_dir / folder_name - / f"S{episode_info.season:02d}" - / f"E{episode_info.relative_episode:02d}" + / episode_info.season_code() + / episode_info.episode_num() / file_name ) diff --git a/preprocessor/characters/detector.py b/preprocessor/processors/character_detector.py similarity index 82% rename from 
preprocessor/characters/detector.py rename to preprocessor/processors/character_detector.py index b7255b50f..1b03a3cae 100644 --- a/preprocessor/characters/detector.py +++ b/preprocessor/processors/character_detector.py @@ -11,8 +11,8 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.characters.face_detection_utils import load_character_references -from preprocessor.characters.utils import init_face_detection +from preprocessor.characters.face.face_detection_utils import load_character_references +from preprocessor.characters.face.utils import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, @@ -21,7 +21,7 @@ ) from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.utils.console import console from preprocessor.utils.detection_io import ( process_frames_for_detection, @@ -31,8 +31,13 @@ # pylint: disable=duplicate-code - +@register_processor("detect_characters") class CharacterDetector(BaseProcessor): + REQUIRES = ["frames"] + PRODUCES = ["character_detections"] + PRIORITY = 60 + DESCRIPTION = "Detect characters in frames" + def __init__(self, args: Dict[str, Any]): super().__init__( args=args, @@ -42,7 +47,7 @@ def __init__(self, args: Dict[str, Any]): ) self.frames_dir: Path = self._args["frames_dir"] - self.characters_dir: Path = self._args.get("characters_dir", settings.character.output_dir) + self.characters_dir: Path = self._args.get("characters_dir", settings.character.get_output_dir(self.series_name)) self.threshold: float = settings.character.frame_detection_threshold episodes_info_json = self._args.get("episodes_info_json") @@ -55,6 +60,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if "frames_dir" not in 
args: raise ValueError("frames_dir is required") + def get_output_subdir(self) -> str: + return settings.output_subdirs.character_detections + # pylint: disable=duplicate-code def _get_processing_items(self) -> List[ProcessingItem]: return self._get_episode_processing_items_from_metadata( @@ -71,11 +79,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="character_detections", ) - detections_output = OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.character_detections, - detections_filename, - ) + detections_output = self._build_output_path(episode_info, detections_filename) return [OutputSpec(path=detections_output, required=True)] # pylint: enable=duplicate-code @@ -112,4 +116,4 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) self.threshold, fps=fps, ) - save_character_detections(episode_info, results, fps=fps) + save_character_detections(episode_info, results, self.path_manager, fps=fps) diff --git a/preprocessor/indexing/elastic_document_generator.py b/preprocessor/processors/elastic_document_generator.py similarity index 90% rename from preprocessor/indexing/elastic_document_generator.py rename to preprocessor/processors/elastic_document_generator.py index c238cf2e2..8c0e6f8fc 100644 --- a/preprocessor/indexing/elastic_document_generator.py +++ b/preprocessor/processors/elastic_document_generator.py @@ -14,7 +14,10 @@ ObjectDetectionInFrame, SceneTimestampsData, ) -from preprocessor.config.config import settings +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.core.base_processor import ( BaseProcessor, OutputSpec, @@ -25,7 +28,7 @@ FILE_SUFFIXES, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.embeddings.episode_name_embedder 
import EpisodeNameEmbedder from preprocessor.utils.console import console from preprocessor.utils.constants import ( @@ -43,7 +46,13 @@ ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs +@register_processor("generate_elastic_docs") class ElasticDocumentGenerator(BaseProcessor): + REQUIRES = ["transcriptions", "embeddings"] + PRODUCES = ["elastic_documents"] + PRIORITY = 80 + DESCRIPTION = "Generate Elasticsearch documents" + def __init__(self, args: Dict[str, Any]): super().__init__( args=args, @@ -57,7 +66,10 @@ def __init__(self, args: Dict[str, Any]): self.scene_timestamps_dir: Optional[Path] = self._args.get("scene_timestamps_dir") self.character_detections_dir: Optional[Path] = self._args.get("character_detections_dir") self.object_detections_dir: Optional[Path] = self._args.get("object_detections_dir") - self.output_dir: Path = self._args.get("output_dir", Path("/app/output_data/elastic_documents")) + self.output_dir: Path = self._args.get( + "output_dir", + get_base_output_dir(self.series_name) / "elastic_documents" + ) episodes_info_json = self._args.get("episodes_info_json") self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) @@ -66,6 +78,13 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if "transcription_jsons" not in args: raise ValueError("transcription_jsons is required") + def get_output_subdir(self) -> str: + return settings.output_subdirs.elastic_documents + + def __build_elastic_path(self, episode_info, subdoc_type: str, filename: str) -> Path: + full_subdir = f"{self.get_output_subdir()}/{subdoc_type}" + return self._build_output_path(episode_info, filename, subdir=full_subdir) + def _get_processing_items(self) -> List[ProcessingItem]: all_transcription_files = list(self.transcription_jsons.glob("**/raw/*_segmented.json")) items = [] @@ -83,14 +102,17 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py if episode_info: segments_filename = 
f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - segments_file = OutputPathBuilder.build_elastic_document_path( + segments_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.text_segments, segments_filename, + self.series_name, ) outputs.append(OutputSpec(path=segments_file, required=True)) - trans_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + trans_dir = self.path_manager.base_output_dir / settings.output_subdirs.transcriptions / season_code / episode_code sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events sound_events_filename = self.episode_manager.file_naming.build_filename( episode_info, @@ -100,10 +122,11 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py sound_events_json = sound_events_dir / sound_events_filename if sound_events_json.exists(): sound_events_elastic = f"{base_name}_sound_events.jsonl" - sound_events_file = OutputPathBuilder.build_elastic_document_path( + sound_events_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.sound_events, sound_events_elastic, + self.series_name, ) outputs.append(OutputSpec(path=sound_events_file, required=False)) else: @@ -118,7 +141,9 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py ) if self.embeddings_dir and episode_info: - episode_emb_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.embeddings) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + episode_emb_dir = self.path_manager.base_output_dir / settings.output_subdirs.embeddings / season_code / episode_code text_emb_files = list(episode_emb_dir.glob("*_embeddings_text.json")) text_emb_file = text_emb_files[0] if text_emb_files else None video_emb_files = 
list(episode_emb_dir.glob("*_embeddings_video.json")) @@ -126,66 +151,75 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py if text_emb_file and text_emb_file.exists(): text_embeddings_filename = f"{base_name}_text_embeddings.jsonl" - text_embeddings_file = OutputPathBuilder.build_elastic_document_path( + text_embeddings_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.text_embeddings, text_embeddings_filename, + self.series_name, ) outputs.append(OutputSpec(path=text_embeddings_file, required=True)) if video_emb_file and video_emb_file.exists(): video_frames_filename = f"{base_name}_video_frames.jsonl" - video_frames_file = OutputPathBuilder.build_elastic_document_path( + video_frames_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.video_frames, video_frames_filename, + self.series_name, ) outputs.append(OutputSpec(path=video_frames_file, required=True)) episode_name_emb = EpisodeNameEmbedder.load_episode_name_embedding( episode_info.season, episode_info.relative_episode, + self.series_name, output_dir=self.embeddings_dir, ) if episode_name_emb: episode_name_filename = f"{base_name}_episode_name.jsonl" - episode_name_file = OutputPathBuilder.build_elastic_document_path( + episode_name_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.episode_names, episode_name_filename, + self.series_name, ) outputs.append(OutputSpec(path=episode_name_file, required=True)) - trans_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + trans_dir = self.path_manager.base_output_dir / settings.output_subdirs.transcriptions / season_code / episode_code clean_dir = trans_dir / settings.output_subdirs.transcription_subdirs.clean text_stats_filename = f"{base_name}_text_stats.json" text_stats_file = clean_dir / text_stats_filename if text_stats_file.exists(): 
text_stats_elastic_filename = f"{base_name}_text_statistics.jsonl" - text_stats_elastic_file = OutputPathBuilder.build_elastic_document_path( + text_stats_elastic_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.text_statistics, text_stats_elastic_filename, + self.series_name, ) outputs.append(OutputSpec(path=text_stats_elastic_file, required=True)) full_episode_emb_file = episode_emb_dir / f"{base_name}_embeddings_full_episode.json" if full_episode_emb_file.exists(): full_episode_elastic_filename = f"{base_name}_full_episode_embedding.jsonl" - full_episode_elastic_file = OutputPathBuilder.build_elastic_document_path( + full_episode_elastic_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.full_episode_embeddings, full_episode_elastic_filename, + self.series_name, ) outputs.append(OutputSpec(path=full_episode_elastic_file, required=True)) sound_event_emb_file = episode_emb_dir / f"{base_name}_embeddings_sound_events.json" if sound_event_emb_file.exists(): sound_event_elastic_filename = f"{base_name}_sound_event_embeddings.jsonl" - sound_event_elastic_file = OutputPathBuilder.build_elastic_document_path( + sound_event_elastic_file = self.__build_elastic_path( episode_info, ELASTIC_SUBDIRS.sound_event_embeddings, sound_event_elastic_filename, + self.series_name, ) outputs.append(OutputSpec(path=sound_event_elastic_file, required=False)) @@ -238,7 +272,9 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) episode_metadata = self.__build_episode_metadata(episode_info) episode_id = episode_info.episode_code() - video_path = self.episode_manager.build_video_path_for_elastic(episode_info) + from preprocessor.core.constants import FILE_EXTENSIONS + filename = f"{self.series_name.lower()}_{episode_info.episode_code()}{FILE_EXTENSIONS['mp4']}" + video_path = str(Path("bot") / f"{self.series_name.upper()}-WIDEO" / episode_info.season_code() / filename) scene_timestamps = self.__load_scene_timestamps(episode_info) 
character_detections = self.__load_character_detections(episode_info) @@ -255,7 +291,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) base_name, ) - trans_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.transcriptions) + trans_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events sound_events_json = sound_events_dir / f"{base_name}_sound_events.json" if sound_events_json.exists() and any("_sound_events.jsonl" in str(o.path) for o in missing_outputs): @@ -273,7 +309,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) ) if self.embeddings_dir: - episode_emb_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) + episode_emb_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.embeddings) text_emb_files = list(episode_emb_dir.glob("*_embeddings_text.json")) text_emb_file = text_emb_files[0] if text_emb_files else None @@ -306,6 +342,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) episode_name_emb = EpisodeNameEmbedder.load_episode_name_embedding( season, episode_number, + self.series_name, output_dir=self.embeddings_dir, ) if episode_name_emb and any("_episode_name.jsonl" in str(o.path) for o in missing_outputs): @@ -318,7 +355,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) base_name, ) - trans_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.transcriptions) + trans_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) clean_dir = trans_dir / settings.output_subdirs.transcription_subdirs.clean text_stats_file = clean_dir / f"{base_name}_text_stats.json" if text_stats_file.exists() and any("_text_statistics.jsonl" in 
str(o.path) for o in missing_outputs): @@ -332,7 +369,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) ) if self.embeddings_dir: - episode_emb_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) + episode_emb_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.embeddings) full_episode_emb_file = episode_emb_dir / f"{base_name}_embeddings_full_episode.json" if full_episode_emb_file.exists() and any("_full_episode_embedding.jsonl" in str(o.path) for o in missing_outputs): @@ -377,7 +414,7 @@ def __load_character_detections(self, episode_info) -> Dict[int, List[CharacterD if not self.character_detections_dir: return {} - detection_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + detection_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.character_detections) detection_files = list(detection_dir.glob("*_character_detections.json")) detection_file = detection_files[0] if detection_files else None @@ -406,7 +443,7 @@ def __load_object_detections(self, episode_info) -> Dict[str, List[ObjectDetecti if not self.object_detections_dir: return {} - detection_dir = self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) + detection_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.object_detections) detection_files = list(detection_dir.glob("*_object_detections.json")) detection_file = detection_files[0] if detection_files else None @@ -507,7 +544,7 @@ def __generate_segments( # pylint: disable=too-many-locals episode_info = self.episode_manager.get_episode_by_season_and_relative(season, episode) if episode_info: - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_segments}", 
f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}", @@ -568,7 +605,7 @@ def __generate_sound_events( if not segments: return - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.sound_events}", f"{base_name}_sound_events.jsonl", @@ -624,7 +661,7 @@ def __generate_text_embeddings( if not text_embeddings: return - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_embeddings}", f"{base_name}_text_embeddings.jsonl", @@ -673,7 +710,7 @@ def __generate_video_frames( # pylint: disable=too-many-locals if not video_embeddings: return - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.video_frames}", f"{base_name}_video_frames.jsonl", @@ -741,7 +778,7 @@ def __generate_episode_name_document( episode_info, base_name: str, ) -> None: - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.episode_names}", f"{base_name}_episode_name.jsonl", @@ -783,7 +820,7 @@ def __generate_text_statistics_document( if not basic_stats: return - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_statistics}", f"{base_name}_text_statistics.jsonl", @@ -824,7 +861,7 @@ def __generate_full_episode_embedding_document( if not full_episode_embedding_data or "embedding" not in full_episode_embedding_data: return - output_file = self.episode_manager.build_episode_output_path( + output_file = 
self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.full_episode_embeddings}", f"{base_name}_full_episode_embedding.jsonl", @@ -861,7 +898,7 @@ def __generate_sound_event_embeddings_document( # pylint: disable=too-many-loca if not sound_event_embeddings: return - output_file = self.episode_manager.build_episode_output_path( + output_file = self.path_manager.build_path( episode_info, f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.sound_event_embeddings}", f"{base_name}_sound_event_embeddings.jsonl", diff --git a/preprocessor/indexing/elasticsearch.py b/preprocessor/processors/elasticsearch_indexer.py similarity index 95% rename from preprocessor/indexing/elasticsearch.py rename to preprocessor/processors/elasticsearch_indexer.py index a3e8bbb6d..e5be2fbba 100644 --- a/preprocessor/indexing/elasticsearch.py +++ b/preprocessor/processors/elasticsearch_indexer.py @@ -16,16 +16,26 @@ async_bulk, ) -from preprocessor.config.config import settings +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.core.base_processor import BaseProcessor from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.core.processor_registry import register_processor from preprocessor.search.elastic_manager import ElasticSearchManager from preprocessor.utils.console import console ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs +@register_processor("index_elasticsearch") class ElasticSearchIndexer(BaseProcessor): + REQUIRES = ["elastic_documents"] + PRODUCES = ["indexed"] + PRIORITY = 95 + DESCRIPTION = "Index documents in Elasticsearch" + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, @@ -36,7 +46,10 @@ def __init__(self, args: Dict[str, Any]) -> None: self.dry_run = self._args.get("dry_run", False) self.name = self._args["name"] - self.elastic_documents_dir = self._args.get("elastic_documents_dir", 
Path("/app/output_data/elastic_documents")) + self.elastic_documents_dir = self._args.get( + "elastic_documents_dir", + get_base_output_dir(self.series_name) / "elastic_documents" + ) self.transcription_jsons = self._args.get("transcription_jsons") self.append = self._args.get("append", False) @@ -48,6 +61,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if "name" not in args: raise ValueError("index name is required") + def get_output_subdir(self) -> str: + return settings.output_subdirs.elastic_documents + @staticmethod def __sanitize_error_for_logging(error: Dict[str, Any]) -> Dict[str, Any]: vector_keys = {"text_embedding", "video_embedding", "title_embedding", "embedding"} diff --git a/preprocessor/processors/embedding_generator.py b/preprocessor/processors/embedding_generator.py new file mode 100644 index 000000000..28b71fb1f --- /dev/null +++ b/preprocessor/processors/embedding_generator.py @@ -0,0 +1,821 @@ +import gc +import json +import logging +from pathlib import Path +import re +from typing import ( + Any, + Dict, + List, + Optional, +) + +import numpy as np +import torch + +from preprocessor.config.config import settings +from preprocessor.core.base_processor import ( + BaseProcessor, + OutputSpec, + ProcessingItem, +) +from preprocessor.core.constants import FILE_SUFFIXES +from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.core.processor_registry import register_processor +from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder +from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor +from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder +from preprocessor.utils.batch_processing_utils import compute_embeddings_in_batches +from preprocessor.utils.console import console +from preprocessor.utils.constants import EpisodeMetadataKeys +from preprocessor.utils.file_utils import atomic_write_json +from preprocessor.utils.image_hash_utils import 
load_image_hashes_for_episode +from preprocessor.utils.metadata_utils import create_processing_metadata + +# pylint: disable=duplicate-code + + +@register_processor("generate_embeddings") +class EmbeddingGenerator(BaseProcessor): # pylint: disable=too-many-instance-attributes + REQUIRES = ["transcriptions", "frames"] + PRODUCES = ["embeddings"] + PRIORITY = 50 + DESCRIPTION = "Generate multimodal embeddings" + + def __init__(self, args: Dict[str, Any]): + super().__init__( + args=args, + class_name=self.__class__.__name__, + error_exit_code=9, + loglevel=logging.DEBUG, + ) + + self.transcription_jsons: Path = self._args["transcription_jsons"] + self.frames_dir: Path = self._args.get("frames_dir", settings.frame_export.output_dir) + self.output_dir: Path = self._args.get("output_dir", settings.embedding.default_output_dir) + + self.model_name: str = self._args.get("model", settings.embedding_model.model_name) + self.model_revision: str = self._args.get("model_revision", settings.embedding_model.model_revision) + self.batch_size: int = self._args.get("batch_size", settings.embedding.batch_size) + self.device: str = "cuda" + + self.segments_per_embedding: int = self._args.get("segments_per_embedding", settings.text_chunking.segments_per_embedding) + self.text_sentences_per_chunk: int = self._args.get("text_sentences_per_chunk", settings.text_chunking.text_sentences_per_chunk) + self.text_chunk_overlap: int = self._args.get("text_chunk_overlap", settings.text_chunking.text_chunk_overlap) + self.generate_text: bool = self._args.get("generate_text", True) + self.generate_video: bool = self._args.get("generate_video", True) + self.generate_episode_names: bool = self._args.get("generate_episode_names", True) + self.generate_full_episode: bool = self._args.get("generate_full_episode", settings.embedding.generate_full_episode_embedding) + self.generate_sound_events: bool = self._args.get("generate_sound_events", True) + + self.image_hashes_dir: Path = 
Path(self._args.get("image_hashes_dir", settings.image_hash.output_dir)) + + episodes_info_json = self._args.get("episodes_info_json") + self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) + + self.model = None + self.processor = None + self.gpu_processor: Optional[GPUBatchProcessor] = None + self.episode_name_embedder: Optional[EpisodeNameEmbedder] = None + + def _validate_args(self, args: Dict[str, Any]) -> None: + if "transcription_jsons" not in args: + raise ValueError("transcription_jsons is required") + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available. This application requires GPU.") + + def get_output_subdir(self) -> str: + return settings.output_subdirs.embeddings + + def cleanup(self) -> None: + console.print("[cyan]Unloading embedding model...[/cyan]") + self.model = None + self.processor = None + self._cleanup_memory() + console.print("[green]✓ Model unloaded[/green]") + + def _get_processing_items(self) -> List[ProcessingItem]: + all_transcription_files = list(self.transcription_jsons.glob("**/*.json")) + items = [] + seen_episodes = set() + + for trans_file in all_transcription_files: + if "_simple.json" in trans_file.name or "_text_stats.json" in trans_file.name: + continue + + if trans_file.parent.name in {"clean", "sound_events"}: + continue + + if not trans_file.name.endswith("_segmented.json"): + segmented_version = trans_file.parent / f"{trans_file.stem}_segmented.json" + if segmented_version.exists(): + continue + + episode_info = self.episode_manager.parse_filename(trans_file) + if episode_info: + episode_key = (episode_info.season, episode_info.relative_episode) + if episode_key in seen_episodes: + continue + seen_episodes.add(episode_key) + + items.append(self._create_transcription_processing_item(trans_file)) + + return items + + def _should_skip_item(self, item: ProcessingItem): + trans_file = item.input_path + parent_name = trans_file.parent.name + if parent_name in {"raw", "clean", 
"sound_events"}: + episode_dir = trans_file.parent.parent + else: + episode_dir = trans_file.parent + + clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean + base_name = self.__remove_all_suffixes(trans_file.stem) + clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" + + if clean_transcription_file.exists(): + try: + with open(clean_transcription_file, "r", encoding="utf-8") as f: + data = json.load(f) + segments = data.get("segments", []) + if not segments: + episode_id = item.episode_id + self.logger.warning( + f"Empty clean transcription (no text segments) for {episode_id}, " + f"will skip text embeddings but generate other types (sound events, episode names, etc.)", + ) + except Exception as e: + self.logger.error(f"Failed to read {clean_transcription_file}: {e}") + + return super()._should_skip_item(item) + + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + outputs = [] + episode_info = self.episode_manager.parse_filename(item.input_path) + if not episode_info: + return outputs + + if self.generate_text: + text_filename = self.episode_manager.file_naming.build_filename( + episode_info, + extension="json", + suffix="_embeddings_text", + ) + text_output = self._build_output_path(episode_info, text_filename) + outputs.append(OutputSpec(path=text_output, required=True)) + + if self.generate_episode_names: + episode_name_filename = f"{FILE_SUFFIXES['episode_name']}.json" + episode_name_output = self._build_output_path(episode_info, episode_name_filename) + outputs.append(OutputSpec(path=episode_name_output, required=True)) + + if self.generate_video: + video_filename = self.episode_manager.file_naming.build_filename( + episode_info, + extension="json", + suffix="_embeddings_video", + ) + video_output = self._build_output_path(episode_info, video_filename) + outputs.append(OutputSpec(path=video_output, required=True)) + + if self.generate_full_episode: + full_episode_filename = 
self.episode_manager.file_naming.build_filename( + episode_info, + extension="json", + suffix="_embeddings_full_episode", + ) + full_episode_output = self._build_output_path(episode_info, full_episode_filename) + outputs.append(OutputSpec(path=full_episode_output, required=True)) + + if self.generate_sound_events: + sound_events_filename = self.episode_manager.file_naming.build_filename( + episode_info, + extension="json", + suffix="_embeddings_sound_events", + ) + sound_events_output = self._build_output_path(episode_info, sound_events_filename) + outputs.append(OutputSpec(path=sound_events_output, required=True)) + + return outputs + + def _get_temp_files(self, item: ProcessingItem) -> List[str]: + temp_files = [] + expected_outputs = self._get_expected_outputs(item) + for output in expected_outputs: + temp_path = output.path.with_suffix('.json.tmp') + temp_files.append(str(temp_path)) + return temp_files + + def _get_processing_info(self) -> List[str]: + return [ + f"[cyan]Loading model: {self.model_name}[/cyan]", + f"[cyan]Device: {self.device}[/cyan]", + f"[cyan]Batch size: {self.batch_size}[/cyan]", + ] + + def _load_resources(self) -> bool: + self.__load_model() + self.gpu_processor = GPUBatchProcessor( + self.model, + self.batch_size, + self.logger, + self.device, + progress_sub_batch_size=settings.embedding.progress_sub_batch_size, + ) + self.episode_name_embedder = EpisodeNameEmbedder( + model=self.model, + episode_manager=self.episode_manager, + series_name=self.series_name, + logger=self.logger, + ) + return True + + def __load_model(self) -> None: + try: + self.model = Qwen3VLEmbedder( + model_name_or_path=self.model_name, + torch_dtype=torch.bfloat16, + ) + console.print("[green]Qwen3-VL-Embedding model loaded successfully (vLLM)[/green]") + except Exception as e: + self.logger.error(f"Failed to load model: {e}") + raise + + def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: 
disable=too-many-locals,too-many-statements + trans_file = item.input_path + + parent_name = trans_file.parent.name + if parent_name in {"raw", "clean", "sound_events"}: + episode_dir = trans_file.parent.parent + else: + episode_dir = trans_file.parent + + clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean + + base_name = self.__remove_all_suffixes(trans_file.stem) + clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" + + if not clean_transcription_file.exists(): + self.logger.warning(f"Clean transcription not found: {clean_transcription_file}, skipping text embeddings generation") + with open(trans_file, "r", encoding="utf-8") as f: + data = json.load(f) + data["segments"] = [] + else: + with open(clean_transcription_file, "r", encoding="utf-8") as f: + data = json.load(f) + + has_segments = bool(data.get("segments")) + segmented_file = trans_file.parent / f"{trans_file.stem}_segmented.json" + + if not has_segments and segmented_file.exists(): + return + + need_text = any("embeddings_text.json" in str(o.path) for o in missing_outputs) + need_video = any("embeddings_video.json" in str(o.path) for o in missing_outputs) + need_episode_name = any("episode_name_embedding.json" in str(o.path) for o in missing_outputs) + need_full_episode = any("embeddings_full_episode.json" in str(o.path) for o in missing_outputs) + need_sound_events = any("embeddings_sound_events.json" in str(o.path) for o in missing_outputs) + + text_embeddings = [] + if need_text: + text_embeddings = self.__generate_text_embeddings(data) + + sound_event_embeddings = [] + if need_sound_events: + sound_event_embeddings = self.__generate_sound_event_embeddings(trans_file) + + video_embeddings = [] + if need_video: + episode_info = data.get("episode_info", {}) + frame_metadata = self.__load_frame_metadata(episode_info) + if frame_metadata: + video_embeddings = self.__generate_video_embeddings(episode_info, frame_metadata) + + if need_episode_name and 
self.episode_name_embedder: + self.episode_name_embedder.generate_and_save_for_transcription(data) + + full_episode_embedding = None + if need_full_episode: + full_episode_embedding = self.__generate_full_episode_embedding(trans_file) + + episode_dir = self.__get_episode_output_dir(trans_file) + episode_info_dict = data.get("episode_info", {}) + season = episode_info_dict.get("season", 0) + episode_num = episode_info_dict.get("episode_number", 0) + + episode_info_temp = self.episode_manager.get_episode_by_season_and_relative(season, episode_num) + if episode_info_temp: + episode_code = episode_info_temp.episode_code() + else: + episode_code = f"S{season:02d}E{episode_num:02d}" + + text_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_text.json" + video_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_video.json" + full_episode_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_full_episode.json" + sound_events_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_sound_events.json" + self.__save_embeddings( + data, + text_embeddings, + video_embeddings, + full_episode_embedding, + sound_event_embeddings, + text_output, + video_output, + full_episode_output, + sound_events_output, + ) + self._cleanup_memory() + + def __generate_text_embeddings(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals + segments = data.get("segments", []) + if not segments: + return [] + + text_chunks = [] + chunk_metadata = [] + + if True: # Always use sentence-based chunking for text # pylint: disable=using-constant-test + full_text = " ".join([seg.get("text", "") for seg in segments]) + sentences = self.__split_into_sentences(full_text) + + sentences_per_chunk = self.text_sentences_per_chunk + overlap = self.text_chunk_overlap + step = sentences_per_chunk - overlap + + for i in range(0, len(sentences), step): + 
chunk_sentences = sentences[i:i + sentences_per_chunk] + if not chunk_sentences: + continue + + chunk_text = " ".join(chunk_sentences).strip() + if not chunk_text: + continue + + char_start = sum(len(s) + 1 for s in sentences[:i]) + char_end = char_start + len(chunk_text) + + start_seg_id = self.__find_segment_at_position(segments, char_start) + end_seg_id = self.__find_segment_at_position(segments, char_end) + + text_chunks.append(chunk_text) + chunk_metadata.append({ + "segment_range": [start_seg_id, end_seg_id], + "text": chunk_text, + }) + else: + for i in range(0, len(segments), self.segments_per_embedding): + chunk = segments[i: i + self.segments_per_embedding] + combined_text = " ".join([seg.get("text", "") for seg in chunk]) + + if combined_text.strip(): + text_chunks.append(combined_text) + chunk_metadata.append({ + "segment_range": [i, i + len(chunk) - 1], + "text": combined_text, + }) + + if not text_chunks: + return [] + + embeddings = [] + text_batch_size = settings.embedding.text_batch_size + + with self.progress.track_operation( + f"Text embeddings ({len(text_chunks)} chunks)", + (len(text_chunks) + text_batch_size - 1) // text_batch_size, + ) as tracker: + for batch_idx in range(0, len(text_chunks), text_batch_size): + batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] + batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] + + try: + batch_embeddings = self.__encode_text_batch(batch_texts) + for meta, embedding in zip(batch_meta, batch_embeddings): + embeddings.append({ + **meta, + "embedding": embedding.tolist(), + }) + except (RuntimeError, ValueError, OSError) as e: + self.logger.error(f"Failed text embedding batch {batch_idx}: {e}") + + tracker.update((batch_idx // text_batch_size) + 1, interval=5) + + return embeddings + + def __generate_sound_event_embeddings(self, trans_file: Path) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals + parent_name = trans_file.parent.name + if parent_name in {"raw", 
"clean", "sound_events"}: + episode_dir = trans_file.parent.parent + else: + episode_dir = trans_file.parent + + sound_events_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events + + base_name = self.__remove_all_suffixes(trans_file.stem) + sound_events_file = sound_events_dir / f"{base_name}_sound_events.json" + + if not sound_events_file.exists(): + self.logger.warning(f"Sound events file not found: {sound_events_file}, skipping sound event embeddings generation") + return [] + + try: + with open(sound_events_file, "r", encoding="utf-8") as f: + sound_events_data = json.load(f) + except Exception as e: + self.logger.error(f"Failed to load sound events file {sound_events_file}: {e}") + return [] + + segments = sound_events_data.get("segments", []) + if not segments: + return [] + + text_chunks = [] + chunk_metadata = [] + + for i in range(0, len(segments), self.segments_per_embedding): + chunk = segments[i: i + self.segments_per_embedding] + combined_text = " ".join([seg.get("text", "") for seg in chunk]) + + if combined_text.strip(): + sound_types = set() + for seg in chunk: + sound_type = seg.get("sound_type", "sound") + sound_types.add(sound_type) + + start_time = chunk[0].get("start", 0.0) if chunk else 0.0 + end_time = chunk[-1].get("end", 0.0) if chunk else 0.0 + + text_chunks.append(combined_text) + chunk_metadata.append({ + "segment_range": [i, i + len(chunk) - 1], + "text": combined_text, + "sound_types": list(sound_types), + "start_time": start_time, + "end_time": end_time, + }) + + if not text_chunks: + return [] + + embeddings = [] + text_batch_size = settings.embedding.text_batch_size + + with self.progress.track_operation( + f"Sound event embeddings ({len(text_chunks)} chunks)", + (len(text_chunks) + text_batch_size - 1) // text_batch_size, + ) as tracker: + for batch_idx in range(0, len(text_chunks), text_batch_size): + batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] + batch_meta = 
chunk_metadata[batch_idx: batch_idx + text_batch_size] + + try: + batch_embeddings = self.__encode_text_batch(batch_texts) + for meta, embedding in zip(batch_meta, batch_embeddings): + embeddings.append({ + **meta, + "embedding": embedding.tolist(), + }) + except (RuntimeError, ValueError, OSError) as e: + self.logger.error(f"Failed sound event embedding batch {batch_idx}: {e}") + + tracker.update((batch_idx // text_batch_size) + 1, interval=5) + + return embeddings + + @staticmethod + def __remove_all_suffixes(base_name: str) -> str: + suffixes = (FILE_SUFFIXES["segmented"], FILE_SUFFIXES["sound_events"], FILE_SUFFIXES["clean"], FILE_SUFFIXES["clean_alt"]) + while True: + removed = False + for suffix in suffixes: + if base_name.endswith(suffix): + base_name = base_name[:-len(suffix)] + removed = True + break + if not removed: + break + return base_name + + @staticmethod + def __split_into_sentences(text: str) -> List[str]: + normalized_text = re.sub(r'\.{2,}', '.', text) + normalized_text = re.sub(r'!{2,}', '!', normalized_text) + normalized_text = re.sub(r'\?{2,}', '?', normalized_text) + + sentences = re.split(r'([.!?]+(?:\s+|$))', normalized_text) + raw_sentences = [] + for i in range(0, len(sentences) - 1, 2): + sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "") + sentence = sentence.strip() + if sentence: + raw_sentences.append(sentence) + if len(sentences) % 2 == 1 and sentences[-1].strip(): + raw_sentences.append(sentences[-1].strip()) + + result = [] + buffer = "" + min_sentence_length = 30 + + for sentence in raw_sentences: + buffer = (buffer + " " + sentence).strip() if buffer else sentence + + if len(buffer) >= min_sentence_length: + result.append(buffer) + buffer = "" + + if buffer: + if result: + result[-1] = result[-1] + " " + buffer + else: + result.append(buffer) + + return result + + @staticmethod + def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: + cumulative_length = 0 + for idx, 
seg in enumerate(segments): + seg_text = seg.get("text", "") + seg_length = len(seg_text) + 1 + if cumulative_length <= char_pos < cumulative_length + seg_length: + return idx + cumulative_length += seg_length + return len(segments) - 1 if segments else 0 + + def __encode_text_batch(self, texts: List[str]) -> List[np.ndarray]: + inputs = [{"text": text} for text in texts] + embeddings_tensor = self.model.process(inputs, normalize=True) + embeddings = [emb.cpu().numpy() for emb in embeddings_tensor] + del embeddings_tensor + return embeddings + + def __generate_full_episode_embedding(self, trans_file: Path) -> Optional[Dict[str, Any]]: # pylint: disable=too-many-locals,too-many-statements + parent_name = trans_file.parent.name + if parent_name in {"raw", "clean", "sound_events"}: + episode_dir = trans_file.parent.parent + else: + episode_dir = trans_file.parent + + clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean + + base_name = self.__remove_all_suffixes(trans_file.stem) + clean_txt_file = clean_dir / f"{base_name}_clean_transcription.txt" + + if not clean_txt_file.exists(): + self.logger.warning(f"Clean transcript file not found: {clean_txt_file}") + return None + + try: # pylint: disable=too-many-try-statements + with open(clean_txt_file, "r", encoding="utf-8") as f: + full_text = f.read().strip() + + if not full_text: + self.logger.warning(f"Empty clean transcript file: {clean_txt_file}") + return None + + console.print(f"[cyan]Generating full episode embedding ({len(full_text)} chars)...[/cyan]") + + max_chars_per_chunk = 6000 + overlap_chars = 4500 + + if len(full_text) > max_chars_per_chunk: + console.print( + f"[yellow]Text too long ({len(full_text)} chars), " + f"using sliding window (chunk={max_chars_per_chunk}, overlap={overlap_chars})...[/yellow]", + ) + + chunks = [] + step_size = max_chars_per_chunk - overlap_chars + + for i in range(0, len(full_text), step_size): + chunk_end = min(i + max_chars_per_chunk, len(full_text)) + 
chunk = full_text[i:chunk_end] + + if len(chunk.strip()) < 100: + continue + + chunks.append(chunk) + + if chunk_end >= len(full_text): + break + + console.print(f"[cyan]Processing {len(chunks)} overlapping chunks...[/cyan]") + chunk_embeddings = [] + chunk_weights = [] + + for idx, chunk in enumerate(chunks): + inputs = [{"text": chunk}] + embeddings_tensor = self.model.process(inputs, normalize=True) + chunk_embedding = embeddings_tensor[0].cpu().numpy() + chunk_embeddings.append(chunk_embedding) + del embeddings_tensor + + weight = len(chunk) / max_chars_per_chunk + chunk_weights.append(weight) + + if (idx + 1) % 5 == 0 or idx == len(chunks) - 1: + console.print(f"[cyan]Processed chunk {idx + 1}/{len(chunks)}[/cyan]") + + chunk_weights_array = np.array(chunk_weights) + chunk_weights_normalized = chunk_weights_array / chunk_weights_array.sum() + + embedding = np.average(chunk_embeddings, axis=0, weights=chunk_weights_normalized) + embedding = embedding / np.linalg.norm(embedding) + + console.print(f"[green]✓ Weighted-averaged {len(chunks)} overlapping chunks[/green]") + else: + inputs = [{"text": full_text}] + embeddings_tensor = self.model.process(inputs, normalize=True) + embedding = embeddings_tensor[0].cpu().numpy() + del embeddings_tensor + + return { + "text": full_text, + "embedding": embedding.tolist(), + "transcript_length": len(full_text), + } + + except Exception as e: + self.logger.error(f"Failed to generate full episode embedding: {e}") + return None + + def __load_frame_metadata(self, episode_info_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]: + season = episode_info_dict.get("season") + episode = episode_info_dict.get("episode_number") + if season is None or episode is None: + return None + + episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) + if not episode_info_obj: + return None + + frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) + 
metadata_file = frames_episode_dir / f"{self.episode_manager.series_name}_{episode_info_obj.episode_code()}_frame_metadata.json" + + if not metadata_file.exists(): + self.logger.warning(f"Frame metadata not found: {metadata_file}") + return None + + with open(metadata_file, "r", encoding="utf-8") as f: + return json.load(f) + + def __load_image_hashes(self, episode_info_dict: Dict[str, Any]) -> Dict[int, str]: + return load_image_hashes_for_episode(episode_info_dict, self.series_name, self.logger) + + def __generate_video_embeddings(self, episode_info_dict: Dict[str, Any], frame_metadata: Dict[str, Any]) -> List[Dict[str, Any]]: + frame_requests = frame_metadata.get("frames", []) + if not frame_requests: + return [] + + season = episode_info_dict.get("season") + episode = episode_info_dict.get("episode_number") + + episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) + if not episode_info_obj: + return [] + + frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) + episode_output_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.embeddings) + checkpoint_file = episode_output_dir / "embeddings_video_checkpoint.json" + + image_hashes = self.__load_image_hashes(episode_info_dict) + embeddings = compute_embeddings_in_batches( + frames_episode_dir, + frame_requests, + self.gpu_processor, + self.batch_size, + image_hashes, + checkpoint_file=checkpoint_file, + checkpoint_interval=20, + prefetch_count=settings.embedding.prefetch_chunks, + ) + self._cleanup_memory() + return embeddings + + def __get_episode_output_dir(self, transcription_file: Path) -> Path: + episode_info_from_file = self.episode_manager.parse_filename(transcription_file) + if episode_info_from_file: + return self.path_manager.get_episode_dir(episode_info_from_file, settings.output_subdirs.embeddings) + return self.path_manager.base_output_dir / settings.output_subdirs.embeddings + + def 
__save_embeddings( + self, + data, + text_embeddings, + video_embeddings, + full_episode_embedding, + sound_event_embeddings, + text_output, + video_output, + full_episode_output, + sound_events_output, + ): + episode_info = data.get(EpisodeMetadataKeys.EPISODE_INFO, {}) + text_output.parent.mkdir(parents=True, exist_ok=True) + + if text_embeddings: + text_data = create_processing_metadata( + episode_info=type( + 'obj', (object,), { + 'season': episode_info.get(EpisodeMetadataKeys.SEASON), + 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), + }, + )(), + processing_params={ + "model_name": self.model_name, + "model_revision": self.model_revision, + "segments_per_embedding": self.segments_per_embedding, + "use_sentence_based_chunking": True, + "text_sentences_per_chunk": self.text_sentences_per_chunk, + "text_chunk_overlap": self.text_chunk_overlap, + "device": self.device, + }, + statistics={ + "total_embeddings": len(text_embeddings), + "embedding_dimension": len(text_embeddings[0]["embedding"]) if text_embeddings else 0, + }, + results_key="text_embeddings", + results_data=text_embeddings, + ) + atomic_write_json(text_output, text_data, indent=2, ensure_ascii=False) + + if video_embeddings: + video_data = create_processing_metadata( + episode_info=type( + 'obj', (object,), { + 'season': episode_info.get(EpisodeMetadataKeys.SEASON), + 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), + }, + )(), + processing_params={ + "model_name": self.model_name, + "model_revision": self.model_revision, + "batch_size": self.batch_size, + "device": self.device, + }, + statistics={ + "total_embeddings": len(video_embeddings), + "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, + "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), + }, + results_key="video_embeddings", + results_data=video_embeddings, + ) + atomic_write_json(video_output, video_data, indent=2, 
ensure_ascii=False) + + if full_episode_embedding: + full_episode_data = create_processing_metadata( + episode_info=type( + 'obj', (object,), { + 'season': episode_info.get(EpisodeMetadataKeys.SEASON), + 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), + }, + )(), + processing_params={ + "model_name": self.model_name, + "model_revision": self.model_revision, + "device": self.device, + }, + statistics={ + "transcript_length": full_episode_embedding.get("transcript_length", 0), + "embedding_dimension": len(full_episode_embedding["embedding"]) if "embedding" in full_episode_embedding else 0, + }, + results_key="full_episode_embedding", + results_data=full_episode_embedding, + ) + atomic_write_json(full_episode_output, full_episode_data, indent=2, ensure_ascii=False) + console.print(f"[green]✓ Saved full episode embedding to: {full_episode_output}[/green]") + + if sound_event_embeddings: + sound_events_data = create_processing_metadata( + episode_info=type( + 'obj', (object,), { + 'season': episode_info.get(EpisodeMetadataKeys.SEASON), + 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), + }, + )(), + processing_params={ + "model_name": self.model_name, + "model_revision": self.model_revision, + "segments_per_embedding": self.segments_per_embedding, + "use_sentence_based_chunking": True, + "text_sentences_per_chunk": self.text_sentences_per_chunk, + "text_chunk_overlap": self.text_chunk_overlap, + "device": self.device, + }, + statistics={ + "total_embeddings": len(sound_event_embeddings), + "embedding_dimension": len(sound_event_embeddings[0]["embedding"]) if sound_event_embeddings else 0, + }, + results_key="sound_event_embeddings", + results_data=sound_event_embeddings, + ) + atomic_write_json(sound_events_output, sound_events_data, indent=2, ensure_ascii=False) + console.print(f"[green]✓ Saved sound event embeddings to: {sound_events_output}[/green]") + + @staticmethod + def _cleanup_memory() -> None: + gc.collect() + 
if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/preprocessor/video/frame_exporter.py b/preprocessor/processors/frame_exporter.py similarity index 94% rename from preprocessor/video/frame_exporter.py rename to preprocessor/processors/frame_exporter.py index f505e66d8..dc236714a 100644 --- a/preprocessor/video/frame_exporter.py +++ b/preprocessor/processors/frame_exporter.py @@ -21,14 +21,20 @@ ) from preprocessor.core.enums import KeyframeStrategy from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.embeddings.strategies.strategy_factory import KeyframeStrategyFactory from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.video.base_video_processor import BaseVideoProcessor +from preprocessor.video.helpers.base_video_processor import BaseVideoProcessor +@register_processor("export_frames") class FrameExporter(BaseVideoProcessor): + REQUIRES = ["videos", "scene_timestamps"] + PRODUCES = ["frames"] + PRIORITY = 30 + DESCRIPTION = "Export keyframes from videos" + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, @@ -65,16 +71,18 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if scene_path and not scene_path.exists(): console.print(f"[yellow]Warning: Scene timestamps directory does not exist: {scene_path}[/yellow]") + def get_output_subdir(self) -> str: + return settings.output_subdirs.frames + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.frames) metadata_filename = self.episode_manager.file_naming.build_filename( episode_info, extension="json", suffix="_frame_metadata", ) - metadata_file = episode_dir / 
metadata_filename + metadata_file = self._build_output_path(episode_info, metadata_filename) return [OutputSpec(path=metadata_file, required=True)] def _get_temp_files(self, item: ProcessingItem) -> List[str]: @@ -121,7 +129,9 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) raise def __get_episode_dir(self, episode_info) -> Path: - return self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.frames) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + return self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code def __prepare_data(self, episode_info) -> Dict[str, Any]: data = {} diff --git a/preprocessor/hashing/image_hash_processor.py b/preprocessor/processors/image_hash_processor.py similarity index 90% rename from preprocessor/hashing/image_hash_processor.py rename to preprocessor/processors/image_hash_processor.py index 23f2949cd..a55fc0640 100644 --- a/preprocessor/hashing/image_hash_processor.py +++ b/preprocessor/processors/image_hash_processor.py @@ -18,7 +18,7 @@ ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.hashing.image_hasher import PerceptualHasher from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches from preprocessor.utils.console import console @@ -27,8 +27,13 @@ # pylint: disable=duplicate-code - +@register_processor("hash_images") class ImageHashProcessor(BaseProcessor): + REQUIRES = ["frames"] + PRODUCES = ["image_hashes"] + PRIORITY = 55 + DESCRIPTION = "Generate perceptual hashes for frames" + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, @@ -51,6 +56,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not torch.cuda.is_available(): raise RuntimeError("CUDA is 
not available. This application requires GPU.") + def get_output_subdir(self) -> str: + return settings.output_subdirs.image_hashes + def cleanup(self) -> None: console.print("[cyan]Unloading image hasher...[/cyan]") self.hasher = None @@ -72,11 +80,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="image_hashes", ) - hash_output = OutputPathBuilder.build_output_path( - episode_info, - settings.output_subdirs.image_hashes, - hash_filename, - ) + hash_output = self._build_output_path(episode_info, hash_filename) return [OutputSpec(path=hash_output, required=True)] # pylint: enable=duplicate-code @@ -110,7 +114,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) self.__cleanup_memory() def __get_episode_output_dir(self, episode_info) -> Path: - return self.episode_manager.get_episode_subdir(episode_info, settings.output_subdirs.image_hashes) + return self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.image_hashes) def __save_hashes(self, episode_dir: Path, episode_info, hash_results: List[Dict[str, Any]]) -> None: episode_dir.mkdir(parents=True, exist_ok=True) diff --git a/preprocessor/video/scene_detector.py b/preprocessor/processors/scene_detector.py similarity index 93% rename from preprocessor/video/scene_detector.py rename to preprocessor/processors/scene_detector.py index 0f06c7e4e..82ee7ab9a 100644 --- a/preprocessor/video/scene_detector.py +++ b/preprocessor/processors/scene_detector.py @@ -21,12 +21,18 @@ ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json +@register_processor("detect_scenes") class SceneDetector(BaseProcessor): + REQUIRES = ["videos"] + PRODUCES = 
["scene_timestamps"] + PRIORITY = 25 + DESCRIPTION = "Detect scene changes using TransNetV2" + def __init__(self, args: Dict[str, Any]): super().__init__( args=args, @@ -51,6 +57,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not torch.cuda.is_available(): raise RuntimeError("CUDA is not available. TransNetV2 requires GPU.") + def get_output_subdir(self) -> str: + return settings.output_subdirs.scenes + def cleanup(self) -> None: console.print("[cyan]Unloading TransNetV2 model and clearing GPU memory...[/cyan]") if hasattr(self, 'model') and self.model is not None: @@ -78,10 +87,10 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: extension="json", suffix="scenes", ) - output_path = OutputPathBuilder.build_scene_path(episode_info, output_filename) + output_path = self._build_output_path(episode_info, output_filename) else: output_filename = f"{item.input_path.stem}_scenes.json" - output_path = OutputPathBuilder.get_episode_dir(None, settings.output_subdirs.scenes) / output_filename + output_path = self.path_manager.base_output_dir / self.get_output_subdir() / output_filename return [OutputSpec(path=output_path, required=True)] diff --git a/preprocessor/text_analysis/text_analyzer.py b/preprocessor/processors/text_analyzer.py similarity index 90% rename from preprocessor/text_analysis/text_analyzer.py rename to preprocessor/processors/text_analyzer.py index 0f6711ccf..7333a8c8d 100644 --- a/preprocessor/text_analysis/text_analyzer.py +++ b/preprocessor/processors/text_analyzer.py @@ -7,7 +7,7 @@ ) from preprocessor.config.config import ( - BASE_OUTPUT_DIR, + get_base_output_dir, settings, ) from preprocessor.core.base_processor import ( @@ -16,11 +16,18 @@ ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.core.processor_registry import register_processor from preprocessor.text_analysis.text_statistics import TextStatistics from preprocessor.utils.file_utils import 
atomic_write_json +@register_processor("analyze_text") class TextAnalyzer(BaseProcessor): + REQUIRES = ["transcriptions"] + PRODUCES = ["text_analysis"] + PRIORITY = 70 + DESCRIPTION = "Analyze transcription text statistics" + def __init__(self, args: Dict[str, Any]): super().__init__( args=args, @@ -28,7 +35,7 @@ def __init__(self, args: Dict[str, Any]): error_exit_code=40, loglevel=logging.INFO, ) - self.transcriptions_base = BASE_OUTPUT_DIR / settings.output_subdirs.transcriptions + self.transcriptions_base = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions self.language = args.get("language", "pl") self.episode_manager = EpisodeManager( args.get("episodes_info_json"), @@ -39,6 +46,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if "series_name" not in args: raise ValueError("series_name is required") + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def _get_processing_items(self) -> List[ProcessingItem]: items = [] diff --git a/preprocessor/transcription/generator.py b/preprocessor/processors/transcription_generator.py similarity index 82% rename from preprocessor/transcription/generator.py rename to preprocessor/processors/transcription_generator.py index d6e6767bc..2f7bed833 100644 --- a/preprocessor/transcription/generator.py +++ b/preprocessor/processors/transcription_generator.py @@ -7,20 +7,27 @@ List, ) +from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, OutputSpec, ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.transcription.generators.multi_format_generator import MultiFormatGenerator from preprocessor.transcription.processors.audio_normalizer import AudioNormalizer from 
preprocessor.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer +@register_processor("transcribe") class TranscriptionGenerator(BaseProcessor): + REQUIRES = ["videos"] + PRODUCES = ["transcriptions"] + PRIORITY = 20 + DESCRIPTION = "Generate transcriptions using Whisper" + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, @@ -51,6 +58,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not videos_path.is_dir(): raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def _get_processing_items(self) -> List[ProcessingItem]: if self.__check_all_transcriptions_exist(): return [] @@ -75,22 +85,18 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: continue filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - expected_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename + expected_file.parent.mkdir(parents=True, exist_ok=True) segmented_filename = self.episode_manager.file_naming.build_filename( episode_info, extension="json", suffix="_segmented", ) - segmented_file = OutputPathBuilder.build_transcription_path( - episode_info, - segmented_filename, - subdir="raw", - ) + segmented_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / segmented_filename + segmented_file.parent.mkdir(parents=True, exist_ok=True) if not expected_file.exists() and not segmented_file.exists(): outputs.append(OutputSpec(path=expected_file, 
required=True)) @@ -148,22 +154,18 @@ def __check_all_transcriptions_exist(self) -> bool: continue filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - expected_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename + expected_file.parent.mkdir(parents=True, exist_ok=True) segmented_filename = self.episode_manager.file_naming.build_filename( episode_info, extension="json", suffix="_segmented", ) - segmented_file = OutputPathBuilder.build_transcription_path( - episode_info, - segmented_filename, - subdir="raw", - ) + segmented_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / segmented_filename + segmented_file.parent.mkdir(parents=True, exist_ok=True) if not expected_file.exists() and not segmented_file.exists(): missing_files.append(f"{video_file.name} -> {expected_file}") @@ -188,11 +190,10 @@ def __get_missing_video_files(self, missing_outputs: List[OutputSpec]) -> List[P continue filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - expected_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename + expected_file.parent.mkdir(parents=True, exist_ok=True) if any(expected_file == output.path for output in missing_outputs): missing_video_files.append(video_file) diff --git a/preprocessor/transcription/importer.py b/preprocessor/processors/transcription_importer.py similarity index 93% rename from 
preprocessor/transcription/importer.py rename to preprocessor/processors/transcription_importer.py index 7c8731ab4..8646d8dcb 100644 --- a/preprocessor/transcription/importer.py +++ b/preprocessor/processors/transcription_importer.py @@ -12,13 +12,20 @@ from preprocessor.core.base_processor import BaseProcessor from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.core.processor_registry import register_processor from preprocessor.utils.console import ( console, create_progress, ) +@register_processor("import_transcriptions") class TranscriptionImporter(BaseProcessor): + REQUIRES = [] + PRODUCES = ["transcriptions"] + PRIORITY = 15 + DESCRIPTION = "Import external transcriptions" + def _validate_args(self, args: Dict[str, Any]) -> None: if "source_dir" not in args: raise ValueError("source_dir is required") @@ -31,6 +38,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not source_dir.exists(): raise FileNotFoundError(f"Source directory not found: {source_dir}") + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, @@ -129,7 +139,9 @@ def __import_single_file(self, json_file: Path) -> None: if episode_info: converted_data["episode_info"] = EpisodeManager.get_metadata(episode_info) - output_file = self.episode_manager.build_output_path(episode_info, self.output_dir) + filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + season_dir = self.output_dir / episode_info.season_code() + output_file = season_dir / filename output_file.parent.mkdir(parents=True, exist_ok=True) with open(output_file, "w", encoding="utf-8") as f: diff --git a/preprocessor/video/transcoder.py b/preprocessor/processors/video_transcoder.py similarity index 94% rename from preprocessor/video/transcoder.py rename to preprocessor/processors/video_transcoder.py index ad44fc9d3..2d7f7e7d9 100644 --- 
a/preprocessor/video/transcoder.py +++ b/preprocessor/processors/video_transcoder.py @@ -9,21 +9,28 @@ Optional, ) +from preprocessor.config.config import settings from preprocessor.core.base_processor import ( OutputSpec, ProcessingItem, ) from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.processor_registry import register_processor from preprocessor.utils.constants import ( FfprobeKeys, FfprobeStreamKeys, ) from preprocessor.utils.resolution import Resolution -from preprocessor.video.base_video_processor import BaseVideoProcessor +from preprocessor.video.helpers.base_video_processor import BaseVideoProcessor +@register_processor("transcode") class VideoTranscoder(BaseVideoProcessor): + REQUIRES = ["videos"] + PRODUCES = ["transcoded_videos"] + PRIORITY = 10 + DESCRIPTION = "Transcode videos to H.264 with consistent format" + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, @@ -66,9 +73,13 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not videos_path.is_dir(): raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") + def get_output_subdir(self) -> str: + return settings.output_subdirs.video + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - output_path = OutputPathBuilder.build_video_path(episode_info, self.series_name, extension=DEFAULT_VIDEO_EXTENSION) + filename = f"{self.series_name}_{episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}" + output_path = self._build_season_path(episode_info, filename) return [OutputSpec(path=output_path, required=True)] def _get_temp_files(self, item: ProcessingItem) -> List[str]: diff --git a/preprocessor/scraping/base_scraper.py b/preprocessor/scraping/base_scraper.py index 3a1951a94..5a1ea8b2b 100644 --- a/preprocessor/scraping/base_scraper.py +++ 
b/preprocessor/scraping/base_scraper.py @@ -103,7 +103,7 @@ def __scrape_url(self, url: str, progress: "Progress") -> Optional[str]: if self.scraper_method == ScraperMethod.CLIPBOARD: return ScraperClipboard.scrape(url, headless=self.headless) if self.scraper_method == ScraperMethod.CRAWL4AI: - return ScraperCrawl4AI.scrape(url, save_markdown=True, output_dir=settings.scraper.output_dir) + return ScraperCrawl4AI.scrape(url, save_markdown=True, output_dir=settings.scraper.get_output_dir(self.series_name)) self.logger.error(f"Unknown scraper method: {self.scraper_method}") return None diff --git a/preprocessor/scraping/character_scraper.py b/preprocessor/scraping/character_scraper.py index 67c29ebff..6c0f99a3b 100644 --- a/preprocessor/scraping/character_scraper.py +++ b/preprocessor/scraping/character_scraper.py @@ -14,6 +14,9 @@ def __init__(self, args: Dict[str, Any]): super().__init__(args) self.series_name: str = self._args.get("series_name", "") + def get_output_subdir(self) -> str: + return "scraped_pages" + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: characters = self.llm.extract_characters(scraped_pages, self.series_name) if not characters: diff --git a/preprocessor/scraping/episode_scraper.py b/preprocessor/scraping/episode_scraper.py index a9671860e..4e8ed2e3b 100644 --- a/preprocessor/scraping/episode_scraper.py +++ b/preprocessor/scraping/episode_scraper.py @@ -20,6 +20,9 @@ def __init__(self, args: Dict[str, Any]): self.expected_episodes_count: Optional[int] = self._args.get("expected_episodes_count") self.videos_dir: Optional[Path] = self._args.get("videos_dir") + def get_output_subdir(self) -> str: + return "scraped_pages" + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: all_seasons = self.llm.extract_all_seasons(scraped_pages) if not all_seasons: diff --git a/preprocessor/text_analysis/__init__.py b/preprocessor/text_analysis/__init__.py index 937e69245..e8a533c57 100644 --- 
a/preprocessor/text_analysis/__init__.py +++ b/preprocessor/text_analysis/__init__.py @@ -1,4 +1,3 @@ -from preprocessor.text_analysis.text_analyzer import TextAnalyzer from preprocessor.text_analysis.text_statistics import TextStatistics -__all__ = ["TextAnalyzer", "TextStatistics"] +__all__ = ["TextStatistics"] diff --git a/preprocessor/transcription/elevenlabs.py b/preprocessor/transcription/elevenlabs.py index 3d0520175..2ef9ec40f 100644 --- a/preprocessor/transcription/elevenlabs.py +++ b/preprocessor/transcription/elevenlabs.py @@ -10,6 +10,7 @@ Optional, ) +from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor from preprocessor.core.episode_manager import EpisodeManager from preprocessor.transcription.engines.elevenlabs_engine import ElevenLabsEngine @@ -33,6 +34,9 @@ def _validate_args(self, args: Dict[str, Any]) -> None: if not videos_path.is_dir(): raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def __init__(self, args: Dict[str, Any]): super().__init__( args=args, @@ -209,7 +213,9 @@ def __save_transcription(self, data: Dict[str, Any], video_file: Path) -> None: } json_dir = self.output_dir / "json" - output_file = self.episode_manager.build_output_path(episode_info, json_dir) + filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + season_dir = json_dir / episode_info.season_code() + output_file = season_dir / filename output_file.parent.mkdir(parents=True, exist_ok=True) with open(output_file, "w", encoding="utf-8") as f: diff --git a/preprocessor/transcription/generators/multi_format_generator.py b/preprocessor/transcription/generators/multi_format_generator.py index 88a9b76f2..d6669e452 100644 --- a/preprocessor/transcription/generators/multi_format_generator.py +++ b/preprocessor/transcription/generators/multi_format_generator.py @@ -5,8 +5,11 
@@ Dict, ) +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.output_path_builder import OutputPathBuilder from preprocessor.transcription.generators.full_json_generator import FullJsonGenerator from preprocessor.transcription.generators.segmented_json_generator import SegmentedJsonGenerator from preprocessor.transcription.generators.simple_json_generator import SimpleJsonGenerator @@ -49,11 +52,9 @@ def __process_file(self, transcription_file: Path) -> None: return filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - main_output_file = OutputPathBuilder.build_transcription_path( - episode_info, - filename, - subdir="raw", - ) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + main_output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename if main_output_file.exists(): self.logger.info(f"Skipping (already exists): {episode_info.episode_code()}") @@ -76,7 +77,10 @@ def __process_file(self, transcription_file: Path) -> None: def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename + output_file.parent.mkdir(parents=True, exist_ok=True) output_file.parent.mkdir(parents=True, exist_ok=True) generator = FullJsonGenerator(Path("."), output_file.parent, self.logger) @@ -94,7 +98,10 @@ def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: 
extension="json", suffix="segmented", ) - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename + output_file.parent.mkdir(parents=True, exist_ok=True) output_file.parent.mkdir(parents=True, exist_ok=True) generator = SegmentedJsonGenerator(Path("."), output_file.parent, self.logger) @@ -116,7 +123,10 @@ def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: extension="json", suffix="simple", ) - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename + output_file.parent.mkdir(parents=True, exist_ok=True) output_file.parent.mkdir(parents=True, exist_ok=True) generator = SimpleJsonGenerator(Path("."), output_file.parent, self.logger) @@ -134,7 +144,10 @@ def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: filename = self.episode_manager.file_naming.build_filename(episode_info, extension="srt") - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename + output_file.parent.mkdir(parents=True, exist_ok=True) output_file.parent.mkdir(parents=True, exist_ok=True) generator = SrtGenerator(Path("."), output_file.parent, self.logger) @@ -147,7 +160,10 @@ def 
__generate_srt(self, data: Dict[str, Any], episode_info) -> None: def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: filename = self.episode_manager.file_naming.build_filename(episode_info, extension="txt") - output_file = OutputPathBuilder.build_transcription_path(episode_info, filename, subdir="raw") + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename + output_file.parent.mkdir(parents=True, exist_ok=True) output_file.parent.mkdir(parents=True, exist_ok=True) generator = TxtGenerator(Path("."), output_file.parent, self.logger) diff --git a/preprocessor/transcription/processors/episode_info_processor.py b/preprocessor/transcription/processors/episode_info_processor.py index a52f20d9b..5beec534f 100644 --- a/preprocessor/transcription/processors/episode_info_processor.py +++ b/preprocessor/transcription/processors/episode_info_processor.py @@ -59,7 +59,8 @@ def __load_transcription(path: Path) -> Dict[str, Any]: def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: new_json_name = self.__episode_manager.file_naming.build_filename(episode_info, extension="json") - output_path = self.__episode_manager.build_output_path(episode_info, self.__output_path) + season_dir = self.__output_path / episode_info.season_code() + output_path = season_dir / new_json_name output_path.parent.mkdir(parents=True, exist_ok=True) result = { diff --git a/preprocessor/transcription/processors/sound_separator.py b/preprocessor/transcription/processors/sound_separator.py index 57d4e2043..5a31653c4 100644 --- a/preprocessor/transcription/processors/sound_separator.py +++ b/preprocessor/transcription/processors/sound_separator.py @@ -35,13 +35,18 @@ def __init__(self, args: Dict[str, Any]) -> None: loglevel=args.get("loglevel", 20), ) - 
self.transcription_dir = Path(self._args.get("transcription_dir", settings.transcription.output_dir)) + self.transcription_dir = Path( + self._args.get("transcription_dir", settings.transcription.get_output_dir(self.series_name)), + ) episodes_info_json = self._args.get("episodes_info_json") self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) def _validate_args(self, args: Dict[str, Any]) -> None: pass + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def _get_processing_items(self) -> List[ProcessingItem]: segmented_files = list(self.transcription_dir.rglob("**/raw/*_segmented.json")) diff --git a/preprocessor/transcription/processors/unicode_fixer.py b/preprocessor/transcription/processors/unicode_fixer.py index 2d84a903c..c616ee96b 100644 --- a/preprocessor/transcription/processors/unicode_fixer.py +++ b/preprocessor/transcription/processors/unicode_fixer.py @@ -24,13 +24,18 @@ def __init__(self, args: Dict[str, Any]) -> None: loglevel=args.get("loglevel", 20), ) - self.transcription_jsons = Path(self._args.get("transcription_jsons", settings.transcription.output_dir)) + self.transcription_jsons = Path( + self._args.get("transcription_jsons", settings.transcription.get_output_dir(self.series_name)), + ) episodes_info_json = self._args.get("episodes_info_json") self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) def _validate_args(self, args: Dict[str, Any]) -> None: pass + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def _get_processing_items(self) -> List[ProcessingItem]: transcription_files = list(self.transcription_jsons.rglob("*.json")) diff --git a/preprocessor/utils/detection_io.py b/preprocessor/utils/detection_io.py index 6826786d9..72ed5c972 100644 --- a/preprocessor/utils/detection_io.py +++ b/preprocessor/utils/detection_io.py @@ -7,10 +7,10 @@ Optional, ) -from preprocessor.characters.face_detection_utils import 
detect_characters_in_frame +from preprocessor.characters.face.face_detection_utils import detect_characters_in_frame from preprocessor.config.config import settings from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json from preprocessor.utils.metadata_utils import create_minimal_episode_info @@ -26,8 +26,9 @@ def _parse_frame_number(frame_filename: str) -> Optional[int]: def save_character_detections( episode_info, results: List[Dict[str, Any]], + path_manager: Optional[PathManager] = None, fps: float = 25.0, -) -> None: +) -> Path: detections_data = { "episode_info": create_minimal_episode_info(episode_info), "video_metadata": { @@ -36,24 +37,26 @@ def save_character_detections( "detections": results, } - file_naming = FileNamingConventions(episode_info.series_name) + series_name = episode_info.series_name or "unknown" + file_naming = FileNamingConventions(series_name) + detections_filename = file_naming.build_filename( episode_info, extension="json", suffix="character_detections", ) - detections_output = OutputPathBuilder.build_output_path( + + if path_manager is None: + path_manager = PathManager(series_name) + + detections_output = path_manager.build_path( episode_info, settings.output_subdirs.character_detections, detections_filename, ) atomic_write_json(detections_output, detections_data, indent=2, ensure_ascii=False) - frames_with_chars = sum(1 for r in results if r["characters"]) - console.print( - f"[green]✓ {episode_info.episode_code()}: {len(results)} frames, " - f"{frames_with_chars} with characters[/green]", - ) + return detections_output def process_frames_for_detection( diff --git a/preprocessor/utils/image_hash_utils.py b/preprocessor/utils/image_hash_utils.py index e788b5d37..a12c0e181 100644 --- 
a/preprocessor/utils/image_hash_utils.py +++ b/preprocessor/utils/image_hash_utils.py @@ -1,27 +1,28 @@ import json -from typing import ( - Any, - Dict, -) +from typing import Dict from preprocessor.config.config import settings from preprocessor.core.episode_manager import EpisodeInfo -from preprocessor.core.output_path_builder import OutputPathBuilder +from preprocessor.core.path_manager import PathManager -def load_image_hashes_for_episode(episode_info_dict: Dict[str, Any], logger=None) -> Dict[int, str]: +def load_image_hashes_for_episode( + episode_info_dict: Dict[str, int], + series_name: str, + logger=None, +) -> Dict[int, str]: season = episode_info_dict.get("season") episode = episode_info_dict.get("episode_number") if season is None or episode is None: return {} - episode_info = EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", + path_manager = PathManager(series_name) + episode_info = EpisodeInfo.create_minimal(season, episode, series_name) + + hashes_episode_dir = path_manager.get_episode_dir( + episode_info, + settings.output_subdirs.image_hashes, ) - hashes_episode_dir = OutputPathBuilder.get_episode_dir(episode_info, settings.output_subdirs.image_hashes) hash_files = list(hashes_episode_dir.glob("*_image_hashes.json")) if not hash_files: diff --git a/preprocessor/validation/episode_stats.py b/preprocessor/validation/episode_stats.py index db2109cff..62ff86764 100644 --- a/preprocessor/validation/episode_stats.py +++ b/preprocessor/validation/episode_stats.py @@ -11,16 +11,17 @@ Tuple, ) -from preprocessor.config.config import settings +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) from preprocessor.core.constants import ( + DEFAULT_VIDEO_EXTENSION, OUTPUT_FILE_NAMES, OUTPUT_FILE_PATTERNS, ) -from preprocessor.core.episode_manager import ( - EpisodeInfo, - EpisodeManager, -) -from preprocessor.core.output_path_builder import OutputPathBuilder +from 
preprocessor.core.episode_manager import EpisodeInfo +from preprocessor.core.path_manager import PathManager from preprocessor.validation.base_result import ValidationStatusMixin from preprocessor.validation.file_validators import ( validate_image_file, @@ -75,7 +76,7 @@ def collect_stats(self): self.__validate_other_files() def __validate_transcription(self): - transcriptions_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.transcriptions) + transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.transcriptions) base_name = f"{self.series_name}_{self.episode_info.episode_code()}" raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw @@ -165,7 +166,7 @@ def __validate_sound_events(self, sound_events_file): self.warnings.append(f"Invalid sound events JSON: {result.error_message}") def __validate_exported_frames(self): - frames_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.frames) + frames_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.frames) if not frames_dir.exists(): self.warnings.append(f"Missing {settings.output_subdirs.frames} directory: {frames_dir}") return @@ -200,7 +201,10 @@ def __validate_exported_frames(self): self.exported_frames_avg_resolution = most_common_res def __validate_video(self): - video_file = OutputPathBuilder.build_video_path(self.episode_info, self.series_name) + filename = f"{self.series_name.lower()}_{self.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}" + season_dir = get_base_output_dir(self.series_name) / settings.output_subdirs.video / self.episode_info.season_code() + video_file = season_dir / filename + if not video_file.exists(): self.warnings.append(f"Missing video file: {video_file}") return @@ -216,7 +220,7 @@ def __validate_video(self): self.video_resolution = (result.metadata["width"], result.metadata["height"]) def 
__validate_scenes(self): - scenes_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.scenes) + scenes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.scenes) scenes_file = scenes_dir / f"{self.series_name}_{self.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" if not scenes_file.exists(): self.errors.append(f"Missing scenes file: {scenes_file}") @@ -240,7 +244,7 @@ def __validate_scenes(self): self.errors.append(f"Error reading scenes: {e}") def __validate_image_hashes(self): - hashes_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.image_hashes) + hashes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.image_hashes) if not hashes_dir.exists(): self.warnings.append(f"Missing {settings.output_subdirs.image_hashes} directory") return @@ -263,7 +267,7 @@ def __validate_image_hashes(self): self.__check_size_anomalies(sizes, "image_hashes") def __validate_character_visualizations(self): - viz_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.character_visualizations) + viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.character_visualizations) if not viz_dir.exists(): return @@ -285,7 +289,7 @@ def __validate_character_visualizations(self): self.warnings.append(f"{invalid_count} invalid character visualization images found") def __validate_face_clusters(self): - clusters_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.face_clusters) + clusters_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.face_clusters) if not clusters_dir.exists(): return @@ -331,7 +335,7 @@ def __validate_face_clusters(self): self.errors.append(f"Error reading face clustering metadata: {e}") def __validate_object_detections(self): - detections_dir = 
EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.object_detections) + detections_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.object_detections) if not detections_dir.exists(): self.warnings.append(f"Missing {settings.output_subdirs.object_detections} directory") return @@ -354,7 +358,7 @@ def __validate_object_detections(self): self.__check_size_anomalies(sizes, "object_detections") def __validate_object_visualizations(self): - viz_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.object_visualizations) + viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.object_visualizations) if not viz_dir.exists(): return @@ -425,14 +429,14 @@ def __check_size_anomalies(self, sizes: List[int], folder_name: str, threshold: ) def __validate_other_files(self): - char_detections_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.character_detections) + char_detections_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.character_detections) detections_file = char_detections_dir / OUTPUT_FILE_NAMES["detections"] if detections_file.exists(): result = validate_json_file(detections_file) if not result.is_valid: self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") - embeddings_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.embeddings) + embeddings_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.embeddings) if embeddings_dir.exists(): embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES["embeddings_text"] if embeddings_file.exists(): @@ -452,7 +456,7 @@ def __validate_other_files(self): ] found_elastic_docs = False for subdir in elastic_subdirs: - elastic_docs_dir = EpisodeManager.get_episode_subdir( + elastic_docs_dir = 
PathManager(self.series_name).get_episode_dir( self.episode_info, f"{settings.output_subdirs.elastic_documents}/{subdir}", ) @@ -468,7 +472,7 @@ def __validate_other_files(self): if not found_elastic_docs: self.warnings.append(f"Missing {settings.output_subdirs.elastic_documents} directory") - transcriptions_dir = EpisodeManager.get_episode_subdir(self.episode_info, settings.output_subdirs.transcriptions) + transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.transcriptions) if transcriptions_dir.exists(): clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean text_stats_file = clean_dir / f"{self.series_name}_{self.episode_info.episode_code()}_text_stats.json" diff --git a/preprocessor/validation/global_validator.py b/preprocessor/validation/global_validator.py index eef9cd9cd..e9ed67bb8 100644 --- a/preprocessor/validation/global_validator.py +++ b/preprocessor/validation/global_validator.py @@ -1,7 +1,6 @@ from pathlib import Path from typing import List -from preprocessor.config.config import BASE_OUTPUT_DIR from preprocessor.validation.base_result import BaseValidationResult from preprocessor.validation.file_validators import ( validate_image_file, @@ -17,7 +16,7 @@ class GlobalValidator: def __init__( self, series_name: str, - base_output_dir: Path = BASE_OUTPUT_DIR, + base_output_dir: Path, ): self.series_name = series_name self.base_output_dir = base_output_dir diff --git a/preprocessor/validation/validator.py b/preprocessor/validation/validator.py index ef160e6b4..ee596f4ea 100644 --- a/preprocessor/validation/validator.py +++ b/preprocessor/validation/validator.py @@ -8,10 +8,7 @@ from rich.console import Console from rich.progress import track -from preprocessor.config.config import ( - BASE_OUTPUT_DIR, - settings, -) +from preprocessor.config.config import settings from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.file_naming import 
FileNamingConventions from preprocessor.utils.file_utils import atomic_write_json @@ -28,7 +25,7 @@ def __init__( season: str, series_name: str = "ranczo", anomaly_threshold: float = 20.0, - base_output_dir: Path = BASE_OUTPUT_DIR, + base_output_dir: Path = None, episodes_info_json: Optional[Path] = None, ): self.season = season diff --git a/preprocessor/video/helpers/__init__.py b/preprocessor/video/helpers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/video/base_video_processor.py b/preprocessor/video/helpers/base_video_processor.py similarity index 100% rename from preprocessor/video/base_video_processor.py rename to preprocessor/video/helpers/base_video_processor.py diff --git a/preprocessor/video/frame_processor.py b/preprocessor/video/helpers/frame_processor.py similarity index 95% rename from preprocessor/video/frame_processor.py rename to preprocessor/video/helpers/frame_processor.py index 717b17766..7d5856ac5 100644 --- a/preprocessor/video/frame_processor.py +++ b/preprocessor/video/helpers/frame_processor.py @@ -26,7 +26,9 @@ def __init__(self, args: Dict[str, Any]): loglevel=logging.DEBUG, ) - self.frames_dir: Path = Path(self._args.get("frames_dir", settings.frame_export.output_dir)) + self.frames_dir: Path = Path( + self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)) + ) self.ramdisk_path: Path = Path(self._args.get("ramdisk_path", "/dev/shm")) episodes_info_json = self._args.get("episodes_info_json") @@ -37,6 +39,9 @@ def __init__(self, args: Dict[str, Any]): def _validate_args(self, args: Dict[str, Any]) -> None: pass + def get_output_subdir(self) -> str: + return settings.output_subdirs.frames + def add_sub_processor(self, processor: 'FrameSubProcessor') -> None: self.sub_processors.append(processor) diff --git a/preprocessor/video/subprocessors/__init__.py b/preprocessor/video/subprocessors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git 
a/preprocessor/video/emotion_detection_subprocessor.py b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py similarity index 90% rename from preprocessor/video/emotion_detection_subprocessor.py rename to preprocessor/video/subprocessors/emotion_detection_subprocessor.py index 9ae007dc3..d04704aac 100644 --- a/preprocessor/video/emotion_detection_subprocessor.py +++ b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py @@ -16,6 +16,7 @@ ) from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.emotion_utils import ( crop_face_from_frame, @@ -24,7 +25,7 @@ ) from preprocessor.utils.error_handling_logger import ErrorHandlingLogger from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.video.frame_processor import FrameSubProcessor +from preprocessor.video.helpers.frame_processor import FrameSubProcessor class EmotionDetectionSubProcessor(FrameSubProcessor): @@ -46,14 +47,14 @@ def finalize(self) -> None: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) marker_file = episode_dir / ".emotion_complete" return [OutputSpec(path=marker_file, required=True)] def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: episode_info = item.metadata["episode_info"] series_name = item.metadata["series_name"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + episode_dir = PathManager(episode_info.series_name or 
"unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) file_naming = FileNamingConventions(series_name) detections_filename = file_naming.build_filename( @@ -77,7 +78,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pyl episode_info = item.metadata["episode_info"] series_name = item.metadata["series_name"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) file_naming = FileNamingConventions(series_name) detections_filename = file_naming.build_filename( diff --git a/preprocessor/video/face_clustering_subprocessor.py b/preprocessor/video/subprocessors/face_clustering_subprocessor.py similarity index 94% rename from preprocessor/video/face_clustering_subprocessor.py rename to preprocessor/video/subprocessors/face_clustering_subprocessor.py index ecff9593a..a9d9ecc52 100644 --- a/preprocessor/video/face_clustering_subprocessor.py +++ b/preprocessor/video/subprocessors/face_clustering_subprocessor.py @@ -16,7 +16,7 @@ import numpy as np import torch -from preprocessor.characters.utils import init_face_detection +from preprocessor.characters.face.utils import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( OutputSpec, @@ -24,11 +24,12 @@ ) from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.error_handling_logger import ErrorHandlingLogger from preprocessor.utils.file_utils import atomic_write_json from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor +from 
preprocessor.video.helpers.frame_processor import FrameSubProcessor class FaceClusteringSubProcessor(FrameSubProcessor): @@ -66,7 +67,7 @@ def needs_ramdisk(self) -> bool: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.face_clusters) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) series_name = item.metadata["series_name"] file_naming = FileNamingConventions(series_name) metadata_filename = file_naming.build_filename( @@ -179,7 +180,7 @@ def __save_clusters( # pylint: disable=too-many-locals all_frame_files: List[Path], series_name: str, ) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.face_clusters) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) episode_dir.mkdir(parents=True, exist_ok=True) clusters = defaultdict(list) @@ -244,7 +245,7 @@ def __save_metadata( all_frame_files: List[Path], series_name: str, ) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.face_clusters) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) n_clusters = len(set(labels)) - (1 if -1 in labels else 0) n_noise = list(labels).count(-1) diff --git a/preprocessor/video/frame_subprocessors.py b/preprocessor/video/subprocessors/frame_subprocessors.py similarity index 91% rename from preprocessor/video/frame_subprocessors.py rename to preprocessor/video/subprocessors/frame_subprocessors.py index 85b4c9e6a..229dc1f70 100644 --- a/preprocessor/video/frame_subprocessors.py +++ b/preprocessor/video/subprocessors/frame_subprocessors.py @@ -15,8 +15,8 @@ import numpy as np import torch -from 
preprocessor.characters.face_detection_utils import load_character_references -from preprocessor.characters.utils import init_face_detection +from preprocessor.characters.face.face_detection_utils import load_character_references +from preprocessor.characters.face.utils import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( OutputSpec, @@ -24,6 +24,7 @@ ) from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor from preprocessor.hashing.image_hasher import PerceptualHasher from preprocessor.utils.batch_processing_utils import ( @@ -39,7 +40,7 @@ from preprocessor.utils.file_utils import atomic_write_json from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor +from preprocessor.video.helpers.frame_processor import FrameSubProcessor # pylint: disable=duplicate-code @@ -67,7 +68,7 @@ def finalize(self) -> None: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.image_hashes) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) series_name = item.metadata["series_name"] file_naming = FileNamingConventions(series_name) hash_filename = file_naming.build_filename( @@ -101,7 +102,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: self.__save_hashes(episode_info, hash_results, series_name) def __save_hashes(self, episode_info, hash_results: List[Dict[str, Any]], series_name: str) -> None: - 
episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.image_hashes) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) episode_dir.mkdir(parents=True, exist_ok=True) hash_data = create_processing_metadata( @@ -176,7 +177,7 @@ def finalize(self) -> None: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) series_name = item.metadata["series_name"] file_naming = FileNamingConventions(series_name) video_filename = file_naming.build_filename( @@ -205,11 +206,13 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") return - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) checkpoint_file = episode_dir / "embeddings_video_checkpoint.json" + series_name = item.metadata.get("series_name", "unknown") image_hashes = load_image_hashes_for_episode( {"season": episode_info.season, "episode_number": episode_info.relative_episode}, + series_name, self.logger, ) video_embeddings = compute_embeddings_in_batches( @@ -226,7 +229,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: self.__save_embeddings(episode_info, video_embeddings, series_name) def __save_embeddings(self, episode_info, video_embeddings: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.embeddings) + episode_dir = 
PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) episode_dir.mkdir(parents=True, exist_ok=True) video_data = create_processing_metadata( @@ -290,7 +293,7 @@ def finalize(self) -> None: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) series_name = item.metadata["series_name"] file_naming = FileNamingConventions(series_name) detections_filename = file_naming.build_filename( @@ -373,7 +376,7 @@ def finalize(self) -> None: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) series_name = item.metadata["series_name"] file_naming = FileNamingConventions(series_name) detections_filename = file_naming.build_filename( @@ -481,7 +484,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # py self.__save_detections(episode_info, detections_data, series_name) def __save_detections(self, episode_info, detections_data: Dict[str, Any], series_name: str) -> None: - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) episode_dir.mkdir(parents=True, exist_ok=True) output_data = create_processing_metadata( @@ -536,13 +539,13 @@ def needs_ramdisk(self) -> bool: def 
get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_visualizations) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_visualizations) marker_file = episode_dir / ".visualization_complete" return [OutputSpec(path=marker_file, required=True)] def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) + detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) detection_files = list(detection_dir.glob("*_object_detections.json")) detection_file = detection_files[0] if detection_files else None @@ -557,7 +560,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: import cv2 # pylint: disable=import-outside-toplevel episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_detections) + detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) detection_files = list(detection_dir.glob("*_object_detections.json")) detection_file = detection_files[0] if detection_files else None @@ -577,7 +580,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: console.print(f"[yellow]No frames with detections for {episode_info.episode_code()}[/yellow]") return - output_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.object_visualizations) + output_dir = PathManager(episode_info.series_name or 
"unknown").get_episode_dir(episode_info,settings.output_subdirs.object_visualizations) output_dir.mkdir(parents=True, exist_ok=True) colors = self.__generate_colors() conf_threshold = detection_data.get("processing_params", {}).get("confidence_threshold", 0.25) @@ -656,13 +659,13 @@ def needs_ramdisk(self) -> bool: def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - episode_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_visualizations) + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) marker_file = episode_dir / ".visualization_complete" return [OutputSpec(path=marker_file, required=True)] def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) detection_files = list(detection_dir.glob("*_character_detections.json")) detection_file = detection_files[0] if detection_files else None @@ -677,7 +680,7 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # py import cv2 # pylint: disable=import-outside-toplevel episode_info = item.metadata["episode_info"] - detection_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_detections) + detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) detection_files = list(detection_dir.glob("*_character_detections.json")) detection_file = detection_files[0] if detection_files else None @@ -697,7 +700,7 @@ def process(self, item: ProcessingItem, 
ramdisk_frames_dir: Path) -> None: # py console.print(f"[yellow]No frames with character detections for {episode_info.episode_code()}[/yellow]") return - output_dir = EpisodeManager.get_episode_subdir(episode_info, settings.output_subdirs.character_visualizations) + output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) output_dir.mkdir(parents=True, exist_ok=True) all_character_names = set() From a05c153c5ef43d74c0e9863109add86043ff1609 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:22:26 +0100 Subject: [PATCH 02/89] Hoist imports to top-level; adjust pylint formatting Move EpisodeScraper and CharacterReferenceDownloader imports from inside functions to module-level imports in preprocessor/cli/pipeline/steps.py to remove import-outside-toplevel usage and clarify dependencies. In preprocessor/core/base_processor.py, adjust the pylint disable placement and reformat the conditional return for file_naming.build_filename; this is a styling/lint change with no functional behavior change. 
--- preprocessor/cli/pipeline/steps.py | 8 ++------ preprocessor/core/base_processor.py | 4 ++-- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/preprocessor/cli/pipeline/steps.py b/preprocessor/cli/pipeline/steps.py index 4030e33b7..b7d199ef9 100644 --- a/preprocessor/cli/pipeline/steps.py +++ b/preprocessor/cli/pipeline/steps.py @@ -1,10 +1,12 @@ from pathlib import Path +from preprocessor.characters.reference.reference_downloader import CharacterReferenceDownloader from preprocessor.config.config import ( get_base_output_dir, settings, ) from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS +from preprocessor.scraping.episode_scraper import EpisodeScraper from preprocessor.utils.console import console from preprocessor.video.helpers.frame_processor import FrameProcessor from preprocessor.video.subprocessors.emotion_detection_subprocessor import EmotionDetectionSubProcessor @@ -22,8 +24,6 @@ def run_scrape_step(scrape_urls, episodes_info_json, videos=None, parser_mode="normal", **_kwargs): - from preprocessor.scraping.episode_scraper import EpisodeScraper # pylint: disable=import-outside-toplevel - if not scrape_urls: return 0 @@ -85,10 +85,6 @@ def run_character_scrape_step(character_urls, characters_json, name, parser_mode def run_character_reference_download_step(name, characters_json, search_mode="normal", **_kwargs): - from preprocessor.characters.reference.reference_downloader import ( - CharacterReferenceDownloader, # pylint: disable=import-outside-toplevel - ) - if not characters_json.exists(): console.print("[yellow]No characters.json found, skipping reference download[/yellow]") return 0 diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index a3ab82bef..f2d646747 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -405,8 +405,8 @@ def _build_filename( extension: str = "json", suffix: Optional[str] = None, ) -> str: - if hasattr(self, 
'episode_manager') and self.episode_manager: - return self.episode_manager.file_naming.build_filename( # pylint: disable=no-member + if hasattr(self, 'episode_manager') and self.episode_manager: # pylint: disable=no-member + return self.episode_manager.file_naming.build_filename( episode_info, extension=extension, suffix=suffix, From 2804a738b729d3cd53cc482e639a33b072b6da37 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Feb 2026 15:43:19 +0100 Subject: [PATCH 03/89] Add safe resize and use in reference processor Introduce CharacterReferenceProcessor.__safe_resize to guard against None/empty images and OpenCV resize errors, returning None on failure. Replace direct cv2.resize calls with the safe helper in reference_processor, skip invalid faces and log warnings. Also reorder an import (init_face_detection) in reference_downloader and apply small formatting fixes (trailing commas) in processor_registry, elastic document/indexer, and frame_processor. 
--- .../reference/reference_downloader.py | 2 +- .../reference/reference_processor.py | 34 +++++++++++++++---- preprocessor/core/processor_registry.py | 2 +- .../processors/elastic_document_generator.py | 2 +- .../processors/elasticsearch_indexer.py | 2 +- preprocessor/video/helpers/frame_processor.py | 2 +- 6 files changed, 33 insertions(+), 11 deletions(-) diff --git a/preprocessor/characters/reference/reference_downloader.py b/preprocessor/characters/reference/reference_downloader.py index 6eabc14be..6946bc09b 100644 --- a/preprocessor/characters/reference/reference_downloader.py +++ b/preprocessor/characters/reference/reference_downloader.py @@ -21,10 +21,10 @@ sync_playwright, ) +from preprocessor.characters.face.utils import init_face_detection from preprocessor.characters.search.base_image_search import BaseImageSearch from preprocessor.characters.search.duckduckgo_search import DuckDuckGoImageSearch from preprocessor.characters.search.google_image_search import GoogleImageSearch -from preprocessor.characters.face.utils import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor from preprocessor.utils.console import ( diff --git a/preprocessor/characters/reference/reference_processor.py b/preprocessor/characters/reference/reference_processor.py index 154a0342b..84ba5db4c 100644 --- a/preprocessor/characters/reference/reference_processor.py +++ b/preprocessor/characters/reference/reference_processor.py @@ -76,6 +76,18 @@ def _load_resources(self) -> bool: self.face_app = init_face_detection() return True + @staticmethod + def __safe_resize(img: np.ndarray, target_size: tuple) -> Optional[np.ndarray]: + if img is None or img.size == 0: + return None + if img.shape[0] == 0 or img.shape[1] == 0: + return None + try: + return cv2.resize(img, target_size) + except cv2.error as e: + logging.error(f"OpenCV resize error: {e}") + return None + def _get_processing_items(self) -> 
List[ProcessingItem]: items = [] @@ -368,8 +380,9 @@ def __create_selection_grid( # pylint: disable=too-many-locals x = padding + face_idx * (face_size + padding) y = y_base - face_resized = cv2.resize(face_data.face_img, (face_size, face_size)) - grid[y:y + face_size, x:x + face_size] = face_resized + face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) + if face_resized is not None: + grid[y:y + face_size, x:x + face_size] = face_resized label = f"Candidate {cand_idx + 1}" cv2.putText( @@ -403,8 +416,9 @@ def __create_selection_grid( # pylint: disable=too-many-locals x = padding + col * (face_size + padding) y = padding + row * (face_size + padding) - face_resized = cv2.resize(face_data.face_img, (face_size, face_size)) - grid[y:y + face_size, x:x + face_size] = face_resized + face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) + if face_resized is not None: + grid[y:y + face_size, x:x + face_size] = face_resized label = str(idx + 1) cv2.putText( @@ -436,11 +450,17 @@ def __save_processed_references( # pylint: disable=too-many-locals face_vectors = [] for idx, face_data in enumerate(selected_faces): - face_normalized = cv2.resize( + face_normalized = self.__safe_resize( face_data.face_img, settings.character.normalized_face_size, ) + if face_normalized is None: + self.logger.warning( + f"Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)", + ) + continue + face_output_path = char_output_dir / f"face_{idx:02d}.jpg" cv2.imwrite(str(face_output_path), face_normalized) @@ -697,7 +717,9 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t if face_img is None: continue - face_resized = cv2.resize(face_img, (face_size, face_size)) + face_resized = CharacterReferenceProcessor.__safe_resize(face_img, (face_size, face_size)) + if face_resized is None: + continue x = label_col_width + stats_col_width + face_idx * face_col_width + padding y = y_offset diff --git 
a/preprocessor/core/processor_registry.py b/preprocessor/core/processor_registry.py index 171dc890e..77a982376 100644 --- a/preprocessor/core/processor_registry.py +++ b/preprocessor/core/processor_registry.py @@ -27,7 +27,7 @@ def get_processor_class(name: str) -> Type[BaseProcessor]: available = ", ".join(sorted(PROCESSOR_REGISTRY.keys())) raise ValueError( f"Unknown processor: '{name}'\n" - f"Available processors: {available}" + f"Available processors: {available}", ) return PROCESSOR_REGISTRY[name] diff --git a/preprocessor/processors/elastic_document_generator.py b/preprocessor/processors/elastic_document_generator.py index 8c0e6f8fc..936735ff7 100644 --- a/preprocessor/processors/elastic_document_generator.py +++ b/preprocessor/processors/elastic_document_generator.py @@ -68,7 +68,7 @@ def __init__(self, args: Dict[str, Any]): self.object_detections_dir: Optional[Path] = self._args.get("object_detections_dir") self.output_dir: Path = self._args.get( "output_dir", - get_base_output_dir(self.series_name) / "elastic_documents" + get_base_output_dir(self.series_name) / "elastic_documents", ) episodes_info_json = self._args.get("episodes_info_json") diff --git a/preprocessor/processors/elasticsearch_indexer.py b/preprocessor/processors/elasticsearch_indexer.py index e5be2fbba..960688448 100644 --- a/preprocessor/processors/elasticsearch_indexer.py +++ b/preprocessor/processors/elasticsearch_indexer.py @@ -48,7 +48,7 @@ def __init__(self, args: Dict[str, Any]) -> None: self.name = self._args["name"] self.elastic_documents_dir = self._args.get( "elastic_documents_dir", - get_base_output_dir(self.series_name) / "elastic_documents" + get_base_output_dir(self.series_name) / "elastic_documents", ) self.transcription_jsons = self._args.get("transcription_jsons") self.append = self._args.get("append", False) diff --git a/preprocessor/video/helpers/frame_processor.py b/preprocessor/video/helpers/frame_processor.py index 7d5856ac5..b6ac68ee6 100644 --- 
a/preprocessor/video/helpers/frame_processor.py +++ b/preprocessor/video/helpers/frame_processor.py @@ -27,7 +27,7 @@ def __init__(self, args: Dict[str, Any]): ) self.frames_dir: Path = Path( - self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)) + self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)), ) self.ramdisk_path: Path = Path(self._args.get("ramdisk_path", "/dev/shm")) From 540336ffd909d966b341870970a1c168c479dcf7 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Feb 2026 20:45:39 +0100 Subject: [PATCH 04/89] refaktor --- bot/services/reindex/reindex_service.py | 2 +- bot/types.py | 244 ++---- ...o_search.py => duckduckgo_image_search.py} | 2 +- preprocessor/characters/face/__init__.py | 0 preprocessor/characters/face/utils.py | 66 -- ...e_detection_utils.py => face_detection.py} | 85 +- .../{search => }/google_image_search.py | 2 +- .../base_image_search.py => image_search.py} | 0 preprocessor/characters/reference/__init__.py | 0 .../{reference => }/reference_downloader.py | 8 +- .../{reference => }/reference_processor.py | 2 +- preprocessor/characters/search/__init__.py | 0 preprocessor/cli/commands/detect_scenes.py | 2 +- preprocessor/cli/commands/export_frames.py | 2 +- preprocessor/cli/commands/fix_unicode.py | 2 +- .../cli/commands/generate_embeddings.py | 2 +- preprocessor/cli/commands/image_hashing.py | 2 +- .../cli/commands/import_transcriptions.py | 2 +- .../commands/process_character_references.py | 4 +- preprocessor/cli/commands/run_all.py | 2 +- preprocessor/cli/commands/search.py | 2 +- preprocessor/cli/commands/separate_sounds.py | 2 +- preprocessor/cli/commands/transcode.py | 4 +- preprocessor/cli/commands/transcribe.py | 2 +- .../cli/commands/transcribe_elevenlabs.py | 2 +- preprocessor/cli/{utils.py => helpers.py} | 0 preprocessor/cli/pipeline/orchestrator.py | 2 +- preprocessor/cli/pipeline/steps.py | 8 +- 
preprocessor/cli_utils/__init__.py | 0 .../llm.py => config/llm_provider.py} | 0 preprocessor/core/base_processor.py | 12 +- preprocessor/core/episode_file_finder.py | 8 +- preprocessor/core/episode_manager.py | 4 +- preprocessor/core/file_naming.py | 44 - preprocessor/core/path_manager.py | 37 + .../embeddings/gpu_batch_processor.py | 36 +- preprocessor/hashing/__init__.py | 1 - preprocessor/indexing/__init__.py | 0 preprocessor/processors/character_detector.py | 12 +- .../processors/elastic_document_generator.py | 2 +- .../processors/elasticsearch_indexer.py | 2 +- .../elasticsearch_manager.py} | 0 preprocessor/processors/frame_exporter.py | 23 +- .../processors/image_hash_processor.py | 2 +- preprocessor/processors/scene_detector.py | 2 +- preprocessor/processors/video_transcoder.py | 22 +- preprocessor/providers/__init__.py | 0 preprocessor/scraping/base_scraper.py | 2 +- preprocessor/search/__init__.py | 0 .../generators/full_json_generator.py | 38 - .../generators/json_generator.py | 159 ++-- .../generators/multi_format_generator.py | 13 +- .../generators/segmented_json_generator.py | 36 - .../generators/simple_json_generator.py | 39 - preprocessor/types/__init__.py | 69 ++ preprocessor/types/clip.py | 10 + preprocessor/types/detection.py | 26 + preprocessor/types/episode.py | 27 + preprocessor/types/frame.py | 10 + preprocessor/types/scene.py | 30 + preprocessor/types/search.py | 52 ++ preprocessor/types/transcription.py | 46 ++ preprocessor/types/video.py | 20 + preprocessor/utils/batch_processing_utils.py | 31 +- preprocessor/utils/batch_processor.py | 24 + preprocessor/utils/detection_io.py | 7 +- .../{hashing => utils}/image_hasher.py | 0 .../{cli_utils => utils}/resource_scope.py | 0 preprocessor/validation/validator.py | 6 +- .../{utils => video}/emotion_utils.py | 0 .../video/{helpers => }/frame_processor.py | 0 preprocessor/{utils => video}/frame_utils.py | 0 preprocessor/video/helpers/__init__.py | 0 .../video/helpers/base_video_processor.py | 45 - 
preprocessor/video/subprocessors/__init__.py | 19 + .../character_detection_subprocessor.py | 103 +++ ...er_detection_visualization_subprocessor.py | 155 ++++ .../emotion_detection_subprocessor.py | 13 +- .../face_clustering_subprocessor.py | 13 +- .../subprocessors/frame_subprocessors.py | 772 ------------------ .../subprocessors/image_hash_subprocessor.py | 118 +++ .../object_detection_subprocessor.py | 207 +++++ ...ct_detection_visualization_subprocessor.py | 141 ++++ .../video_embedding_subprocessor.py | 155 ++++ 84 files changed, 1601 insertions(+), 1443 deletions(-) rename preprocessor/characters/{search/duckduckgo_search.py => duckduckgo_image_search.py} (83%) delete mode 100644 preprocessor/characters/face/__init__.py delete mode 100644 preprocessor/characters/face/utils.py rename preprocessor/characters/{face/face_detection_utils.py => face_detection.py} (51%) rename preprocessor/characters/{search => }/google_image_search.py (92%) rename preprocessor/characters/{search/base_image_search.py => image_search.py} (100%) delete mode 100644 preprocessor/characters/reference/__init__.py rename preprocessor/characters/{reference => }/reference_downloader.py (97%) rename preprocessor/characters/{reference => }/reference_processor.py (99%) delete mode 100644 preprocessor/characters/search/__init__.py rename preprocessor/cli/{utils.py => helpers.py} (100%) delete mode 100644 preprocessor/cli_utils/__init__.py rename preprocessor/{providers/llm.py => config/llm_provider.py} (100%) delete mode 100644 preprocessor/core/file_naming.py delete mode 100644 preprocessor/hashing/__init__.py delete mode 100644 preprocessor/indexing/__init__.py rename preprocessor/{search/elastic_manager.py => processors/elasticsearch_manager.py} (100%) delete mode 100644 preprocessor/providers/__init__.py delete mode 100644 preprocessor/search/__init__.py delete mode 100644 preprocessor/transcription/generators/full_json_generator.py delete mode 100644 
preprocessor/transcription/generators/segmented_json_generator.py delete mode 100644 preprocessor/transcription/generators/simple_json_generator.py create mode 100644 preprocessor/types/__init__.py create mode 100644 preprocessor/types/clip.py create mode 100644 preprocessor/types/detection.py create mode 100644 preprocessor/types/episode.py create mode 100644 preprocessor/types/frame.py create mode 100644 preprocessor/types/scene.py create mode 100644 preprocessor/types/search.py create mode 100644 preprocessor/types/transcription.py create mode 100644 preprocessor/types/video.py create mode 100644 preprocessor/utils/batch_processor.py rename preprocessor/{hashing => utils}/image_hasher.py (100%) rename preprocessor/{cli_utils => utils}/resource_scope.py (100%) rename preprocessor/{utils => video}/emotion_utils.py (100%) rename preprocessor/video/{helpers => }/frame_processor.py (100%) rename preprocessor/{utils => video}/frame_utils.py (100%) delete mode 100644 preprocessor/video/helpers/__init__.py delete mode 100644 preprocessor/video/helpers/base_video_processor.py create mode 100644 preprocessor/video/subprocessors/character_detection_subprocessor.py create mode 100644 preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/frame_subprocessors.py create mode 100644 preprocessor/video/subprocessors/image_hash_subprocessor.py create mode 100644 preprocessor/video/subprocessors/object_detection_subprocessor.py create mode 100644 preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py create mode 100644 preprocessor/video/subprocessors/video_embedding_subprocessor.py diff --git a/bot/services/reindex/reindex_service.py b/bot/services/reindex/reindex_service.py index 310bc8434..017c5ac8a 100644 --- a/bot/services/reindex/reindex_service.py +++ b/bot/services/reindex/reindex_service.py @@ -22,7 +22,7 @@ from bot.services.reindex.video_path_transformer import 
VideoPathTransformer from bot.services.reindex.zip_extractor import ZipExtractor from bot.settings import settings -from preprocessor.search.elastic_manager import ElasticSearchManager +from preprocessor.processors.elasticsearch_manager import ElasticSearchManager @dataclass diff --git a/bot/types.py b/bot/types.py index c8bb18c44..e1e866ca7 100644 --- a/bot/types.py +++ b/bot/types.py @@ -1,193 +1,55 @@ -from typing import ( - Any, - Dict, - List, - NotRequired, - TypedDict, - Union, +from preprocessor.types import ( + BaseSegment, + CharacterDetectionInFrame, + ClipSegment, + Detection, + ElasticsearchAggregations, + ElasticsearchHit, + ElasticsearchHits, + ElasticsearchResponse, + ElasticsearchSegment, + EpisodeBucket, + EpisodeInfo, + EpisodeMetadata, + FrameRequest, + HashResult, + ObjectDetectionInFrame, + SceneDict, + SceneTimestamp, + SceneTimestampPoint, + SceneTimestampsData, + SeasonBucket, + SeasonInfo, + SeasonInfoDict, + SearchSegment, + TranscriptionContext, + VideoMetadata, ) - -class EpisodeInfo(TypedDict): - episode_number: int - title: str - premiere_date: str - viewership: Union[str, int, float] - - -class EpisodeMetadata(TypedDict): - season: int - episode_number: int - title: str - premiere_date: str - viewership: Union[str, int, float] - series_name: str - - -class SeasonInfo(TypedDict): - pass - - -SeasonInfoDict = Dict[str, int] - - -class BaseSegment(TypedDict): - id: int - text: str - start: float - end: float - - -class SegmentWithTimes(TypedDict): - segment_id: int - text: str - start_time: float - end_time: float - episode_metadata: EpisodeMetadata - video_path: NotRequired[str] - - -class SegmentWithScore(SegmentWithTimes): - _score: float - - -class ElasticsearchSegment(TypedDict): - segment_id: NotRequired[int] - id: NotRequired[int] - text: str - start_time: NotRequired[float] - start: NotRequired[float] - end_time: NotRequired[float] - end: NotRequired[float] - episode_metadata: NotRequired[EpisodeMetadata] - episode_info: 
NotRequired[EpisodeMetadata] - video_path: NotRequired[str] - _score: NotRequired[float] - - -class TranscriptionContext(TypedDict): - target: ElasticsearchSegment - context: List[BaseSegment] - overall_start_time: float - overall_end_time: float - - -class ClipSegment(TypedDict): - video_path: Union[str, Any] - start_time: float - end_time: float - - -class SearchSegment(TypedDict): - season: int - episode_number: int - title: str - start_time: float - end_time: float - - -class ElasticsearchHit(TypedDict): - _source: ElasticsearchSegment - _score: float - - -class ElasticsearchHits(TypedDict): - hits: List[ElasticsearchHit] - total: Dict[str, Any] - max_score: float - - -class ElasticsearchResponse(TypedDict): - hits: ElasticsearchHits - aggregations: NotRequired[Dict[str, Any]] - took: int - timed_out: bool - - -class EpisodeBucket(TypedDict): - key: int - doc_count: int - episode_metadata: Dict[str, Any] - - -class SeasonBucket(TypedDict): - key: int - doc_count: int - unique_episodes: Dict[str, int] - - -class ElasticsearchAggregations(TypedDict): - seasons: Dict[str, Union[List[SeasonBucket], int]] - unique_episodes: Dict[str, Union[List[EpisodeBucket], int]] - buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] - - -class SceneDict(TypedDict): - scene_number: int - start_frame: int - end_frame: int - start_time: float - end_time: float - fps: float - - -class FrameRequest(TypedDict): - frame: int - time: float - type: str - scene_number: NotRequired[int] - - -class HashResult(TypedDict): - frame_number: int - timestamp: float - hash: str - file_path: NotRequired[str] - - -class Detection(TypedDict): - bbox: List[int] - confidence: float - class_id: NotRequired[int] - class_name: NotRequired[str] - name: NotRequired[str] - - -class VideoMetadata(TypedDict): - width: int - height: int - fps: float - duration: float - codec: NotRequired[str] - bitrate: NotRequired[int] - - -class SceneTimestampPoint(TypedDict): - frame: int - seconds: float - - 
-class SceneTimestamp(TypedDict): - scene_number: int - start: SceneTimestampPoint - end: SceneTimestampPoint - - -class SceneTimestampsData(TypedDict): - scenes: List[SceneTimestamp] - total_scenes: NotRequired[int] - fps: NotRequired[float] - - -class CharacterDetectionInFrame(TypedDict): - name: str - confidence: float - bbox: List[int] - embedding: NotRequired[List[float]] - - -class ObjectDetectionInFrame(TypedDict): - class_name: str - class_id: int - confidence: float - bbox: List[int] +__all__ = [ + "BaseSegment", + "CharacterDetectionInFrame", + "ClipSegment", + "Detection", + "ElasticsearchAggregations", + "ElasticsearchHit", + "ElasticsearchHits", + "ElasticsearchResponse", + "ElasticsearchSegment", + "EpisodeBucket", + "EpisodeInfo", + "EpisodeMetadata", + "FrameRequest", + "HashResult", + "ObjectDetectionInFrame", + "SceneDict", + "SceneTimestamp", + "SceneTimestampPoint", + "SceneTimestampsData", + "SeasonBucket", + "SeasonInfo", + "SeasonInfoDict", + "SearchSegment", + "TranscriptionContext", + "VideoMetadata", +] \ No newline at end of file diff --git a/preprocessor/characters/search/duckduckgo_search.py b/preprocessor/characters/duckduckgo_image_search.py similarity index 83% rename from preprocessor/characters/search/duckduckgo_search.py rename to preprocessor/characters/duckduckgo_image_search.py index c01819a32..224434dc0 100644 --- a/preprocessor/characters/search/duckduckgo_search.py +++ b/preprocessor/characters/duckduckgo_image_search.py @@ -5,7 +5,7 @@ from ddgs import DDGS -from preprocessor.characters.search.base_image_search import BaseImageSearch +from preprocessor.characters.image_search import BaseImageSearch class DuckDuckGoImageSearch(BaseImageSearch): diff --git a/preprocessor/characters/face/__init__.py b/preprocessor/characters/face/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/characters/face/utils.py b/preprocessor/characters/face/utils.py deleted file mode 100644 index 
d5b0e3bb1..000000000 --- a/preprocessor/characters/face/utils.py +++ /dev/null @@ -1,66 +0,0 @@ -import os -import warnings - -from insightface.app import FaceAnalysis -import onnxruntime as ort - -from preprocessor.config.config import settings -from preprocessor.utils.console import console - - -def init_face_detection() -> FaceAnalysis: - model_root = os.getenv("INSIGHTFACE_HOME", os.path.expanduser("~/.insightface")) - - available_providers = ort.get_available_providers() - console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") - - if 'CUDAExecutionProvider' not in available_providers: - console.print("[red]✗ CUDAExecutionProvider not available in onnxruntime[/red]") - console.print("[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]") - raise RuntimeError("CUDA provider not available in onnxruntime") - - providers = [ - ( - 'CUDAExecutionProvider', { - 'device_id': 0, - 'arena_extend_strategy': 'kNextPowerOfTwo', - 'gpu_mem_limit': 8 * 1024 * 1024 * 1024, - 'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - }, - ), - ] - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime") - warnings.filterwarnings("ignore", category=FutureWarning, module="insightface") - - console.print(f"[cyan]Loading {settings.face_recognition.model_name} face detection model (GPU-only)...[/cyan]") - - try: - face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) - face_app.prepare( - ctx_id=0, - det_size=settings.face_recognition.detection_size, - det_thresh=settings.character.face_detection_threshold, - ) - except Exception as e: - console.print("[red]✗ Failed to initialize face detection on GPU[/red]") - console.print(f"[red] Error: {e}[/red]") - console.print("[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]") - raise RuntimeError("GPU required but face detection 
initialization failed") from e - - actual_providers = face_app.models['detection'].session.get_providers() - - if 'CUDAExecutionProvider' not in actual_providers: - console.print("[red]✗ CUDA provider not active after initialization[/red]") - console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") - raise RuntimeError("CUDA required but not available for face detection") - - console.print(f"[green]✓ Face detection initialized ({settings.face_recognition.model_name})[/green]") - console.print("[dim] Device: GPU (CUDA)[/dim]") - console.print(f"[dim] Detection size: {settings.face_recognition.detection_size}[/dim]") - console.print(f"[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]") - console.print(f"[dim] Model cache: {model_root}[/dim]") - - return face_app diff --git a/preprocessor/characters/face/face_detection_utils.py b/preprocessor/characters/face_detection.py similarity index 51% rename from preprocessor/characters/face/face_detection_utils.py rename to preprocessor/characters/face_detection.py index a1d94bd97..e5b1bde65 100644 --- a/preprocessor/characters/face/face_detection_utils.py +++ b/preprocessor/characters/face_detection.py @@ -1,5 +1,18 @@ +import os import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional +import cv2 +import numpy as np +from numpy.linalg import norm +from insightface.app import FaceAnalysis +import onnxruntime as ort + +from preprocessor.config.config import settings +from preprocessor.utils.console import console + +# Suppress insightface warnings warnings.filterwarnings( "ignore", message=".*estimate.*is deprecated.*", @@ -7,23 +20,63 @@ module="insightface", ) -# pylint: disable=wrong-import-position -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import cv2 -from insightface.app import FaceAnalysis -import numpy as np -from numpy.linalg import norm - -from preprocessor.utils.console import console 
-# pylint: enable=wrong-import-position +def init_face_detection() -> FaceAnalysis: + model_root = os.getenv("INSIGHTFACE_HOME", os.path.expanduser("~/.insightface")) + + available_providers = ort.get_available_providers() + console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") + + if 'CUDAExecutionProvider' not in available_providers: + console.print("[red]✗ CUDAExecutionProvider not available in onnxruntime[/red]") + console.print("[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]") + raise RuntimeError("CUDA provider not available in onnxruntime") + + providers = [ + ( + 'CUDAExecutionProvider', { + 'device_id': 0, + 'arena_extend_strategy': 'kNextPowerOfTwo', + 'gpu_mem_limit': 8 * 1024 * 1024 * 1024, + 'cudnn_conv_algo_search': 'EXHAUSTIVE', + 'do_copy_in_default_stream': True, + }, + ), + ] + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime") + warnings.filterwarnings("ignore", category=FutureWarning, module="insightface") + + console.print(f"[cyan]Loading {settings.face_recognition.model_name} face detection model (GPU-only)...[/cyan]") + + try: + face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) + face_app.prepare( + ctx_id=0, + det_size=settings.face_recognition.detection_size, + det_thresh=settings.character.face_detection_threshold, + ) + except Exception as e: + console.print("[red]✗ Failed to initialize face detection on GPU[/red]") + console.print(f"[red] Error: {e}[/red]") + console.print("[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]") + raise RuntimeError("GPU required but face detection initialization failed") from e + + actual_providers = face_app.models['detection'].session.get_providers() + + if 'CUDAExecutionProvider' not in actual_providers: + console.print("[red]✗ CUDA provider not active after initialization[/red]") + 
console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") + raise RuntimeError("CUDA required but not available for face detection") + + console.print(f"[green]✓ Face detection initialized ({settings.face_recognition.model_name})[/green]") + console.print("[dim] Device: GPU (CUDA)[/dim]") + console.print(f"[dim] Detection size: {settings.face_recognition.detection_size}[/dim]") + console.print(f"[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]") + console.print(f"[dim] Model cache: {model_root}[/dim]") + + return face_app def load_character_references( diff --git a/preprocessor/characters/search/google_image_search.py b/preprocessor/characters/google_image_search.py similarity index 92% rename from preprocessor/characters/search/google_image_search.py rename to preprocessor/characters/google_image_search.py index f64bddd0b..e2b74e6c9 100644 --- a/preprocessor/characters/search/google_image_search.py +++ b/preprocessor/characters/google_image_search.py @@ -5,7 +5,7 @@ from serpapi import GoogleSearch -from preprocessor.characters.search.base_image_search import BaseImageSearch +from preprocessor.characters.image_search import BaseImageSearch class GoogleImageSearch(BaseImageSearch): diff --git a/preprocessor/characters/search/base_image_search.py b/preprocessor/characters/image_search.py similarity index 100% rename from preprocessor/characters/search/base_image_search.py rename to preprocessor/characters/image_search.py diff --git a/preprocessor/characters/reference/__init__.py b/preprocessor/characters/reference/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/characters/reference/reference_downloader.py b/preprocessor/characters/reference_downloader.py similarity index 97% rename from preprocessor/characters/reference/reference_downloader.py rename to preprocessor/characters/reference_downloader.py index 6946bc09b..7a3b27cff 100644 --- 
a/preprocessor/characters/reference/reference_downloader.py +++ b/preprocessor/characters/reference_downloader.py @@ -21,10 +21,10 @@ sync_playwright, ) -from preprocessor.characters.face.utils import init_face_detection -from preprocessor.characters.search.base_image_search import BaseImageSearch -from preprocessor.characters.search.duckduckgo_search import DuckDuckGoImageSearch -from preprocessor.characters.search.google_image_search import GoogleImageSearch +from preprocessor.characters.face_detection import init_face_detection +from preprocessor.characters.image_search import BaseImageSearch +from preprocessor.characters.duckduckgo_image_search import DuckDuckGoImageSearch +from preprocessor.characters.google_image_search import GoogleImageSearch from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor from preprocessor.utils.console import ( diff --git a/preprocessor/characters/reference/reference_processor.py b/preprocessor/characters/reference_processor.py similarity index 99% rename from preprocessor/characters/reference/reference_processor.py rename to preprocessor/characters/reference_processor.py index 84ba5db4c..93ce1eb5f 100644 --- a/preprocessor/characters/reference/reference_processor.py +++ b/preprocessor/characters/reference_processor.py @@ -15,7 +15,7 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.characters.face.utils import init_face_detection +from preprocessor.characters.face_detection import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, diff --git a/preprocessor/characters/search/__init__.py b/preprocessor/characters/search/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/cli/commands/detect_scenes.py b/preprocessor/cli/commands/detect_scenes.py index 2dae67025..924ed53da 100644 --- a/preprocessor/cli/commands/detect_scenes.py +++ 
b/preprocessor/cli/commands/detect_scenes.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.processors.scene_detector import SceneDetector diff --git a/preprocessor/cli/commands/export_frames.py b/preprocessor/cli/commands/export_frames.py index bf1794cd5..ae15e3b91 100644 --- a/preprocessor/cli/commands/export_frames.py +++ b/preprocessor/cli/commands/export_frames.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli.utils import create_state_manager +from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import settings from preprocessor.processors.frame_exporter import FrameExporter from preprocessor.utils.resolution import Resolution diff --git a/preprocessor/cli/commands/fix_unicode.py b/preprocessor/cli/commands/fix_unicode.py index 74b2ffde9..92a684895 100644 --- a/preprocessor/cli/commands/fix_unicode.py +++ b/preprocessor/cli/commands/fix_unicode.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer diff --git a/preprocessor/cli/commands/generate_embeddings.py b/preprocessor/cli/commands/generate_embeddings.py index cc2d15eca..23007b8d8 100644 --- a/preprocessor/cli/commands/generate_embeddings.py +++ b/preprocessor/cli/commands/generate_embeddings.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.processors.embedding_generator import EmbeddingGenerator diff --git a/preprocessor/cli/commands/image_hashing.py 
b/preprocessor/cli/commands/image_hashing.py index 26ee7267b..f73c3a919 100644 --- a/preprocessor/cli/commands/image_hashing.py +++ b/preprocessor/cli/commands/image_hashing.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli.utils import create_state_manager +from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import settings from preprocessor.processors.image_hash_processor import ImageHashProcessor diff --git a/preprocessor/cli/commands/import_transcriptions.py b/preprocessor/cli/commands/import_transcriptions.py index 6ee89871a..062714b87 100644 --- a/preprocessor/cli/commands/import_transcriptions.py +++ b/preprocessor/cli/commands/import_transcriptions.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli.utils import create_state_manager +from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import settings from preprocessor.processors.transcription_importer import TranscriptionImporter from preprocessor.utils.console import console diff --git a/preprocessor/cli/commands/process_character_references.py b/preprocessor/cli/commands/process_character_references.py index 627d9ea16..588fc4b37 100644 --- a/preprocessor/cli/commands/process_character_references.py +++ b/preprocessor/cli/commands/process_character_references.py @@ -3,8 +3,8 @@ import click -from preprocessor.characters.reference.reference_processor import CharacterReferenceProcessor -from preprocessor.cli.utils import create_state_manager +from preprocessor.characters.reference_processor import CharacterReferenceProcessor +from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import settings diff --git a/preprocessor/cli/commands/run_all.py b/preprocessor/cli/commands/run_all.py index 4a2648f82..3ed1493f4 100644 --- a/preprocessor/cli/commands/run_all.py +++ b/preprocessor/cli/commands/run_all.py @@ -23,7 +23,7 @@ run_transcribe_step, run_validation_step, ) -from preprocessor.cli.utils 
import create_state_manager +from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import ( get_base_output_dir, settings, diff --git a/preprocessor/cli/commands/search.py b/preprocessor/cli/commands/search.py index d6c31dcb1..80d3eaec8 100644 --- a/preprocessor/cli/commands/search.py +++ b/preprocessor/cli/commands/search.py @@ -15,7 +15,7 @@ ) from preprocessor.config.config import settings -from preprocessor.hashing.image_hasher import PerceptualHasher +from preprocessor.utils.image_hasher import PerceptualHasher from preprocessor.utils.constants import ( ElasticsearchAggregationKeys, ElasticsearchKeys, diff --git a/preprocessor/cli/commands/separate_sounds.py b/preprocessor/cli/commands/separate_sounds.py index 1358f0e96..709521e5a 100644 --- a/preprocessor/cli/commands/separate_sounds.py +++ b/preprocessor/cli/commands/separate_sounds.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.transcription.processors.sound_separator import SoundEventSeparator diff --git a/preprocessor/cli/commands/transcode.py b/preprocessor/cli/commands/transcode.py index 4b50c3e07..a5bcae9de 100644 --- a/preprocessor/cli/commands/transcode.py +++ b/preprocessor/cli/commands/transcode.py @@ -3,8 +3,8 @@ import click -from preprocessor.cli.utils import create_state_manager -from preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.cli.helpers import create_state_manager +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import ( TranscodeConfig, settings, diff --git a/preprocessor/cli/commands/transcribe.py b/preprocessor/cli/commands/transcribe.py index a156333c3..37f51d36b 100644 --- a/preprocessor/cli/commands/transcribe.py +++ b/preprocessor/cli/commands/transcribe.py @@ -4,7 +4,7 @@ import click -from 
preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import ( TranscriptionConfig, settings, diff --git a/preprocessor/cli/commands/transcribe_elevenlabs.py b/preprocessor/cli/commands/transcribe_elevenlabs.py index ea2535a76..85149594c 100644 --- a/preprocessor/cli/commands/transcribe_elevenlabs.py +++ b/preprocessor/cli/commands/transcribe_elevenlabs.py @@ -3,7 +3,7 @@ import click -from preprocessor.cli.utils import create_state_manager +from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import settings from preprocessor.transcription.elevenlabs import ElevenLabsTranscriber from preprocessor.utils.console import console diff --git a/preprocessor/cli/utils.py b/preprocessor/cli/helpers.py similarity index 100% rename from preprocessor/cli/utils.py rename to preprocessor/cli/helpers.py diff --git a/preprocessor/cli/pipeline/orchestrator.py b/preprocessor/cli/pipeline/orchestrator.py index 72cfa6ba1..5751ff702 100644 --- a/preprocessor/cli/pipeline/orchestrator.py +++ b/preprocessor/cli/pipeline/orchestrator.py @@ -9,7 +9,7 @@ Optional, ) -from preprocessor.cli_utils.resource_scope import ResourceScope +from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import ( get_output_path, settings, diff --git a/preprocessor/cli/pipeline/steps.py b/preprocessor/cli/pipeline/steps.py index b7d199ef9..0d5cb81ea 100644 --- a/preprocessor/cli/pipeline/steps.py +++ b/preprocessor/cli/pipeline/steps.py @@ -1,6 +1,6 @@ from pathlib import Path -from preprocessor.characters.reference.reference_downloader import CharacterReferenceDownloader +from preprocessor.characters.reference_downloader import CharacterReferenceDownloader from preprocessor.config.config import ( get_base_output_dir, settings, @@ -8,10 +8,10 @@ from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS from 
preprocessor.scraping.episode_scraper import EpisodeScraper from preprocessor.utils.console import console -from preprocessor.video.helpers.frame_processor import FrameProcessor +from preprocessor.video.frame_processor import FrameProcessor from preprocessor.video.subprocessors.emotion_detection_subprocessor import EmotionDetectionSubProcessor from preprocessor.video.subprocessors.face_clustering_subprocessor import FaceClusteringSubProcessor -from preprocessor.video.subprocessors.frame_subprocessors import ( +from preprocessor.video.subprocessors import ( CharacterDetectionSubProcessor, CharacterDetectionVisualizationSubProcessor, ImageHashSubProcessor, @@ -102,7 +102,7 @@ def run_character_reference_download_step(name, characters_json, search_mode="no def run_character_reference_processing_step(name, state_manager, interactive_character_processing=False, debug_visualizations=False, **_kwargs): - from preprocessor.characters.reference.reference_processor import CharacterReferenceProcessor # pylint: disable=import-outside-toplevel + from preprocessor.characters.reference_processor import CharacterReferenceProcessor # pylint: disable=import-outside-toplevel characters_dir = settings.character.get_output_dir(name) if not characters_dir.exists() or not list(characters_dir.iterdir()): diff --git a/preprocessor/cli_utils/__init__.py b/preprocessor/cli_utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/providers/llm.py b/preprocessor/config/llm_provider.py similarity index 100% rename from preprocessor/providers/llm.py rename to preprocessor/config/llm_provider.py diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index f2d646747..1787998c5 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -18,7 +18,7 @@ FILE_SUFFIXES, SUPPORTED_VIDEO_EXTENSIONS, ) -from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager 
import PathManager from preprocessor.core.state_manager import StateManager from preprocessor.utils.console import ( console, @@ -405,15 +405,7 @@ def _build_filename( extension: str = "json", suffix: Optional[str] = None, ) -> str: - if hasattr(self, 'episode_manager') and self.episode_manager: # pylint: disable=no-member - return self.episode_manager.file_naming.build_filename( - episode_info, - extension=extension, - suffix=suffix, - ) - - file_naming = FileNamingConventions(self.series_name) - return file_naming.build_filename( + return self.path_manager.build_filename( episode_info, extension=extension, suffix=suffix, diff --git a/preprocessor/core/episode_file_finder.py b/preprocessor/core/episode_file_finder.py index 59762b919..92c7f9aea 100644 --- a/preprocessor/core/episode_file_finder.py +++ b/preprocessor/core/episode_file_finder.py @@ -10,14 +10,14 @@ ) from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS -from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager logger = logging.getLogger(__name__) class EpisodeFileFinder: def __init__(self, series_name: str): - self.file_naming = FileNamingConventions(series_name) + self.path_manager = PathManager(series_name) @staticmethod def find_video_file(episode_info, search_dir: Path) -> Optional[Path]: @@ -57,7 +57,7 @@ def find_transcription_file( return None if prefer_segmented: - segmented = season_dir / self.file_naming.build_filename( + segmented = season_dir / self.path_manager.build_filename( episode_info, extension="json", suffix="segmented", @@ -65,7 +65,7 @@ def find_transcription_file( if segmented.exists(): return segmented - regular = season_dir / self.file_naming.build_filename(episode_info, extension="json") + regular = season_dir / self.path_manager.build_filename(episode_info, extension="json") if regular.exists(): return regular diff --git a/preprocessor/core/episode_manager.py b/preprocessor/core/episode_manager.py 
index 9d6ecb3fb..66ad400e0 100644 --- a/preprocessor/core/episode_manager.py +++ b/preprocessor/core/episode_manager.py @@ -11,7 +11,7 @@ from preprocessor.core.episode_file_finder import EpisodeFileFinder from preprocessor.core.episode_parser import EpisodeInfoParser -from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager from preprocessor.utils.constants import ( EpisodeMetadataKeys, EpisodesDataKeys, @@ -60,7 +60,7 @@ class EpisodeManager: def __init__(self, episodes_info_json: Optional[Path], series_name: str): self.series_name = series_name.lower() self.episodes_data: Optional[Dict[str, Any]] = None - self.file_naming = FileNamingConventions(self.series_name) + self.path_manager = PathManager(self.series_name) self.file_finder = EpisodeFileFinder(self.series_name) self.parser = EpisodeInfoParser() diff --git a/preprocessor/core/file_naming.py b/preprocessor/core/file_naming.py deleted file mode 100644 index 8bb05deb3..000000000 --- a/preprocessor/core/file_naming.py +++ /dev/null @@ -1,44 +0,0 @@ -from pathlib import Path -from typing import Optional - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) - - -class FileNamingConventions: - def __init__(self, series_name: str): - self.series_name = series_name.lower() - - def build_base_filename(self, episode_info) -> str: - return f"{self.series_name}_{episode_info.episode_code()}" - - def build_filename( - self, - episode_info, - extension: str = "json", - suffix: Optional[str] = None, - ) -> str: - base = self.build_base_filename(episode_info) - suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" - ext = FILE_EXTENSIONS.get(extension, f".{extension}") - return f"{base}{suffix_str}{ext}" - - @staticmethod - def parse_base_filename(filename: str) -> str: - name = Path(filename).stem - for suffix_value in FILE_SUFFIXES.values(): - if name.endswith(suffix_value): - return name[:-len(suffix_value)] - return 
name - - @staticmethod - def add_suffix_to_filename(filename: str, suffix: str) -> str: - path = Path(filename) - suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" - return str(path.parent / f"{path.stem}{suffix_str}{path.suffix}") - - @staticmethod - def get_suffix(suffix_key: str) -> str: - return FILE_SUFFIXES.get(suffix_key, "") diff --git a/preprocessor/core/path_manager.py b/preprocessor/core/path_manager.py index 184cdd208..141b5cd0d 100644 --- a/preprocessor/core/path_manager.py +++ b/preprocessor/core/path_manager.py @@ -1,6 +1,11 @@ from pathlib import Path +from typing import Optional from preprocessor.config.config import get_base_output_dir +from preprocessor.core.constants import ( + FILE_EXTENSIONS, + FILE_SUFFIXES, +) class PathManager: @@ -49,3 +54,35 @@ def get_episode_dir(self, episode_info, subdir: str) -> Path: episode_dir = self._base_output_dir / subdir / season_code / episode_code episode_dir.mkdir(parents=True, exist_ok=True) return episode_dir + + def build_base_filename(self, episode_info) -> str: + return f"{self._series_name}_{episode_info.episode_code()}" + + def build_filename( + self, + episode_info, + extension: str = "json", + suffix: Optional[str] = None, + ) -> str: + base = self.build_base_filename(episode_info) + suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" + ext = FILE_EXTENSIONS.get(extension, f".{extension}") + return f"{base}{suffix_str}{ext}" + + @staticmethod + def parse_base_filename(filename: str) -> str: + name = Path(filename).stem + for suffix_value in FILE_SUFFIXES.values(): + if name.endswith(suffix_value): + return name[:-len(suffix_value)] + return name + + @staticmethod + def add_suffix_to_filename(filename: str, suffix: str) -> str: + path = Path(filename) + suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" + return str(path.parent / f"{path.stem}{suffix_str}{path.suffix}") + + @staticmethod + def get_suffix(suffix_key: str) -> str: + return 
FILE_SUFFIXES.get(suffix_key, "") \ No newline at end of file diff --git a/preprocessor/embeddings/gpu_batch_processor.py b/preprocessor/embeddings/gpu_batch_processor.py index 99d25a881..15909c423 100644 --- a/preprocessor/embeddings/gpu_batch_processor.py +++ b/preprocessor/embeddings/gpu_batch_processor.py @@ -8,6 +8,7 @@ from PIL import Image import torch +from preprocessor.utils.batch_processor import BatchProcessor from preprocessor.utils.console import console from preprocessor.utils.error_handling_logger import ErrorHandlingLogger @@ -28,6 +29,7 @@ def __init__( self.device = device self.max_vram_used = 0.0 self.vram_samples = [] + self.batch_processor = BatchProcessor(min(self.batch_size, self.progress_sub_batch_size)) def __log_vram_usage(self) -> None: if torch.cuda.is_available(): @@ -59,46 +61,46 @@ def suggest_optimal_batch_size(self, target_vram_gb: float = 21.0) -> int: return suggested - def process_images_batch( # pylint: disable=too-many-locals + def process_images_batch( self, pil_images: List[Image.Image], chunk_idx: int, ) -> List[List[float]]: - results = [] total_images = len(pil_images) - effective_batch_size = min(self.batch_size, self.progress_sub_batch_size) batch_start_time = time.time() + processed_count = 0 - for sub_idx in range(0, total_images, effective_batch_size): - sub_end = min(sub_idx + effective_batch_size, total_images) - batch_pil = pil_images[sub_idx:sub_end] + def _process_sub_batch(batch_pil: List[Image.Image]) -> List[List[float]]: + nonlocal processed_count current_batch_size = len(batch_pil) + sub_batch_start = time.time() - try: # pylint: disable=too-many-try-statements - sub_batch_start = time.time() - + try: inputs = [{"image": img} for img in batch_pil] embeddings_tensor = self.model.process(inputs, normalize=True) self.__log_vram_usage() batch_np = embeddings_tensor.cpu().numpy() del embeddings_tensor - results.extend([emb.tolist() for emb in batch_np]) + results = [emb.tolist() for emb in batch_np] del batch_np 
torch.cuda.empty_cache() + processed_count += current_batch_size if total_images > self.progress_sub_batch_size: elapsed = time.time() - sub_batch_start rate = current_batch_size / elapsed if elapsed > 0 else 0 console.print( - f" [dim cyan]→ {sub_idx + 1}-{sub_end}/{total_images} " - f"({sub_end / total_images * 100:.0f}%) - {elapsed:.1f}s ({rate:.3f} img/s)[/dim cyan]", + f" [dim cyan]→ {processed_count}/{total_images} " + f"({processed_count / total_images * 100:.0f}%) - {elapsed:.1f}s ({rate:.3f} img/s)[/dim cyan]", ) elapsed_total = time.time() - batch_start_time - if sub_end < total_images: - remaining_images = total_images - sub_end - eta = remaining_images / (sub_end / elapsed_total) if elapsed_total > 0 else 0 + remaining_images = total_images - processed_count + if processed_count > 0: + eta = remaining_images / (processed_count / elapsed_total) console.print(f" [dim]Batch ETA: {eta:.0f}s[/dim]") + + return results except RuntimeError as e: if "out of memory" in str(e).lower(): torch.cuda.empty_cache() @@ -108,7 +110,7 @@ def process_images_batch( # pylint: disable=too-many-locals ) raise e except Exception as e: - self.logger.error(f"Unexpected error in chunk {chunk_idx} sub-batch {sub_idx}-{sub_end}: {e}") + self.logger.error(f"Unexpected error in chunk {chunk_idx}: {e}") raise e - return results + return self.batch_processor.process(pil_images, _process_sub_batch) diff --git a/preprocessor/hashing/__init__.py b/preprocessor/hashing/__init__.py deleted file mode 100644 index a9bc298b9..000000000 --- a/preprocessor/hashing/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__ = ["ImageHashProcessor", "PerceptualHasher"] # pylint: disable=undefined-all-variable diff --git a/preprocessor/indexing/__init__.py b/preprocessor/indexing/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/processors/character_detector.py b/preprocessor/processors/character_detector.py index 1b03a3cae..95fb682e1 100644 --- 
a/preprocessor/processors/character_detector.py +++ b/preprocessor/processors/character_detector.py @@ -11,8 +11,10 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.characters.face.face_detection_utils import load_character_references -from preprocessor.characters.face.utils import init_face_detection +from preprocessor.characters.face_detection import ( + init_face_detection, + load_character_references, +) from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, @@ -20,7 +22,7 @@ ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager from preprocessor.core.processor_registry import register_processor from preprocessor.utils.console import console from preprocessor.utils.detection_io import ( @@ -73,8 +75,8 @@ def _get_processing_items(self) -> List[ProcessingItem]: def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - file_naming = FileNamingConventions(self.series_name) - detections_filename = file_naming.build_filename( + path_manager = PathManager(self.series_name) + detections_filename = path_manager.build_filename( episode_info, extension="json", suffix="character_detections", diff --git a/preprocessor/processors/elastic_document_generator.py b/preprocessor/processors/elastic_document_generator.py index 936735ff7..ff6cea1d7 100644 --- a/preprocessor/processors/elastic_document_generator.py +++ b/preprocessor/processors/elastic_document_generator.py @@ -8,7 +8,7 @@ Optional, ) -from bot.types import ( +from preprocessor.types import ( CharacterDetectionInFrame, EpisodeMetadata, ObjectDetectionInFrame, diff --git a/preprocessor/processors/elasticsearch_indexer.py b/preprocessor/processors/elasticsearch_indexer.py index 960688448..b18850650 100644 --- 
a/preprocessor/processors/elasticsearch_indexer.py +++ b/preprocessor/processors/elasticsearch_indexer.py @@ -23,7 +23,7 @@ from preprocessor.core.base_processor import BaseProcessor from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor -from preprocessor.search.elastic_manager import ElasticSearchManager +from preprocessor.processors.elasticsearch_manager import ElasticSearchManager from preprocessor.utils.console import console ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs diff --git a/preprocessor/search/elastic_manager.py b/preprocessor/processors/elasticsearch_manager.py similarity index 100% rename from preprocessor/search/elastic_manager.py rename to preprocessor/processors/elasticsearch_manager.py diff --git a/preprocessor/processors/frame_exporter.py b/preprocessor/processors/frame_exporter.py index dc236714a..4bf0c14bf 100644 --- a/preprocessor/processors/frame_exporter.py +++ b/preprocessor/processors/frame_exporter.py @@ -1,3 +1,4 @@ +import logging from datetime import datetime import json from pathlib import Path @@ -13,9 +14,10 @@ from PIL import Image import decord -from bot.types import FrameRequest +from preprocessor.types import FrameRequest from preprocessor.config.config import settings from preprocessor.core.base_processor import ( + BaseProcessor, OutputSpec, ProcessingItem, ) @@ -25,11 +27,10 @@ from preprocessor.embeddings.strategies.strategy_factory import KeyframeStrategyFactory from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.video.helpers.base_video_processor import BaseVideoProcessor @register_processor("export_frames") -class FrameExporter(BaseVideoProcessor): +class FrameExporter(BaseProcessor): REQUIRES = ["videos", "scene_timestamps"] PRODUCES = ["frames"] PRIORITY = 30 @@ -40,10 +41,15 @@ def __init__(self, args: Dict[str, Any]) -> None: args=args, 
class_name=self.__class__.__name__, error_exit_code=10, - input_videos_key="videos", + loglevel=logging.DEBUG, ) decord.bridge.set_bridge('native') + self.input_videos: Path = Path(self._args["videos"]) + self.subdirectory_filter: Optional[str] = None + episodes_json_path = self._args.get("episodes_info_json") + self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) + self.output_frames: Path = Path(self._args.get("output_frames", settings.frame_export.output_dir)) self.output_frames.mkdir(parents=True, exist_ok=True) @@ -62,6 +68,15 @@ def __init__(self, args: Dict[str, Any]) -> None: self.frames_per_scene, ) + def _get_processing_items(self) -> List[ProcessingItem]: + return self._create_video_processing_items( + source_path=self.input_videos, + extensions=self.get_video_glob_patterns(), + episode_manager=self.episode_manager, + skip_unparseable=True, + subdirectory_filter=self.subdirectory_filter, + ) + def _validate_args(self, args: Dict[str, Any]) -> None: if "videos" not in args: raise ValueError("videos path is required") diff --git a/preprocessor/processors/image_hash_processor.py b/preprocessor/processors/image_hash_processor.py index a55fc0640..bc891ea98 100644 --- a/preprocessor/processors/image_hash_processor.py +++ b/preprocessor/processors/image_hash_processor.py @@ -19,7 +19,7 @@ ) from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor -from preprocessor.hashing.image_hasher import PerceptualHasher +from preprocessor.utils.image_hasher import PerceptualHasher from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches from preprocessor.utils.console import console from preprocessor.utils.metadata_utils import create_processing_metadata diff --git a/preprocessor/processors/scene_detector.py b/preprocessor/processors/scene_detector.py index 82ee7ab9a..7c58a7a09 100644 --- a/preprocessor/processors/scene_detector.py +++ 
b/preprocessor/processors/scene_detector.py @@ -13,7 +13,7 @@ import torch from transnetv2_pytorch import TransNetV2 -from bot.types import SceneDict +from preprocessor.types import SceneDict from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, diff --git a/preprocessor/processors/video_transcoder.py b/preprocessor/processors/video_transcoder.py index 2d7f7e7d9..ab3470f41 100644 --- a/preprocessor/processors/video_transcoder.py +++ b/preprocessor/processors/video_transcoder.py @@ -1,4 +1,5 @@ import json +import logging import os from pathlib import Path import subprocess @@ -11,21 +12,22 @@ from preprocessor.config.config import settings from preprocessor.core.base_processor import ( + BaseProcessor, OutputSpec, ProcessingItem, ) from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION +from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor from preprocessor.utils.constants import ( FfprobeKeys, FfprobeStreamKeys, ) from preprocessor.utils.resolution import Resolution -from preprocessor.video.helpers.base_video_processor import BaseVideoProcessor @register_processor("transcode") -class VideoTranscoder(BaseVideoProcessor): +class VideoTranscoder(BaseProcessor): REQUIRES = ["videos"] PRODUCES = ["transcoded_videos"] PRIORITY = 10 @@ -36,9 +38,14 @@ def __init__(self, args: Dict[str, Any]) -> None: args=args, class_name=self.__class__.__name__, error_exit_code=3, - input_videos_key="videos", + loglevel=logging.DEBUG, ) + self.input_videos: Path = Path(self._args["videos"]) + self.subdirectory_filter: Optional[str] = None + episodes_json_path = self._args.get("episodes_info_json") + self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) + self.resolution: Resolution = self._args["resolution"] self.codec: str = str(self._args["codec"]) self.preset: str = "p7" @@ -49,6 +56,15 @@ def __init__(self, args: Dict[str, 
Any]) -> None: self.audio_bitrate_kbps: int = int(self._args.get("audio_bitrate_kbps", 128)) self.gop_size: float = float(self._args["gop_size"]) + def _get_processing_items(self) -> List[ProcessingItem]: + return self._create_video_processing_items( + source_path=self.input_videos, + extensions=self.get_video_glob_patterns(), + episode_manager=self.episode_manager, + skip_unparseable=True, + subdirectory_filter=self.subdirectory_filter, + ) + def _validate_args(self, args: Dict[str, Any]) -> None: if "videos" not in args: raise ValueError("videos path is required") diff --git a/preprocessor/providers/__init__.py b/preprocessor/providers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/scraping/base_scraper.py b/preprocessor/scraping/base_scraper.py index 5a1ea8b2b..5f911b9c8 100644 --- a/preprocessor/scraping/base_scraper.py +++ b/preprocessor/scraping/base_scraper.py @@ -16,7 +16,7 @@ ParserMode, ScraperMethod, ) -from preprocessor.providers.llm import LLMProvider +from preprocessor.config.llm_provider import LLMProvider from preprocessor.scraping.clipboard import ScraperClipboard from preprocessor.scraping.crawl4ai import ScraperCrawl4AI from preprocessor.utils.console import ( diff --git a/preprocessor/search/__init__.py b/preprocessor/search/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/transcription/generators/full_json_generator.py b/preprocessor/transcription/generators/full_json_generator.py deleted file mode 100644 index 4dd881518..000000000 --- a/preprocessor/transcription/generators/full_json_generator.py +++ /dev/null @@ -1,38 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator -from preprocessor.utils.transcription_utils import convert_words_list - - -class FullJsonGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: 
Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name - - @staticmethod - def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - - full_text = " ".join(seg.get("text", "").strip() for seg in segments) - - language_code = data.get("language", "pol") - if language_code in {"Polish", "polish"}: - language_code = "pol" - - words = [] - for seg in segments: - seg_words = seg.get("words", []) - words.extend(convert_words_list(seg_words)) - - return { - "language_code": language_code, - "language_probability": 1.0, - "text": full_text, - "words": words, - } diff --git a/preprocessor/transcription/generators/json_generator.py b/preprocessor/transcription/generators/json_generator.py index 77289644d..b6f5c6c08 100644 --- a/preprocessor/transcription/generators/json_generator.py +++ b/preprocessor/transcription/generators/json_generator.py @@ -1,80 +1,95 @@ -import json from pathlib import Path from typing import ( Any, Dict, - List, + Literal, ) -from preprocessor.core.constants import FILE_EXTENSIONS -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class JsonGenerator: - DEFAULT_KEYS_TO_REMOVE: List[str] = [ - "tokens", "no_speech_prob", "compression_ratio", "avg_logprob", "temperature", - ] - - UNICODE_TO_POLISH_MAP: Dict[str, str] = { - '\\u0105': 'ą', '\\u0107': 'ć', '\\u0119': 'ę', '\\u0142': 'ł', - '\\u0144': 'ń', '\\u00F3': 'ó', '\\u015B': 'ś', '\\u017A': 'ź', - '\\u017C': 'ż', '\\u0104': 'Ą', '\\u0106': 'Ć', '\\u0118': 'Ę', - '\\u0141': 'Ł', '\\u0143': 'Ń', '\\u00D3': 'Ó', '\\u015A': 'Ś', - '\\u0179': 'Ź', '\\u017B': 'Ż', - } - - def __init__( - self, - jsons_dir: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - extra_keys_to_remove: List[str], - ): - self.__jsons_dir: Path = jsons_dir - self.__output_dir: Path = output_dir - self.__logger: ErrorHandlingLogger = logger - self.__keys_to_remove: List[str] = 
self.DEFAULT_KEYS_TO_REMOVE + extra_keys_to_remove - - self.__output_dir.mkdir(parents=True, exist_ok=True) - - def __call__(self) -> None: - for item in self.__jsons_dir.rglob("*"): - if item.is_file() and item.suffix == FILE_EXTENSIONS["json"]: - output_path = self.__output_dir / item.name - self.__format_json(item, output_path) - - def __format_json(self, file_path: Path, output_path: Path) -> None: - try: - with file_path.open("r", encoding="utf-8") as file: - data = json.load(file) - - if "segments" in data: - data["segments"] = [self.__process_json_segment(segment) for segment in data["segments"]] - - with output_path.open("w", encoding="utf-8") as file: - json.dump({"segments": data["segments"]}, file, ensure_ascii=False, indent=4) - - self.__logger.info(f"Processed file: {file_path}") - - except Exception as e: - self.__logger.error(f"Error formatting JSON file {file_path}: {e}") - - def __process_json_segment(self, segment: Dict[str, Any]) -> Dict[str, Any]: - for key in self.__keys_to_remove: - segment.pop(key, None) - - segment["text"] = self.__replace_unicode_chars(segment.get("text", "")) - segment.update({ - "author": "", - "comment": "", - "tags": ["", ""], - "location": "", - "actors": ["", ""], - }) - return segment +from preprocessor.core.constants import ( + FILE_EXTENSIONS, + FILE_SUFFIXES, +) +from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator +from preprocessor.utils.transcription_utils import convert_words_list + + +class JsonGenerator(BaseTranscriptionGenerator): + def __init__(self, format_type: Literal["full", "simple", "segmented"], *args, **kwargs): + super().__init__(*args, **kwargs) + self.format_type = format_type + + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + pass + + def _get_output_filename(self, json_file: Path) -> str: + if self.format_type == "full": + return json_file.name + suffix = FILE_SUFFIXES[self.format_type] + return 
json_file.name.replace(FILE_EXTENSIONS["json"], f"{suffix}{FILE_EXTENSIONS['json']}") + + def convert(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.format_type == "full": + return self.convert_to_full_format(data) + elif self.format_type == "simple": + return self.convert_to_simple_format(data) + elif self.format_type == "segmented": + return self.convert_to_segmented_format(data) + else: + raise ValueError(f"Unknown format type: {self.format_type}") + + @staticmethod + def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: + segments = data.get("segments", []) + full_text = " ".join(seg.get("text", "").strip() for seg in segments) + + language_code = data.get("language", "pol") + if language_code in {"Polish", "polish"}: + language_code = "pol" + + words = [] + for seg in segments: + seg_words = seg.get("words", []) + words.extend(convert_words_list(seg_words)) + + return { + "language_code": language_code, + "language_probability": 1.0, + "text": full_text, + "words": words, + } + + @staticmethod + def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: + segments = data.get("segments", []) + result_segments = [] + + for seg in segments: + text = seg.get("text", "").strip() + seg_words = seg.get("words", []) + + speaker = "speaker_unknown" + if seg_words: + speaker = seg_words[0].get("speaker_id", "speaker_unknown") + + result_segments.append({ + "speaker": speaker, + "text": text, + }) + + return {"segments": result_segments} @staticmethod - def __replace_unicode_chars(text: str) -> str: - for unicode_char, char in JsonGenerator.UNICODE_TO_POLISH_MAP.items(): - text = text.replace(unicode_char, char) - return text + def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: + segments = data.get("segments", []) + result_segments = [] + + for seg in segments: + text = seg.get("text", "").strip() + seg_words = seg.get("words", []) + + result_segments.append({ + "text": text, + "words": 
convert_words_list(seg_words), + }) + + return {"segments": result_segments} \ No newline at end of file diff --git a/preprocessor/transcription/generators/multi_format_generator.py b/preprocessor/transcription/generators/multi_format_generator.py index d6669e452..c1552ca00 100644 --- a/preprocessor/transcription/generators/multi_format_generator.py +++ b/preprocessor/transcription/generators/multi_format_generator.py @@ -10,9 +10,7 @@ settings, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.transcription.generators.full_json_generator import FullJsonGenerator -from preprocessor.transcription.generators.segmented_json_generator import SegmentedJsonGenerator -from preprocessor.transcription.generators.simple_json_generator import SimpleJsonGenerator +from preprocessor.transcription.generators.json_generator import JsonGenerator from preprocessor.transcription.generators.srt_generator import SrtGenerator from preprocessor.transcription.generators.txt_generator import TxtGenerator from preprocessor.utils.error_handling_logger import ErrorHandlingLogger @@ -81,9 +79,8 @@ def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: episode_code = episode_info.episode_num() output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = FullJsonGenerator(Path("."), output_file.parent, self.logger) + generator = JsonGenerator("full", Path("."), output_file.parent, self.logger) full_json = generator.convert_to_full_format(data) full_json["episode_info"] = data.get("episode_info", {}) @@ -102,9 +99,8 @@ def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: episode_code = episode_info.episode_num() output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / 
season_code / episode_code / "raw" / filename output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = SegmentedJsonGenerator(Path("."), output_file.parent, self.logger) + generator = JsonGenerator("segmented", Path("."), output_file.parent, self.logger) segmented_json = generator.convert_to_segmented_format(data) segmented_json["episode_info"] = { @@ -127,9 +123,8 @@ def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: episode_code = episode_info.episode_num() output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = SimpleJsonGenerator(Path("."), output_file.parent, self.logger) + generator = JsonGenerator("simple", Path("."), output_file.parent, self.logger) simple_json = generator.convert_to_simple_format(data) simple_json["episode_info"] = { diff --git a/preprocessor/transcription/generators/segmented_json_generator.py b/preprocessor/transcription/generators/segmented_json_generator.py deleted file mode 100644 index e5f920f08..000000000 --- a/preprocessor/transcription/generators/segmented_json_generator.py +++ /dev/null @@ -1,36 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator -from preprocessor.utils.transcription_utils import convert_words_list - - -class SegmentedJsonGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], f"{FILE_SUFFIXES['segmented']}{FILE_EXTENSIONS['json']}") - - 
@staticmethod - def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - result_segments = [] - - for seg in segments: - text = seg.get("text", "").strip() - seg_words = seg.get("words", []) - - result_segments.append({ - "text": text, - "words": convert_words_list(seg_words), - }) - - return {"segments": result_segments} diff --git a/preprocessor/transcription/generators/simple_json_generator.py b/preprocessor/transcription/generators/simple_json_generator.py deleted file mode 100644 index d0848cd73..000000000 --- a/preprocessor/transcription/generators/simple_json_generator.py +++ /dev/null @@ -1,39 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator - - -class SimpleJsonGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], f"{FILE_SUFFIXES['simple']}{FILE_EXTENSIONS['json']}") - - @staticmethod - def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - result_segments = [] - - for seg in segments: - text = seg.get("text", "").strip() - seg_words = seg.get("words", []) - - speaker = "speaker_unknown" - if seg_words: - speaker = seg_words[0].get("speaker_id", "speaker_unknown") - - result_segments.append({ - "speaker": speaker, - "text": text, - }) - - return {"segments": result_segments} diff --git a/preprocessor/types/__init__.py b/preprocessor/types/__init__.py new file mode 100644 index 000000000..6d44a654a --- /dev/null +++ b/preprocessor/types/__init__.py @@ -0,0 +1,69 @@ +from .episode import ( + EpisodeInfo, + EpisodeMetadata, + SeasonInfo, + SeasonInfoDict, +) +from .frame 
import FrameRequest +from .scene import ( + SceneDict, + SceneTimestamp, + SceneTimestampPoint, + SceneTimestampsData, +) +from .clip import ClipSegment +from .detection import ( + CharacterDetectionInFrame, + Detection, + ObjectDetectionInFrame, +) +from .video import ( + HashResult, + VideoMetadata, +) +from .transcription import ( + BaseSegment, + ElasticsearchSegment, + SegmentWithScore, + SegmentWithTimes, + TranscriptionContext, +) +from .search import ( + ElasticsearchAggregations, + ElasticsearchHit, + ElasticsearchHits, + ElasticsearchResponse, + EpisodeBucket, + SearchSegment, + SeasonBucket, +) + +__all__ = [ + "EpisodeInfo", + "EpisodeMetadata", + "SeasonInfo", + "SeasonInfoDict", + "FrameRequest", + "SceneDict", + "SceneTimestamp", + "SceneTimestampPoint", + "SceneTimestampsData", + "ClipSegment", + "CharacterDetectionInFrame", + "Detection", + "ObjectDetectionInFrame", + "HashResult", + "VideoMetadata", + "BaseSegment", + "ElasticsearchSegment", + "SegmentWithScore", + "SegmentWithTimes", + "TranscriptionContext", + "ElasticsearchAggregations", + "ElasticsearchHit", + "ElasticsearchHits", + "ElasticsearchResponse", + "EpisodeBucket", + "SearchSegment", + "SeasonBucket", +] \ No newline at end of file diff --git a/preprocessor/types/clip.py b/preprocessor/types/clip.py new file mode 100644 index 000000000..4385f79e3 --- /dev/null +++ b/preprocessor/types/clip.py @@ -0,0 +1,10 @@ +from typing import ( + Any, + TypedDict, + Union, +) + +class ClipSegment(TypedDict): + video_path: Union[str, Any] + start_time: float + end_time: float diff --git a/preprocessor/types/detection.py b/preprocessor/types/detection.py new file mode 100644 index 000000000..2250d4b78 --- /dev/null +++ b/preprocessor/types/detection.py @@ -0,0 +1,26 @@ +from typing import ( + List, + NotRequired, + TypedDict, +) + +class CharacterDetectionInFrame(TypedDict): + name: str + confidence: float + bbox: List[int] + embedding: NotRequired[List[float]] + + +class 
ObjectDetectionInFrame(TypedDict): + class_name: str + class_id: int + confidence: float + bbox: List[int] + + +class Detection(TypedDict): + bbox: List[int] + confidence: float + class_id: NotRequired[int] + class_name: NotRequired[str] + name: NotRequired[str] diff --git a/preprocessor/types/episode.py b/preprocessor/types/episode.py new file mode 100644 index 000000000..9ef0f3174 --- /dev/null +++ b/preprocessor/types/episode.py @@ -0,0 +1,27 @@ +from typing import ( + Dict, + TypedDict, + Union, +) + +class EpisodeInfo(TypedDict): + episode_number: int + title: str + premiere_date: str + viewership: Union[str, int, float] + + +class EpisodeMetadata(TypedDict): + season: int + episode_number: int + title: str + premiere_date: str + viewership: Union[str, int, float] + series_name: str + + +class SeasonInfo(TypedDict): + pass + + +SeasonInfoDict = Dict[str, int] diff --git a/preprocessor/types/frame.py b/preprocessor/types/frame.py new file mode 100644 index 000000000..4009acd38 --- /dev/null +++ b/preprocessor/types/frame.py @@ -0,0 +1,10 @@ +from typing import ( + NotRequired, + TypedDict, +) + +class FrameRequest(TypedDict): + frame: int + time: float + type: str + scene_number: NotRequired[int] diff --git a/preprocessor/types/scene.py b/preprocessor/types/scene.py new file mode 100644 index 000000000..2f92d5dcf --- /dev/null +++ b/preprocessor/types/scene.py @@ -0,0 +1,30 @@ +from typing import ( + List, + NotRequired, + TypedDict, +) + +class SceneDict(TypedDict): + scene_number: int + start_frame: int + end_frame: int + start_time: float + end_time: float + fps: float + + +class SceneTimestampPoint(TypedDict): + frame: int + seconds: float + + +class SceneTimestamp(TypedDict): + scene_number: int + start: SceneTimestampPoint + end: SceneTimestampPoint + + +class SceneTimestampsData(TypedDict): + scenes: List[SceneTimestamp] + total_scenes: NotRequired[int] + fps: NotRequired[float] diff --git a/preprocessor/types/search.py b/preprocessor/types/search.py new 
file mode 100644 index 000000000..18c67fe33 --- /dev/null +++ b/preprocessor/types/search.py @@ -0,0 +1,52 @@ +from typing import ( + Any, + Dict, + List, + NotRequired, + TypedDict, + Union, +) +from .transcription import ElasticsearchSegment + +class SearchSegment(TypedDict): + season: int + episode_number: int + title: str + start_time: float + end_time: float + + +class ElasticsearchHit(TypedDict): + _source: ElasticsearchSegment + _score: float + + +class ElasticsearchHits(TypedDict): + hits: List[ElasticsearchHit] + total: Dict[str, Any] + max_score: float + + +class ElasticsearchResponse(TypedDict): + hits: ElasticsearchHits + aggregations: NotRequired[Dict[str, Any]] + took: int + timed_out: bool + + +class EpisodeBucket(TypedDict): + key: int + doc_count: int + episode_metadata: Dict[str, Any] + + +class SeasonBucket(TypedDict): + key: int + doc_count: int + unique_episodes: Dict[str, int] + + +class ElasticsearchAggregations(TypedDict): + seasons: Dict[str, Union[List[SeasonBucket], int]] + unique_episodes: Dict[str, Union[List[EpisodeBucket], int]] + buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] diff --git a/preprocessor/types/transcription.py b/preprocessor/types/transcription.py new file mode 100644 index 000000000..24a448b5d --- /dev/null +++ b/preprocessor/types/transcription.py @@ -0,0 +1,46 @@ +from typing import ( + List, + NotRequired, + TypedDict, +) +from .episode import EpisodeMetadata + +class BaseSegment(TypedDict): + id: int + text: str + start: float + end: float + + +class SegmentWithTimes(TypedDict): + segment_id: int + text: str + start_time: float + end_time: float + episode_metadata: EpisodeMetadata + video_path: NotRequired[str] + + +class SegmentWithScore(SegmentWithTimes): + _score: float + + +class ElasticsearchSegment(TypedDict): + segment_id: NotRequired[int] + id: NotRequired[int] + text: str + start_time: NotRequired[float] + start: NotRequired[float] + end_time: NotRequired[float] + end: NotRequired[float] + 
episode_metadata: NotRequired[EpisodeMetadata] + episode_info: NotRequired[EpisodeMetadata] + video_path: NotRequired[str] + _score: NotRequired[float] + + +class TranscriptionContext(TypedDict): + target: ElasticsearchSegment + context: List[BaseSegment] + overall_start_time: float + overall_end_time: float diff --git a/preprocessor/types/video.py b/preprocessor/types/video.py new file mode 100644 index 000000000..000aface2 --- /dev/null +++ b/preprocessor/types/video.py @@ -0,0 +1,20 @@ +from typing import ( + List, + NotRequired, + TypedDict, +) + +class HashResult(TypedDict): + frame_number: int + timestamp: float + hash: str + file_path: NotRequired[str] + + +class VideoMetadata(TypedDict): + width: int + height: int + fps: float + duration: float + codec: NotRequired[str] + bitrate: NotRequired[int] diff --git a/preprocessor/utils/batch_processing_utils.py b/preprocessor/utils/batch_processing_utils.py index 453885b4e..f4d04d52c 100644 --- a/preprocessor/utils/batch_processing_utils.py +++ b/preprocessor/utils/batch_processing_utils.py @@ -14,9 +14,10 @@ from PIL import Image from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.hashing.image_hasher import PerceptualHasher +from preprocessor.utils.image_hasher import PerceptualHasher +from preprocessor.utils.batch_processor import BatchProcessor from preprocessor.utils.console import console -from preprocessor.utils.frame_utils import load_frames_from_requests +from preprocessor.video.frame_utils import load_frames_from_requests from preprocessor.utils.time_utils import format_time_hms @@ -54,35 +55,35 @@ def compute_hashes_in_batches( batch_size: int, ) -> List[Dict[str, Any]]: total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - results = [] - console.print(f"[cyan]Computing hashes for {len(frame_requests)} frames in {total_chunks} batches[/cyan]") start_time = time.time() + batch_processor = BatchProcessor(batch_size) + processed_batches = 0 - for 
chunk_idx in range(total_chunks): - chunk_start = chunk_idx * batch_size - chunk_end = min(chunk_start + batch_size, len(frame_requests)) - chunk_requests = frame_requests[chunk_start:chunk_end] - - pil_images = load_frames_from_requests(frames_dir, chunk_requests) + def _process_hash_batch(batch_requests: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + nonlocal processed_batches + pil_images = load_frames_from_requests(frames_dir, batch_requests) phashes = hasher.compute_phash_batch(pil_images) - for request, phash in zip(chunk_requests, phashes): + batch_results = [] + for request, phash in zip(batch_requests, phashes): result = request.copy() result["perceptual_hash"] = phash - results.append(result) + batch_results.append(result) del pil_images - + processed_batches += 1 _report_batch_progress( - chunk_idx + 1, + processed_batches, total_chunks, - chunk_idx + 1, + processed_batches, total_chunks, start_time, ) + return batch_results + results = batch_processor.process(frame_requests, _process_hash_batch) console.print(f"[green]✓ Computed {len(results)} hashes[/green]") return results diff --git a/preprocessor/utils/batch_processor.py b/preprocessor/utils/batch_processor.py new file mode 100644 index 000000000..260c29acc --- /dev/null +++ b/preprocessor/utils/batch_processor.py @@ -0,0 +1,24 @@ +from typing import ( + Callable, + Generic, + List, + TypeVar, +) + +T = TypeVar('T') +R = TypeVar('R') + +class BatchProcessor(Generic[T, R]): + def __init__(self, batch_size: int): + self.batch_size = batch_size + + def process( + self, + items: List[T], + process_fn: Callable[[List[T]], List[R]], + ) -> List[R]: + results = [] + for i in range(0, len(items), self.batch_size): + batch = items[i:i+self.batch_size] + results.extend(process_fn(batch)) + return results diff --git a/preprocessor/utils/detection_io.py b/preprocessor/utils/detection_io.py index 72ed5c972..2ce695797 100644 --- a/preprocessor/utils/detection_io.py +++ b/preprocessor/utils/detection_io.py @@ 
-7,9 +7,8 @@ Optional, ) -from preprocessor.characters.face.face_detection_utils import detect_characters_in_frame +from preprocessor.characters.face_detection import detect_characters_in_frame from preprocessor.config.config import settings -from preprocessor.core.file_naming import FileNamingConventions from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json @@ -38,9 +37,9 @@ def save_character_detections( } series_name = episode_info.series_name or "unknown" - file_naming = FileNamingConventions(series_name) + path_manager = PathManager(series_name) - detections_filename = file_naming.build_filename( + detections_filename = path_manager.build_filename( episode_info, extension="json", suffix="character_detections", diff --git a/preprocessor/hashing/image_hasher.py b/preprocessor/utils/image_hasher.py similarity index 100% rename from preprocessor/hashing/image_hasher.py rename to preprocessor/utils/image_hasher.py diff --git a/preprocessor/cli_utils/resource_scope.py b/preprocessor/utils/resource_scope.py similarity index 100% rename from preprocessor/cli_utils/resource_scope.py rename to preprocessor/utils/resource_scope.py diff --git a/preprocessor/validation/validator.py b/preprocessor/validation/validator.py index ee596f4ea..4f1984cd0 100644 --- a/preprocessor/validation/validator.py +++ b/preprocessor/validation/validator.py @@ -10,7 +10,7 @@ from preprocessor.config.config import settings from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions +from preprocessor.core.path_manager import PathManager from preprocessor.utils.file_utils import atomic_write_json from preprocessor.validation.episode_stats import EpisodeStats from preprocessor.validation.report_generator import ReportGenerator @@ -107,8 +107,8 @@ def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]): 
"stats": stats.to_dict()["stats"], } - file_naming = FileNamingConventions(self.series_name) - report_filename = file_naming.build_filename(stats.episode_info, extension="json") + path_manager = PathManager(self.series_name) + report_filename = path_manager.build_filename(stats.episode_info, extension="json") report_path = self.validation_reports_dir / report_filename atomic_write_json(report_path, episode_report) diff --git a/preprocessor/utils/emotion_utils.py b/preprocessor/video/emotion_utils.py similarity index 100% rename from preprocessor/utils/emotion_utils.py rename to preprocessor/video/emotion_utils.py diff --git a/preprocessor/video/helpers/frame_processor.py b/preprocessor/video/frame_processor.py similarity index 100% rename from preprocessor/video/helpers/frame_processor.py rename to preprocessor/video/frame_processor.py diff --git a/preprocessor/utils/frame_utils.py b/preprocessor/video/frame_utils.py similarity index 100% rename from preprocessor/utils/frame_utils.py rename to preprocessor/video/frame_utils.py diff --git a/preprocessor/video/helpers/__init__.py b/preprocessor/video/helpers/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/video/helpers/base_video_processor.py b/preprocessor/video/helpers/base_video_processor.py deleted file mode 100644 index c706faaab..000000000 --- a/preprocessor/video/helpers/base_video_processor.py +++ /dev/null @@ -1,45 +0,0 @@ -from abc import ABC -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.core.base_processor import ( - BaseProcessor, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager - - -class BaseVideoProcessor(BaseProcessor, ABC): - def __init__( - self, - args: Dict[str, Any], - class_name: str, - error_exit_code: int, - input_videos_key: str = "videos", - subdirectory_filter: str = None, - ): - super().__init__( - args=args, - class_name=class_name, - 
error_exit_code=error_exit_code, - loglevel=logging.DEBUG, - ) - - self.input_videos: Path = Path(self._args[input_videos_key]) - self.subdirectory_filter: str = subdirectory_filter - episodes_json_path = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._create_video_processing_items( - source_path=self.input_videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=True, - subdirectory_filter=self.subdirectory_filter, - ) diff --git a/preprocessor/video/subprocessors/__init__.py b/preprocessor/video/subprocessors/__init__.py index e69de29bb..9fc496d07 100644 --- a/preprocessor/video/subprocessors/__init__.py +++ b/preprocessor/video/subprocessors/__init__.py @@ -0,0 +1,19 @@ +from .image_hash_subprocessor import ImageHashSubProcessor +from .video_embedding_subprocessor import VideoEmbeddingSubProcessor +from .character_detection_subprocessor import CharacterDetectionSubProcessor +from .object_detection_subprocessor import ObjectDetectionSubProcessor +from .object_detection_visualization_subprocessor import ObjectDetectionVisualizationSubProcessor +from .character_detection_visualization_subprocessor import CharacterDetectionVisualizationSubProcessor +from .emotion_detection_subprocessor import EmotionDetectionSubProcessor +from .face_clustering_subprocessor import FaceClusteringSubProcessor + +__all__ = [ + "ImageHashSubProcessor", + "VideoEmbeddingSubProcessor", + "CharacterDetectionSubProcessor", + "ObjectDetectionSubProcessor", + "ObjectDetectionVisualizationSubProcessor", + "CharacterDetectionVisualizationSubProcessor", + "EmotionDetectionSubProcessor", + "FaceClusteringSubProcessor", +] \ No newline at end of file diff --git a/preprocessor/video/subprocessors/character_detection_subprocessor.py b/preprocessor/video/subprocessors/character_detection_subprocessor.py new 
file mode 100644 index 000000000..8412f9e27 --- /dev/null +++ b/preprocessor/video/subprocessors/character_detection_subprocessor.py @@ -0,0 +1,103 @@ +import logging +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from insightface.app import FaceAnalysis +import numpy as np + +from preprocessor.characters.face_detection import ( + init_face_detection, + load_character_references, +) +from preprocessor.config.config import settings +from preprocessor.core.base_processor import ( + OutputSpec, + ProcessingItem, +) +from preprocessor.core.path_manager import PathManager +from preprocessor.utils.console import console +from preprocessor.utils.detection_io import ( + process_frames_for_detection, + save_character_detections, +) +from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.video.frame_processor import FrameSubProcessor + + +class CharacterDetectionSubProcessor(FrameSubProcessor): + def __init__(self, characters_dir: Path, use_gpu: bool, threshold: float): + super().__init__("Character Detection") + self.characters_dir = characters_dir + self.use_gpu = use_gpu + self.threshold = threshold + self.face_app: Optional[FaceAnalysis] = None + self.character_vectors: Dict[str, np.ndarray] = {} + self.logger = ErrorHandlingLogger("CharacterDetectionSubProcessor", logging.DEBUG, 15) + + def initialize(self) -> None: + if self.face_app is None: + console.print("[cyan]Initializing face detection...[/cyan]") + self.face_app = init_face_detection() + self.character_vectors = load_character_references(self.characters_dir, self.face_app) + console.print("[green]✓ Face detection initialized[/green]") + + def cleanup(self) -> None: + self.face_app = None + self.character_vectors = {} + + def finalize(self) -> None: + if hasattr(self, 'logger'): + self.logger.finalize() + + def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + episode_info = item.metadata["episode_info"] + 
episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) + series_name = item.metadata["series_name"] + path_manager = PathManager(series_name) + detections_filename = path_manager.build_filename( + episode_info, + extension="json", + suffix="character_detections", + ) + detections_output = episode_dir / detections_filename + return [OutputSpec(path=detections_output, required=True)] + + def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: + if not self.characters_dir.exists(): + console.print(f"[yellow]Characters directory not found: {self.characters_dir}, skipping[/yellow]") + return False + + expected = self.get_expected_outputs(item) + return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) + + def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: + self.initialize() + + if not self.character_vectors: + console.print("[yellow]No character references loaded, skipping detection[/yellow]") + return + + episode_info = item.metadata["episode_info"] + + frame_files = sorted([ + f for f in ramdisk_frames_dir.glob("*.jpg") + if f.is_file() and "frame_" in f.name + ]) + + console.print(f"[cyan]Detecting characters in {len(frame_files)} frames[/cyan]") + + fps = 25.0 + + results = process_frames_for_detection( + frame_files, + self.face_app, + self.character_vectors, + self.threshold, + fps=fps, + ) + save_character_detections(episode_info, results, fps=fps) diff --git a/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py b/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py new file mode 100644 index 000000000..37fc17770 --- /dev/null +++ b/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py @@ -0,0 +1,155 @@ +import json +import logging +from pathlib import Path +from typing import ( + Any, + Dict, + List, 
+ Set, + Tuple, +) + +import numpy as np + +from preprocessor.config.config import settings +from preprocessor.core.base_processor import ( + OutputSpec, + ProcessingItem, +) +from preprocessor.core.path_manager import PathManager +from preprocessor.utils.console import console +from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.video.frame_processor import FrameSubProcessor + + +class CharacterDetectionVisualizationSubProcessor(FrameSubProcessor): + def __init__(self): + super().__init__("Character Detection Visualization") + self.logger = ErrorHandlingLogger("CharacterDetectionVisualizationSubProcessor", logging.DEBUG, 15) + + def initialize(self) -> None: + pass + + def cleanup(self) -> None: + pass + + def finalize(self) -> None: + if hasattr(self, 'logger'): + self.logger.finalize() + + def needs_ramdisk(self) -> bool: + return False + + def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + episode_info = item.metadata["episode_info"] + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) + marker_file = episode_dir / ".visualization_complete" + return [OutputSpec(path=marker_file, required=True)] + + def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: + episode_info = item.metadata["episode_info"] + detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) + detection_files = list(detection_dir.glob("*_character_detections.json")) + detection_file = detection_files[0] if detection_files else None + + if not detection_file or not detection_file.exists(): + console.print(f"[yellow]No character detections found for {episode_info.episode_code()}, skipping visualization[/yellow]") + return False + + expected = self.get_expected_outputs(item) + return any(str(exp.path) in str(miss.path) for exp in 
expected for miss in missing_outputs) + + def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals + import cv2 # pylint: disable=import-outside-toplevel + + episode_info = item.metadata["episode_info"] + detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) + detection_files = list(detection_dir.glob("*_character_detections.json")) + detection_file = detection_files[0] if detection_files else None + + if not detection_file or not detection_file.exists(): + console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") + return + + if not ramdisk_frames_dir.exists(): + console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") + return + + with open(detection_file, 'r', encoding='utf-8') as f: + detection_data = json.load(f) + + frames_with_detections = [f for f in detection_data.get("detections", []) if f.get('characters')] + if not frames_with_detections: + console.print(f"[yellow]No frames with character detections for {episode_info.episode_code()}[/yellow]") + return + + output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) + output_dir.mkdir(parents=True, exist_ok=True) + + all_character_names = set() + for frame_data in frames_with_detections: + for char in frame_data.get('characters', []): + all_character_names.add(char['name']) + colors = self.__generate_character_colors(all_character_names) + + console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames with characters for {episode_info.episode_code()}[/cyan]") + + for frame_data in frames_with_detections: + frame_name = frame_data.get('frame_file') or frame_data.get('frame') + if not frame_name: + continue + + output_path = output_dir / frame_name + if output_path.exists(): + continue + + frame_path = ramdisk_frames_dir / 
frame_name + if not frame_path.exists(): + continue + + img = cv2.imread(str(frame_path)) + if img is None: + continue + + self.__draw_characters_on_frame(img, frame_data['characters'], colors) + cv2.imwrite(str(output_path), img) + + marker_file = output_dir / ".visualization_complete" + marker_file.write_text(f"completed: {len(frames_with_detections)} frames") + console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") + + @staticmethod + def __draw_characters_on_frame(img, characters: List[Dict[str, Any]], colors: Dict[str, Tuple[int, int, int]]) -> None: + import cv2 # pylint: disable=import-outside-toplevel + + for character in characters: + name = character['name'] + confidence = character['confidence'] + bbox = character['bbox'] + + x1, y1 = bbox['x1'], bbox['y1'] + x2, y2 = bbox['x2'], bbox['y2'] + color = colors.get(name, (0, 255, 0)) + + cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) + + label = f"{name} {confidence:.2f}" + if "emotion" in character: + emotion_label = character["emotion"]["label"] + emotion_conf = character["emotion"]["confidence"] + label += f" | {emotion_label} {emotion_conf:.2f}" + + label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) + label_y1 = max(y1 - 10, label_size[1]) + + cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) + cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + + @staticmethod + def __generate_character_colors(character_names: Set[str]) -> Dict[str, Tuple[int, int, int]]: + np.random.seed(42) + colors = {} + sorted_names = sorted(character_names) + for _, name in enumerate(sorted_names): + colors[name] = tuple(int(x) for x in np.random.randint(50, 255, 3)) + return colors diff --git a/preprocessor/video/subprocessors/emotion_detection_subprocessor.py b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py index d04704aac..bd5490a40 100644 --- 
a/preprocessor/video/subprocessors/emotion_detection_subprocessor.py +++ b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py @@ -15,17 +15,16 @@ ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console -from preprocessor.utils.emotion_utils import ( +from preprocessor.video.emotion_utils import ( crop_face_from_frame, detect_emotions_batch, init_emotion_model, ) from preprocessor.utils.error_handling_logger import ErrorHandlingLogger from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.video.helpers.frame_processor import FrameSubProcessor +from preprocessor.video.frame_processor import FrameSubProcessor class EmotionDetectionSubProcessor(FrameSubProcessor): @@ -56,8 +55,8 @@ def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> series_name = item.metadata["series_name"] episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( + path_manager = PathManager(series_name) + detections_filename = path_manager.build_filename( episode_info, extension="json", suffix="character_detections", @@ -80,8 +79,8 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pyl series_name = item.metadata["series_name"] episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( + path_manager = PathManager(series_name) + detections_filename = path_manager.build_filename( episode_info, extension="json", suffix="character_detections", diff --git 
a/preprocessor/video/subprocessors/face_clustering_subprocessor.py b/preprocessor/video/subprocessors/face_clustering_subprocessor.py index a9d9ecc52..d67c17753 100644 --- a/preprocessor/video/subprocessors/face_clustering_subprocessor.py +++ b/preprocessor/video/subprocessors/face_clustering_subprocessor.py @@ -16,20 +16,19 @@ import numpy as np import torch -from preprocessor.characters.face.utils import init_face_detection +from preprocessor.characters.face_detection import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( OutputSpec, ProcessingItem, ) from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.error_handling_logger import ErrorHandlingLogger from preprocessor.utils.file_utils import atomic_write_json from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.helpers.frame_processor import FrameSubProcessor +from preprocessor.video.frame_processor import FrameSubProcessor class FaceClusteringSubProcessor(FrameSubProcessor): @@ -69,8 +68,8 @@ def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - metadata_filename = file_naming.build_filename( + path_manager = PathManager(series_name) + metadata_filename = path_manager.build_filename( episode_info, extension="json", suffix="_face_clusters", @@ -270,8 +269,8 @@ def __save_metadata( results_key="clusters", results_data=cluster_stats, ) - file_naming = FileNamingConventions(series_name) - metadata_filename = 
file_naming.build_filename( + path_manager = PathManager(series_name) + metadata_filename = path_manager.build_filename( episode_info, extension="json", suffix="_face_clusters", diff --git a/preprocessor/video/subprocessors/frame_subprocessors.py b/preprocessor/video/subprocessors/frame_subprocessors.py deleted file mode 100644 index 229dc1f70..000000000 --- a/preprocessor/video/subprocessors/frame_subprocessors.py +++ /dev/null @@ -1,772 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, - Set, - Tuple, -) - -from insightface.app import FaceAnalysis -import numpy as np -import torch - -from preprocessor.characters.face.face_detection_utils import load_character_references -from preprocessor.characters.face.utils import init_face_detection -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.file_naming import FileNamingConventions -from preprocessor.core.path_manager import PathManager -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.hashing.image_hasher import PerceptualHasher -from preprocessor.utils.batch_processing_utils import ( - compute_embeddings_in_batches, - compute_hashes_in_batches, -) -from preprocessor.utils.console import console -from preprocessor.utils.detection_io import ( - process_frames_for_detection, - save_character_detections, -) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode -from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.helpers.frame_processor import FrameSubProcessor - -# pylint: disable=duplicate-code - - - -class 
ImageHashSubProcessor(FrameSubProcessor): - def __init__(self, device: str, batch_size: int): - super().__init__("Image Hashing") - self.device = device - self.batch_size = batch_size - self.hasher: Optional[PerceptualHasher] = None - self.logger = ErrorHandlingLogger("ImageHashSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.hasher is None: - self.hasher = PerceptualHasher(device=self.device, hash_size=8) - - def cleanup(self) -> None: - self.hasher = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - hash_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - return [OutputSpec(path=hash_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return - - hash_results = compute_hashes_in_batches(ramdisk_frames_dir, frame_requests, self.hasher, self.batch_size) - series_name = item.metadata["series_name"] - 
self.__save_hashes(episode_info, hash_results, series_name) - - def __save_hashes(self, episode_info, hash_results: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) - episode_dir.mkdir(parents=True, exist_ok=True) - - hash_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "device": self.device, - "batch_size": self.batch_size, - "hash_size": 8, - }, - statistics={ - "total_hashes": len(hash_results), - "unique_hashes": len(set(h.get("perceptual_hash") for h in hash_results if "perceptual_hash" in h)), - }, - results_key="image_hashes", - results_data=hash_results, - ) - - file_naming = FileNamingConventions(series_name) - hash_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - atomic_write_json(hash_output, hash_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved hashes to: {hash_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -class VideoEmbeddingSubProcessor(FrameSubProcessor): - def __init__(self, device: str, batch_size: int, model_name: str, model_revision: str): - super().__init__("Video Embeddings") - self.device = device - self.batch_size = batch_size - self.model_name = model_name - self.model_revision = model_revision - self.model = None - self.gpu_processor: Optional[GPUBatchProcessor] = None - self.logger = ErrorHandlingLogger("VideoEmbeddingSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.model is None: - from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder # pylint: disable=import-outside-toplevel - console.print(f"[cyan]Loading embedding model: {self.model_name}[/cyan]") - self.model = Qwen3VLEmbedder( - 
model_name_or_path=self.model_name, - torch_dtype=torch.bfloat16, - ) - self.gpu_processor = GPUBatchProcessor( - self.model, - self.batch_size, - self.logger, - self.device, - progress_sub_batch_size=settings.embedding.progress_sub_batch_size, - ) - console.print("[green]✓ Qwen3-VL-Embedding model loaded[/green]") - - def cleanup(self) -> None: - self.model = None - self.gpu_processor = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - video_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="embeddings_video", - ) - video_output = episode_dir / video_filename - return [OutputSpec(path=video_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return - - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) - checkpoint_file = episode_dir / "embeddings_video_checkpoint.json" - - series_name = item.metadata.get("series_name", 
"unknown") - image_hashes = load_image_hashes_for_episode( - {"season": episode_info.season, "episode_number": episode_info.relative_episode}, - series_name, - self.logger, - ) - video_embeddings = compute_embeddings_in_batches( - ramdisk_frames_dir, - frame_requests, - self.gpu_processor, - self.batch_size, - image_hashes, - checkpoint_file=checkpoint_file, - checkpoint_interval=20, - prefetch_count=settings.embedding.prefetch_chunks, - ) - series_name = item.metadata["series_name"] - self.__save_embeddings(episode_info, video_embeddings, series_name) - - def __save_embeddings(self, episode_info, video_embeddings: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) - episode_dir.mkdir(parents=True, exist_ok=True) - - video_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "batch_size": self.batch_size, - "device": self.device, - }, - statistics={ - "total_embeddings": len(video_embeddings), - "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, - "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), - }, - results_key="video_embeddings", - results_data=video_embeddings, - ) - file_naming = FileNamingConventions(series_name) - video_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="embeddings_video", - ) - video_output = episode_dir / video_filename - atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved embeddings to: {video_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -class CharacterDetectionSubProcessor(FrameSubProcessor): - def __init__(self, characters_dir: Path, use_gpu: 
bool, threshold: float): - super().__init__("Character Detection") - self.characters_dir = characters_dir - self.use_gpu = use_gpu - self.threshold = threshold - self.face_app: Optional[FaceAnalysis] = None - self.character_vectors: Dict[str, np.ndarray] = {} - self.logger = ErrorHandlingLogger("CharacterDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.face_app is None: - console.print("[cyan]Initializing face detection...[/cyan]") - self.face_app = init_face_detection() - self.character_vectors = load_character_references(self.characters_dir, self.face_app) - console.print("[green]✓ Face detection initialized[/green]") - - def cleanup(self) -> None: - self.face_app = None - self.character_vectors = {} - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_output = episode_dir / detections_filename - return [OutputSpec(path=detections_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - if not self.characters_dir.exists(): - console.print(f"[yellow]Characters directory not found: {self.characters_dir}, skipping[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - if not self.character_vectors: - console.print("[yellow]No 
character references loaded, skipping detection[/yellow]") - return - - episode_info = item.metadata["episode_info"] - - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - console.print(f"[cyan]Detecting characters in {len(frame_files)} frames[/cyan]") - - fps = 25.0 - - results = process_frames_for_detection( - frame_files, - self.face_app, - self.character_vectors, - self.threshold, - fps=fps, - ) - save_character_detections(episode_info, results, fps=fps) - - -class ObjectDetectionSubProcessor(FrameSubProcessor): - def __init__(self, model_name: str = "ustc-community/dfine-xlarge-obj2coco", conf_threshold: float = 0.25): - super().__init__("Object Detection") - self.model_name = model_name - self.conf_threshold = conf_threshold - self.model: Optional[Any] = None - self.image_processor: Optional[Any] = None - self.logger = ErrorHandlingLogger("ObjectDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.model is None: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
Object detection requires GPU.") - - from transformers import ( # pylint: disable=import-outside-toplevel - AutoImageProcessor, - DFineForObjectDetection, - ) - - console.print(f"[cyan]Loading D-FINE model: {self.model_name}[/cyan]") - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) - self.model = DFineForObjectDetection.from_pretrained(self.model_name) - self.model.to("cuda") - console.print("[green]✓ D-FINE model loaded on GPU[/green]") - - def cleanup(self) -> None: - self.model = None - self.image_processor = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - series_name = item.metadata["series_name"] - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="_object_detections", - ) - detections_output = episode_dir / detections_filename - return [OutputSpec(path=detections_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals - self.initialize() - - from PIL import Image # pylint: disable=import-outside-toplevel - - episode_info = item.metadata["episode_info"] - - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - if not frame_files: - console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") - return - - 
console.print(f"[cyan]Detecting objects in {len(frame_files)} frames[/cyan]") - - detections_data = { - "episode_code": episode_info.episode_code(), - "model": self.model_name, - "confidence_threshold": self.conf_threshold, - "frames": [], - } - - batch_size = 8 - for batch_start in range(0, len(frame_files), batch_size): - batch_paths = frame_files[batch_start:batch_start + batch_size] - batch_images = [Image.open(fp) for fp in batch_paths] - target_sizes = [(img.height, img.width) for img in batch_images] - - inputs = self.image_processor(images=batch_images, return_tensors="pt") - inputs = {k: v.to("cuda") for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - - results = self.image_processor.post_process_object_detection( - outputs, - target_sizes=target_sizes, - threshold=self.conf_threshold, - ) - - for frame_path, result in zip(batch_paths, results): - frame_result = { - "frame_name": frame_path.name, - "detections": [], - } - - for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): - score_value = score.item() - label = label_id.item() - box_coords = [float(i) for i in box.tolist()] - - detection = { - "class_id": label, - "class_name": self.model.config.id2label[label], - "confidence": score_value, - "bbox": { - "x1": box_coords[0], - "y1": box_coords[1], - "x2": box_coords[2], - "y2": box_coords[3], - }, - } - frame_result["detections"].append(detection) - - frame_result["detection_count"] = len(frame_result["detections"]) - detections_data["frames"].append(frame_result) - - for img in batch_images: - img.close() - - total_detections = sum(f['detection_count'] for f in detections_data['frames']) - frames_with_detections = len([f for f in detections_data['frames'] if f['detection_count'] > 0]) - - console.print(f"[green]✓ Total detections: {total_detections}[/green]") - console.print(f"[green]✓ Frames with detections: {frames_with_detections}/{len(frame_files)}[/green]") - - class_counts = {} 
- for frame in detections_data["frames"]: - for det in frame["detections"]: - class_name = det["class_name"] - class_counts[class_name] = class_counts.get(class_name, 0) + 1 - - if class_counts: - top_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:5] - console.print(f"[cyan]Top 5 classes: {', '.join(f'{cls}:{cnt}' for cls, cnt in top_classes)}[/cyan]") - - series_name = item.metadata["series_name"] - self.__save_detections(episode_info, detections_data, series_name) - - def __save_detections(self, episode_info, detections_data: Dict[str, Any], series_name: str) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - episode_dir.mkdir(parents=True, exist_ok=True) - - output_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "model": self.model_name, - "confidence_threshold": self.conf_threshold, - }, - statistics={ - "total_frames": len(detections_data["frames"]), - "total_detections": sum(f['detection_count'] for f in detections_data['frames']), - "frames_with_detections": len([f for f in detections_data['frames'] if f['detection_count'] > 0]), - }, - results_key="detections", - results_data=detections_data["frames"], - ) - file_naming = FileNamingConventions(series_name) - detections_filename = file_naming.build_filename( - episode_info, - extension="json", - suffix="_object_detections", - ) - detections_output = episode_dir / detections_filename - atomic_write_json(detections_output, output_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved object detections to: {detections_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - -class ObjectDetectionVisualizationSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Object Detection Visualization") - self.logger = 
ErrorHandlingLogger("ObjectDetectionVisualizationSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - pass - - def cleanup(self) -> None: - pass - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_visualizations) - marker_file = episode_dir / ".visualization_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No object detections found for {episode_info.episode_code()}, skipping visualization[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - import cv2 # pylint: disable=import-outside-toplevel - - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections 
JSON found in {detection_dir}[/yellow]") - return - - if not ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") - return - - with open(detection_file, 'r', encoding='utf-8') as f: - detection_data = json.load(f) - - frames_with_detections = [f for f in detection_data.get("detections", []) if f['detection_count'] > 0] - if not frames_with_detections: - console.print(f"[yellow]No frames with detections for {episode_info.episode_code()}[/yellow]") - return - - output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_visualizations) - output_dir.mkdir(parents=True, exist_ok=True) - colors = self.__generate_colors() - conf_threshold = detection_data.get("processing_params", {}).get("confidence_threshold", 0.25) - - console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames for {episode_info.episode_code()}[/cyan]") - - for frame_data in frames_with_detections: - output_path = output_dir / frame_data['frame_name'] - if output_path.exists(): - continue - - frame_path = ramdisk_frames_dir / frame_data['frame_name'] - if not frame_path.exists(): - continue - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - self.__draw_detections_on_frame(img, frame_data['detections'], colors, conf_threshold) - cv2.imwrite(str(output_path), img) - - marker_file = output_dir / ".visualization_complete" - marker_file.write_text(f"completed: {len(frames_with_detections)} frames") - console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") - - @staticmethod - def __draw_detections_on_frame(img, detections: List[Dict[str, Any]], colors: Dict[int, Tuple[int, int, int]], conf_threshold: float) -> None: - import cv2 # pylint: disable=import-outside-toplevel - - for detection in detections: - if detection['confidence'] < conf_threshold: - continue - - class_id = detection['class_id'] - bbox = 
detection['bbox'] - x1, y1 = int(bbox['x1']), int(bbox['y1']) - x2, y2 = int(bbox['x2']), int(bbox['y2']) - color = colors.get(class_id, (0, 255, 0)) - - cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - - label = f"{detection['class_name']} {detection['confidence']:.2f}" - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 = max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - - @staticmethod - def __generate_colors(num_colors: int = 80) -> Dict[int, Tuple[int, int, int]]: - np.random.seed(42) - colors = {} - for i in range(num_colors): - colors[i] = tuple(int(x) for x in np.random.randint(50, 255, 3)) - return colors - - -class CharacterDetectionVisualizationSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Character Detection Visualization") - self.logger = ErrorHandlingLogger("CharacterDetectionVisualizationSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - pass - - def cleanup(self) -> None: - pass - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) - marker_file = episode_dir / ".visualization_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - detection_files = 
list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No character detections found for {episode_info.episode_code()}, skipping visualization[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals - import cv2 # pylint: disable=import-outside-toplevel - - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - detection_files = list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") - return - - if not ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") - return - - with open(detection_file, 'r', encoding='utf-8') as f: - detection_data = json.load(f) - - frames_with_detections = [f for f in detection_data.get("detections", []) if f.get('characters')] - if not frames_with_detections: - console.print(f"[yellow]No frames with character detections for {episode_info.episode_code()}[/yellow]") - return - - output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) - output_dir.mkdir(parents=True, exist_ok=True) - - all_character_names = set() - for frame_data in frames_with_detections: - for char in frame_data.get('characters', []): - all_character_names.add(char['name']) - colors = 
self.__generate_character_colors(all_character_names) - - console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames with characters for {episode_info.episode_code()}[/cyan]") - - for frame_data in frames_with_detections: - frame_name = frame_data.get('frame_file') or frame_data.get('frame') - if not frame_name: - continue - - output_path = output_dir / frame_name - if output_path.exists(): - continue - - frame_path = ramdisk_frames_dir / frame_name - if not frame_path.exists(): - continue - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - self.__draw_characters_on_frame(img, frame_data['characters'], colors) - cv2.imwrite(str(output_path), img) - - marker_file = output_dir / ".visualization_complete" - marker_file.write_text(f"completed: {len(frames_with_detections)} frames") - console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") - - @staticmethod - def __draw_characters_on_frame(img, characters: List[Dict[str, Any]], colors: Dict[str, Tuple[int, int, int]]) -> None: - import cv2 # pylint: disable=import-outside-toplevel - - for character in characters: - name = character['name'] - confidence = character['confidence'] - bbox = character['bbox'] - - x1, y1 = bbox['x1'], bbox['y1'] - x2, y2 = bbox['x2'], bbox['y2'] - color = colors.get(name, (0, 255, 0)) - - cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - - label = f"{name} {confidence:.2f}" - if "emotion" in character: - emotion_label = character["emotion"]["label"] - emotion_conf = character["emotion"]["confidence"] - label += f" | {emotion_label} {emotion_conf:.2f}" - - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 = max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - - @staticmethod - def 
__generate_character_colors(character_names: Set[str]) -> Dict[str, Tuple[int, int, int]]: - np.random.seed(42) - colors = {} - sorted_names = sorted(character_names) - for _, name in enumerate(sorted_names): - colors[name] = tuple(int(x) for x in np.random.randint(50, 255, 3)) - return colors diff --git a/preprocessor/video/subprocessors/image_hash_subprocessor.py b/preprocessor/video/subprocessors/image_hash_subprocessor.py new file mode 100644 index 000000000..661dcb00a --- /dev/null +++ b/preprocessor/video/subprocessors/image_hash_subprocessor.py @@ -0,0 +1,118 @@ +import gc +import logging +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +import torch + +from preprocessor.config.config import settings +from preprocessor.core.base_processor import ( + OutputSpec, + ProcessingItem, +) +from preprocessor.core.path_manager import PathManager +from preprocessor.utils.image_hasher import PerceptualHasher +from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches +from preprocessor.utils.console import console +from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.utils.file_utils import atomic_write_json +from preprocessor.utils.metadata_utils import create_processing_metadata +from preprocessor.video.frame_processor import FrameSubProcessor +import json + + +class ImageHashSubProcessor(FrameSubProcessor): + def __init__(self, device: str, batch_size: int): + super().__init__("Image Hashing") + self.device = device + self.batch_size = batch_size + self.hasher: Optional[PerceptualHasher] = None + self.logger = ErrorHandlingLogger("ImageHashSubProcessor", logging.DEBUG, 15) + + def initialize(self) -> None: + if self.hasher is None: + self.hasher = PerceptualHasher(device=self.device, hash_size=8) + + def cleanup(self) -> None: + self.hasher = None + self.__cleanup_memory() + + def finalize(self) -> None: + if hasattr(self, 'logger'): + self.logger.finalize() + + def 
get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + episode_info = item.metadata["episode_info"] + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) + series_name = item.metadata["series_name"] + path_manager = PathManager(series_name) + hash_filename = path_manager.build_filename( + episode_info, + extension="json", + suffix="image_hashes", + ) + hash_output = episode_dir / hash_filename + return [OutputSpec(path=hash_output, required=True)] + + def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: + expected = self.get_expected_outputs(item) + return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) + + def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: + self.initialize() + + metadata_file = item.input_path + episode_info = item.metadata["episode_info"] + + with open(metadata_file, "r", encoding="utf-8") as f: + metadata = json.load(f) + + frame_requests = metadata.get("frames", []) + if not frame_requests: + console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") + return + + hash_results = compute_hashes_in_batches(ramdisk_frames_dir, frame_requests, self.hasher, self.batch_size) + series_name = item.metadata["series_name"] + self.__save_hashes(episode_info, hash_results, series_name) + + def __save_hashes(self, episode_info, hash_results: List[Dict[str, Any]], series_name: str) -> None: + episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) + episode_dir.mkdir(parents=True, exist_ok=True) + + hash_data = create_processing_metadata( + episode_info=episode_info, + processing_params={ + "device": self.device, + "batch_size": self.batch_size, + "hash_size": 8, + }, + statistics={ + "total_hashes": len(hash_results), + "unique_hashes": len(set(h.get("perceptual_hash") for h 
class ObjectDetectionSubProcessor(FrameSubProcessor):
    """Run D-FINE object detection over an episode's exported frames.

    The per-frame detections are written as one JSON document inside the
    episode directory for ``settings.output_subdirs.object_detections``.
    """

    def __init__(self, model_name: str = "ustc-community/dfine-xlarge-obj2coco", conf_threshold: float = 0.25):
        """Store the configuration; the model itself is loaded by ``initialize``.

        Args:
            model_name: Identifier passed to ``from_pretrained`` for both the
                image processor and the detection model.
            conf_threshold: Threshold handed to the image processor's
                ``post_process_object_detection``.
        """
        super().__init__("Object Detection")
        self.model_name = model_name
        self.conf_threshold = conf_threshold
        self.model: Optional[Any] = None
        self.image_processor: Optional[Any] = None
        self.logger = ErrorHandlingLogger("ObjectDetectionSubProcessor", logging.DEBUG, 15)
        self.batch_processor = BatchProcessor(8)

    def initialize(self) -> None:
        """Load the image processor and model onto the GPU; no-op once loaded.

        Raises:
            RuntimeError: If CUDA is not available.
        """
        if self.model is not None:
            return
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA is not available. Object detection requires GPU.")

        from transformers import (  # pylint: disable=import-outside-toplevel
            AutoImageProcessor,
            DFineForObjectDetection,
        )

        console.print(f"[cyan]Loading D-FINE model: {self.model_name}[/cyan]")
        self.image_processor = AutoImageProcessor.from_pretrained(self.model_name)
        self.model = DFineForObjectDetection.from_pretrained(self.model_name)
        self.model.to("cuda")
        console.print("[green]✓ D-FINE model loaded on GPU[/green]")

    def cleanup(self) -> None:
        """Drop the model references and release cached GPU memory."""
        self.model = None
        self.image_processor = None
        self.__cleanup_memory()

    def finalize(self) -> None:
        """Finalize the error logger if the instance has one."""
        if hasattr(self, 'logger'):
            self.logger.finalize()

    def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]:
        """Return the single required output: the episode's detections JSON."""
        detections_output = self.__detections_output_path(
            item.metadata["episode_info"],
            item.metadata["series_name"],
        )
        return [OutputSpec(path=detections_output, required=True)]

    def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool:
        """Return True when the detections JSON is among ``missing_outputs``."""
        # Whole-path equality; the previous substring test on stringified paths
        # also matched unrelated files whose path merely contained this one.
        missing_paths = {Path(spec.path) for spec in missing_outputs}
        return any(Path(spec.path) in missing_paths for spec in self.get_expected_outputs(item))

    def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None:
        """Detect objects in every ``frame_*.jpg`` of the directory and save them.

        Args:
            item: Work item; ``metadata`` must hold ``episode_info`` and
                ``series_name``.
            ramdisk_frames_dir: Directory containing the exported frames.
        """
        self.initialize()

        episode_info = item.metadata["episode_info"]
        frame_files = sorted(
            f for f in ramdisk_frames_dir.glob("*.jpg")
            if f.is_file() and "frame_" in f.name
        )
        if not frame_files:
            console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]")
            return

        console.print(f"[cyan]Detecting objects in {len(frame_files)} frames[/cyan]")
        all_results = self.batch_processor.process(frame_files, self.__detect_batch)

        detections_data = {
            "episode_code": episode_info.episode_code(),
            "model": self.model_name,
            "confidence_threshold": self.conf_threshold,
            "frames": all_results,
        }
        self.__print_summary(all_results, len(frame_files))
        self.__save_detections(episode_info, detections_data, item.metadata["series_name"])

    def __detect_batch(self, batch_paths: List[Path]) -> List[Dict[str, Any]]:
        """Run the model on one batch of frame files and format the results."""
        batch_images: List[Any] = []
        try:
            for frame_path in batch_paths:
                batch_images.append(Image.open(frame_path))
            target_sizes = [(img.height, img.width) for img in batch_images]

            inputs = self.image_processor(images=batch_images, return_tensors="pt")
            inputs = {k: v.to("cuda") for k, v in inputs.items()}

            with torch.no_grad():
                outputs = self.model(**inputs)

            results = self.image_processor.post_process_object_detection(
                outputs,
                target_sizes=target_sizes,
                threshold=self.conf_threshold,
            )
            return [
                self.__build_frame_result(frame_path.name, result)
                for frame_path, result in zip(batch_paths, results)
            ]
        finally:
            # Close the image files even when preprocessing or inference
            # raises; previously they were only closed on the success path.
            for img in batch_images:
                img.close()

    def __build_frame_result(self, frame_name: str, result: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one post-processed model result into a JSON-serializable dict."""
        id2label = self.model.config.id2label
        detections = []
        for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
            label = label_id.item()
            box_coords = [float(i) for i in box.tolist()]
            detections.append({
                "class_id": label,
                "class_name": id2label[label],
                "confidence": score.item(),
                "bbox": {
                    "x1": box_coords[0],
                    "y1": box_coords[1],
                    "x2": box_coords[2],
                    "y2": box_coords[3],
                },
            })
        return {
            "frame_name": frame_name,
            "detections": detections,
            "detection_count": len(detections),
        }

    @staticmethod
    def __print_summary(frames: List[Dict[str, Any]], frame_count: int) -> None:
        """Print detection totals and the five most frequent classes."""
        total_detections = sum(f['detection_count'] for f in frames)
        frames_with_detections = len([f for f in frames if f['detection_count'] > 0])

        console.print(f"[green]✓ Total detections: {total_detections}[/green]")
        console.print(f"[green]✓ Frames with detections: {frames_with_detections}/{frame_count}[/green]")

        class_counts: Dict[str, int] = {}
        for frame in frames:
            for det in frame["detections"]:
                class_name = det["class_name"]
                class_counts[class_name] = class_counts.get(class_name, 0) + 1

        if class_counts:
            top_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:5]
            console.print(f"[cyan]Top 5 classes: {', '.join(f'{cls}:{cnt}' for cls, cnt in top_classes)}[/cyan]")

    @staticmethod
    def __detections_output_path(episode_info, series_name: str) -> Path:
        """Build the detections JSON path shared by the expected-output check and the writer."""
        # NOTE(review): the directory is derived from episode_info.series_name
        # while the file name uses the metadata series_name -- kept as in the
        # original; confirm the two can never disagree.
        episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(
            episode_info,
            settings.output_subdirs.object_detections,
        )
        detections_filename = PathManager(series_name).build_filename(
            episode_info,
            extension="json",
            suffix="_object_detections",
        )
        return episode_dir / detections_filename

    def __save_detections(self, episode_info, detections_data: Dict[str, Any], series_name: str) -> None:
        """Wrap the per-frame detections in processing metadata and write them atomically."""
        detections_output = self.__detections_output_path(episode_info, series_name)
        detections_output.parent.mkdir(parents=True, exist_ok=True)

        frames = detections_data["frames"]
        output_data = create_processing_metadata(
            episode_info=episode_info,
            processing_params={
                "model": self.model_name,
                "confidence_threshold": self.conf_threshold,
            },
            statistics={
                "total_frames": len(frames),
                "total_detections": sum(f['detection_count'] for f in frames),
                "frames_with_detections": len([f for f in frames if f['detection_count'] > 0]),
            },
            results_key="detections",
            results_data=frames,
        )
        atomic_write_json(detections_output, output_data, indent=2, ensure_ascii=False)

        console.print(f"[green]✓ Saved object detections to: {detections_output}[/green]")

    @staticmethod
    def __cleanup_memory() -> None:
        """Run the garbage collector and empty the CUDA cache when CUDA is available."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None:
    """Draw the saved object detections onto the episode's frames.

    Reads the episode's ``*_object_detections.json``, renders every frame that
    has at least one detection into the object-visualizations directory and,
    when all writes succeeded, creates the ``.visualization_complete`` marker.

    Args:
        item: Work item; ``metadata["episode_info"]`` identifies the episode.
        ramdisk_frames_dir: Directory holding the source frame images.
    """
    import cv2  # pylint: disable=import-outside-toplevel

    episode_info = item.metadata["episode_info"]
    detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info, settings.output_subdirs.object_detections)
    # glob() yields entries in arbitrary order; sort so the same file is picked
    # on every run should the directory ever contain more than one match.
    detection_files = sorted(detection_dir.glob("*_object_detections.json"))
    detection_file = detection_files[0] if detection_files else None

    if detection_file is None or not detection_file.exists():
        console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]")
        return

    if not ramdisk_frames_dir.exists():
        console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]")
        return

    with open(detection_file, 'r', encoding='utf-8') as detections_fh:
        detection_data = json.load(detections_fh)

    # Tolerate records without an explicit count instead of raising KeyError.
    frames_with_detections = [
        frame for frame in detection_data.get("detections", [])
        if frame.get('detection_count', len(frame.get('detections', []))) > 0
    ]
    if not frames_with_detections:
        console.print(f"[yellow]No frames with detections for {episode_info.episode_code()}[/yellow]")
        return

    output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info, settings.output_subdirs.object_visualizations)
    output_dir.mkdir(parents=True, exist_ok=True)
    colors = self.__generate_colors()
    conf_threshold = detection_data.get("processing_params", {}).get("confidence_threshold", 0.25)

    console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames for {episode_info.episode_code()}[/cyan]")

    failed_writes = 0
    for frame_data in frames_with_detections:
        output_path = output_dir / frame_data['frame_name']
        if output_path.exists():
            continue

        frame_path = ramdisk_frames_dir / frame_data['frame_name']
        if not frame_path.exists():
            continue

        img = cv2.imread(str(frame_path))
        if img is None:
            continue

        self.__draw_detections_on_frame(img, frame_data.get('detections', []), colors, conf_threshold)
        # cv2.imwrite reports failure through its return value, not an exception.
        if not cv2.imwrite(str(output_path), img):
            failed_writes += 1
            console.print(f"[red]Failed to write visualization: {output_path}[/red]")

    if failed_writes:
        # Leave the marker absent so the step is retried instead of being
        # recorded as complete with frames missing on disk.
        console.print(f"[red]{failed_writes} visualization frames could not be written for {episode_info.episode_code()}[/red]")
        return

    marker_file = output_dir / ".visualization_complete"
    marker_file.write_text(f"completed: {len(frames_with_detections)} frames")
    console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]")
class VideoEmbeddingSubProcessor(FrameSubProcessor):
    """Compute frame embeddings for an episode and store them as one JSON file.

    The output lives in the episode directory for
    ``settings.output_subdirs.embeddings``.
    """

    def __init__(self, device: str, batch_size: int, model_name: str, model_revision: str):
        """Store the configuration; the model is loaded lazily by ``initialize``.

        Args:
            device: Device identifier handed to ``GPUBatchProcessor``.
            batch_size: Batch size used for embedding computation.
            model_name: Model identifier passed to ``Qwen3VLEmbedder``.
            model_revision: Recorded in the output's processing parameters.
        """
        super().__init__("Video Embeddings")
        self.device = device
        self.batch_size = batch_size
        self.model_name = model_name
        self.model_revision = model_revision
        self.model = None
        self.gpu_processor: Optional[GPUBatchProcessor] = None
        self.logger = ErrorHandlingLogger("VideoEmbeddingSubProcessor", logging.DEBUG, 15)

    def initialize(self) -> None:
        """Load the embedding model and build the GPU batch processor; no-op once loaded."""
        if self.model is not None:
            return

        from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder  # pylint: disable=import-outside-toplevel
        console.print(f"[cyan]Loading embedding model: {self.model_name}[/cyan]")
        # NOTE(review): model_revision is written to the output metadata but is
        # not passed to the embedder here -- confirm whether the loader should
        # pin that revision.
        self.model = Qwen3VLEmbedder(
            model_name_or_path=self.model_name,
            torch_dtype=torch.bfloat16,
        )
        self.gpu_processor = GPUBatchProcessor(
            self.model,
            self.batch_size,
            self.logger,
            self.device,
            progress_sub_batch_size=settings.embedding.progress_sub_batch_size,
        )
        console.print("[green]✓ Qwen3-VL-Embedding model loaded[/green]")

    def cleanup(self) -> None:
        """Drop the model and batch-processor references and release cached GPU memory."""
        self.model = None
        self.gpu_processor = None
        self.__cleanup_memory()

    def finalize(self) -> None:
        """Finalize the error logger if the instance has one."""
        if hasattr(self, 'logger'):
            self.logger.finalize()

    def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]:
        """Return the single required output: the episode's video-embeddings JSON."""
        video_output = self.__embeddings_output_path(
            item.metadata["episode_info"],
            item.metadata["series_name"],
        )
        return [OutputSpec(path=video_output, required=True)]

    def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool:
        """Return True when the embeddings JSON is among ``missing_outputs``."""
        # Whole-path equality; the previous substring test on stringified paths
        # also matched unrelated files whose path merely contained this one.
        missing_paths = {Path(spec.path) for spec in missing_outputs}
        return any(Path(spec.path) in missing_paths for spec in self.get_expected_outputs(item))

    def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None:
        """Embed the frames listed in the item's metadata file and save the result.

        Args:
            item: Work item; ``input_path`` is the frame-metadata JSON and
                ``metadata`` must hold ``episode_info`` and ``series_name``.
            ramdisk_frames_dir: Directory containing the exported frames.
        """
        self.initialize()

        metadata_file = item.input_path
        episode_info = item.metadata["episode_info"]
        # Read once and fail fast. Previously the value was fetched with an
        # "unknown" fallback for hash loading and then again with a hard lookup
        # after all GPU work, so a missing key only surfaced at save time.
        series_name = item.metadata["series_name"]

        with open(metadata_file, "r", encoding="utf-8") as f:
            metadata = json.load(f)

        frame_requests = metadata.get("frames", [])
        if not frame_requests:
            console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]")
            return

        episode_dir = self.__embeddings_dir(episode_info)
        # The checkpoint is written during computation, before the save step
        # creates the directory, so make sure it exists up front.
        episode_dir.mkdir(parents=True, exist_ok=True)
        checkpoint_file = episode_dir / "embeddings_video_checkpoint.json"

        image_hashes = load_image_hashes_for_episode(
            {"season": episode_info.season, "episode_number": episode_info.relative_episode},
            series_name,
            self.logger,
        )
        video_embeddings = compute_embeddings_in_batches(
            ramdisk_frames_dir,
            frame_requests,
            self.gpu_processor,
            self.batch_size,
            image_hashes,
            checkpoint_file=checkpoint_file,
            checkpoint_interval=20,
            prefetch_count=settings.embedding.prefetch_chunks,
        )
        self.__save_embeddings(episode_info, video_embeddings, series_name)

    @staticmethod
    def __embeddings_dir(episode_info) -> Path:
        """Return the episode's embeddings directory."""
        return PathManager(episode_info.series_name or "unknown").get_episode_dir(
            episode_info,
            settings.output_subdirs.embeddings,
        )

    @classmethod
    def __embeddings_output_path(cls, episode_info, series_name: str) -> Path:
        """Build the embeddings JSON path shared by the expected-output check and the writer."""
        video_filename = PathManager(series_name).build_filename(
            episode_info,
            extension="json",
            suffix="embeddings_video",
        )
        return cls.__embeddings_dir(episode_info) / video_filename

    def __save_embeddings(self, episode_info, video_embeddings: List[Dict[str, Any]], series_name: str) -> None:
        """Wrap the embeddings in processing metadata and write them atomically."""
        video_output = self.__embeddings_output_path(episode_info, series_name)
        video_output.parent.mkdir(parents=True, exist_ok=True)

        video_data = create_processing_metadata(
            episode_info=episode_info,
            processing_params={
                "model_name": self.model_name,
                "model_revision": self.model_revision,
                "batch_size": self.batch_size,
                "device": self.device,
            },
            statistics={
                "total_embeddings": len(video_embeddings),
                "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0,
                "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e),
            },
            results_key="video_embeddings",
            results_data=video_embeddings,
        )
        atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False)

        console.print(f"[green]✓ Saved embeddings to: {video_output}[/green]")

    @staticmethod
    def __cleanup_memory() -> None:
        """Run the garbage collector and empty the CUDA cache when CUDA is available."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
Mon, 9 Feb 2026 22:51:17 +0100 Subject: [PATCH 05/89] . --- bot/types.py | 6 +- preprocessor/characters/face_detection.py | 11 +- .../characters/reference_downloader.py | 6 +- preprocessor/cli/commands/detect_scenes.py | 2 +- preprocessor/cli/commands/fix_unicode.py | 2 +- .../cli/commands/generate_embeddings.py | 2 +- preprocessor/cli/commands/run_all.py | 2 +- preprocessor/cli/commands/search.py | 644 +------------- preprocessor/cli/commands/separate_sounds.py | 2 +- preprocessor/cli/commands/transcode.py | 2 +- preprocessor/cli/commands/transcribe.py | 2 +- preprocessor/cli/pipeline/orchestrator.py | 2 +- preprocessor/cli/pipeline/steps.py | 4 +- preprocessor/core/base_processor.py | 5 +- preprocessor/core/path_manager.py | 2 +- preprocessor/core/video_processor.py | 47 + .../embeddings/embedding_generator.py | 823 ------------------ .../embeddings/episode_name_embedder.py | 4 +- .../embeddings/gpu_batch_processor.py | 55 +- preprocessor/episodes/__init__.py | 8 + .../{core => episodes}/episode_file_finder.py | 0 .../{core => episodes}/episode_manager.py | 4 +- .../{core => episodes}/episode_parser.py | 0 preprocessor/processors/archive_generator.py | 2 +- preprocessor/processors/character_detector.py | 7 +- .../processors/elastic_document_generator.py | 25 +- .../processors/elasticsearch_indexer.py | 2 +- .../processors/embedding_generator.py | 22 +- preprocessor/processors/frame_exporter.py | 45 +- .../processors/image_hash_processor.py | 69 +- preprocessor/processors/scene_detector.py | 11 +- preprocessor/processors/text_analyzer.py | 6 +- .../processors/transcription_generator.py | 12 +- .../processors/transcription_importer.py | 5 +- preprocessor/processors/video_transcoder.py | 22 +- preprocessor/scraping/base_scraper.py | 2 +- .../scripts_temp/import_transcriptions.py | 94 -- preprocessor/search/__init__.py | 11 + preprocessor/search/elasticsearch_queries.py | 467 ++++++++++ preprocessor/search/embedding_service.py | 106 +++ 
preprocessor/search/hash_service.py | 41 + preprocessor/search/result_formatters.py | 85 ++ preprocessor/transcription/elevenlabs.py | 4 +- .../generators/json_generator.py | 9 +- .../generators/multi_format_generator.py | 14 +- .../processors/episode_info_processor.py | 4 +- .../processors/sound_separator.py | 2 +- .../transcription/processors/unicode_fixer.py | 2 +- preprocessor/types/__init__.py | 36 +- preprocessor/types/clip.py | 1 + preprocessor/types/detection.py | 1 + preprocessor/types/episode.py | 1 + preprocessor/types/frame.py | 1 + preprocessor/types/scene.py | 1 + preprocessor/types/search.py | 2 + preprocessor/types/transcription.py | 2 + preprocessor/types/video.py | 2 +- preprocessor/utils/batch_processing_utils.py | 4 +- preprocessor/utils/hash_save_utils.py | 50 ++ preprocessor/utils/image_hash_utils.py | 2 +- preprocessor/validation/episode_stats.py | 2 +- preprocessor/validation/validator.py | 2 +- preprocessor/video/frame_processor.py | 75 +- preprocessor/video/subprocessors/__init__.py | 10 +- .../character_detection_subprocessor.py | 6 +- ...er_detection_visualization_subprocessor.py | 35 +- .../emotion_detection_subprocessor.py | 5 +- .../face_clustering_subprocessor.py | 14 +- .../subprocessors/image_hash_subprocessor.py | 60 +- .../object_detection_subprocessor.py | 21 +- ...ct_detection_visualization_subprocessor.py | 33 +- .../video_embedding_subprocessor.py | 19 +- 72 files changed, 1185 insertions(+), 1904 deletions(-) create mode 100644 preprocessor/core/video_processor.py delete mode 100644 preprocessor/embeddings/embedding_generator.py create mode 100644 preprocessor/episodes/__init__.py rename preprocessor/{core => episodes}/episode_file_finder.py (100%) rename preprocessor/{core => episodes}/episode_manager.py (97%) rename preprocessor/{core => episodes}/episode_parser.py (100%) delete mode 100644 preprocessor/scripts_temp/import_transcriptions.py create mode 100644 preprocessor/search/__init__.py create mode 100644 
preprocessor/search/elasticsearch_queries.py create mode 100644 preprocessor/search/embedding_service.py create mode 100644 preprocessor/search/hash_service.py create mode 100644 preprocessor/search/result_formatters.py create mode 100644 preprocessor/utils/hash_save_utils.py diff --git a/bot/types.py b/bot/types.py index e1e866ca7..56d463b6e 100644 --- a/bot/types.py +++ b/bot/types.py @@ -18,10 +18,11 @@ SceneTimestamp, SceneTimestampPoint, SceneTimestampsData, + SearchSegment, SeasonBucket, SeasonInfo, SeasonInfoDict, - SearchSegment, + SegmentWithScore, TranscriptionContext, VideoMetadata, ) @@ -50,6 +51,7 @@ "SeasonInfo", "SeasonInfoDict", "SearchSegment", + "SegmentWithScore", "TranscriptionContext", "VideoMetadata", -] \ No newline at end of file +] diff --git a/preprocessor/characters/face_detection.py b/preprocessor/characters/face_detection.py index e5b1bde65..fff55a70a 100644 --- a/preprocessor/characters/face_detection.py +++ b/preprocessor/characters/face_detection.py @@ -1,12 +1,17 @@ import os -import warnings from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import ( + Any, + Dict, + List, + Optional, +) +import warnings import cv2 +from insightface.app import FaceAnalysis import numpy as np from numpy.linalg import norm -from insightface.app import FaceAnalysis import onnxruntime as ort from preprocessor.config.config import settings diff --git a/preprocessor/characters/reference_downloader.py b/preprocessor/characters/reference_downloader.py index 7a3b27cff..af392e165 100644 --- a/preprocessor/characters/reference_downloader.py +++ b/preprocessor/characters/reference_downloader.py @@ -21,10 +21,10 @@ sync_playwright, ) -from preprocessor.characters.face_detection import init_face_detection -from preprocessor.characters.image_search import BaseImageSearch from preprocessor.characters.duckduckgo_image_search import DuckDuckGoImageSearch +from preprocessor.characters.face_detection import init_face_detection from 
preprocessor.characters.google_image_search import GoogleImageSearch +from preprocessor.characters.image_search import BaseImageSearch from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor from preprocessor.utils.console import ( @@ -71,8 +71,6 @@ def _validate_args(self, args: Dict[str, Any]) -> None: def get_output_subdir(self) -> str: return "character_references" - if "series_name" not in args: - raise ValueError("series_name is required") def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool: for char in characters: diff --git a/preprocessor/cli/commands/detect_scenes.py b/preprocessor/cli/commands/detect_scenes.py index 924ed53da..632efb044 100644 --- a/preprocessor/cli/commands/detect_scenes.py +++ b/preprocessor/cli/commands/detect_scenes.py @@ -3,9 +3,9 @@ import click -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.processors.scene_detector import SceneDetector +from preprocessor.utils.resource_scope import ResourceScope @click.command(name="detect-scenes", context_settings={"show_default": True}) diff --git a/preprocessor/cli/commands/fix_unicode.py b/preprocessor/cli/commands/fix_unicode.py index 92a684895..b2d116c17 100644 --- a/preprocessor/cli/commands/fix_unicode.py +++ b/preprocessor/cli/commands/fix_unicode.py @@ -3,9 +3,9 @@ import click -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer +from preprocessor.utils.resource_scope import ResourceScope @click.command(context_settings={"show_default": True}) diff --git a/preprocessor/cli/commands/generate_embeddings.py b/preprocessor/cli/commands/generate_embeddings.py index 23007b8d8..1bbb44ee4 100644 --- a/preprocessor/cli/commands/generate_embeddings.py +++ 
b/preprocessor/cli/commands/generate_embeddings.py @@ -3,9 +3,9 @@ import click -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.processors.embedding_generator import EmbeddingGenerator +from preprocessor.utils.resource_scope import ResourceScope @click.command(name="generate-embeddings", context_settings={"show_default": True}) diff --git a/preprocessor/cli/commands/run_all.py b/preprocessor/cli/commands/run_all.py index 3ed1493f4..8459b198d 100644 --- a/preprocessor/cli/commands/run_all.py +++ b/preprocessor/cli/commands/run_all.py @@ -4,6 +4,7 @@ import click +from preprocessor.cli.helpers import create_state_manager from preprocessor.cli.pipeline.orchestrator import PipelineOrchestrator from preprocessor.cli.pipeline.steps import ( run_archive_generation_step, @@ -23,7 +24,6 @@ run_transcribe_step, run_validation_step, ) -from preprocessor.cli.helpers import create_state_manager from preprocessor.config.config import ( get_base_output_dir, settings, diff --git a/preprocessor/cli/commands/search.py b/preprocessor/cli/commands/search.py index 80d3eaec8..383ae5e3c 100644 --- a/preprocessor/cli/commands/search.py +++ b/preprocessor/cli/commands/search.py @@ -1,593 +1,19 @@ -# pylint: disable=duplicate-code,too-many-arguments,too-many-statements +# pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements import asyncio import json from pathlib import Path import sys -from PIL import Image import click from elasticsearch import AsyncElasticsearch -from qwen_vl_utils import process_vision_info -import torch -from transformers import ( - AutoModelForVision2Seq, - AutoProcessor, -) -from preprocessor.config.config import settings -from preprocessor.utils.image_hasher import PerceptualHasher -from preprocessor.utils.constants import ( - ElasticsearchAggregationKeys, - ElasticsearchKeys, - EpisodeMetadataKeys, +from preprocessor.search import ( + 
ElasticsearchQueries, + EmbeddingService, + HashService, + ResultFormatter, ) -_model = None -_processor = None -_device = None -_hasher = None - - -def load_model(): - global _model, _processor, _device # pylint: disable=global-statement - if _model is not None: - return _model, _processor, _device - - click.echo("Loading embedding model...", err=True) - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is required but not available. This pipeline requires GPU.") - - model_name = settings.embedding_model.model_name - _device = "cuda" - - _model = AutoModelForVision2Seq.from_pretrained( - model_name, - dtype=torch.bfloat16, - device_map="auto", - ) - _processor = AutoProcessor.from_pretrained(model_name) - - click.echo(f"Model loaded on {_device}", err=True) - return _model, _processor, _device - - -def get_text_embedding(text): - model, processor, device = load_model() - - messages = [{ - "role": "user", - "content": [{"type": "text", "text": text}], - }] - - text_inputs = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - ).to(device) - - with torch.no_grad(): - output = model(input_ids=text_inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - - return embedding.float().cpu().numpy().tolist() - - -def _get_image_embedding(image_path): - model, processor, device = load_model() - - messages = [{ - "role": "user", - "content": [ - {"type": "image", "image": image_path}, - {"type": "text", "text": "Describe this image."}, - ], - }] - - text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - image_inputs, video_inputs = process_vision_info(messages) - - inputs = processor( - text=[text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ) - inputs = inputs.to(device) - - with torch.no_grad(): - output = 
model(**inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - - return embedding.float().cpu().numpy().tolist() - - -def _load_hasher(): - global _hasher # pylint: disable=global-statement - if _hasher is not None: - return _hasher - - click.echo("Loading perceptual hasher...", err=True) - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is required but not available. This pipeline requires GPU.") - - _hasher = PerceptualHasher(device="cuda", hash_size=8) - click.echo("Hasher loaded on cuda", err=True) - return _hasher - - -def _get_perceptual_hash(image_path): - hasher = _load_hasher() - image = Image.open(image_path).convert("RGB") - hashes = hasher.compute_phash_batch([image]) - return hashes[0] if hashes else None - - -async def search_text_query(es_client, query, season=None, episode=None, limit=20): - must_clauses = [{ - "multi_match": { - "query": query, - "fields": ["text^2", "episode_metadata.title"], - "fuzziness": "AUTO", - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - must_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - query_body = {"bool": {"must": must_clauses}} - - return await es_client.search( - index="ranczo_segments", - query=query_body, - size=limit, - _source=["episode_id", "segment_id", "text", "start_time", "end_time", "speaker", "video_path", "episode_metadata", "scene_info"], - ) - - -async def search_text_semantic(es_client, text, season=None, episode=None, limit=10): - embedding = get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - knn_query = { - "field": "text_embedding", - "query_vector": embedding, - 
"k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_text_embeddings", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "embedding_id", "text", "segment_range", - "video_path", "episode_metadata", "scene_info", - ], - ) - - -async def search_video_semantic(es_client, image_path, season=None, episode=None, character=None, limit=10): - embedding = _get_image_embedding(image_path) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - if character: - filter_clauses.append({ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }) - - knn_query = { - "field": "video_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_video_frames", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", - "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", - ], - ) - - -async def search_text_to_video(es_client, text, season=None, episode=None, character=None, limit=10): - embedding = get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - if character: - filter_clauses.append({ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }) - - knn_query = { - "field": "video_embedding", - "query_vector": embedding, - "k": limit, - 
"num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_video_frames", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", - "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", - ], - ) - - -async def search_by_character(es_client, character, season=None, episode=None, limit=20): - must_clauses = [{ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - must_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - return await es_client.search( - index="ranczo_video_frames", - query={"bool": {"must": must_clauses}}, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "video_path", "episode_metadata", "character_appearances", "scene_info"], - ) - - -async def search_by_emotion(es_client, emotion, season=None, episode=None, character=None, limit=20): - nested_must = [{"term": {"character_appearances.emotion.label": emotion}}] - if character: - nested_must.append({"term": {"character_appearances.name": character}}) - - must_clauses = [{ - "nested": { - "path": "character_appearances", - "query": {"bool": {"must": nested_must}}, - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - must_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - nested_filter = {"term": {"character_appearances.emotion.label": emotion}} - if character: - nested_filter = { - "bool": { - "must": [ - {"term": {"character_appearances.emotion.label": emotion}}, - {"term": {"character_appearances.name": character}}, - ], - }, - } - - return await es_client.search( - 
index="ranczo_video_frames", - query={"bool": {"must": must_clauses}}, - sort=[ - { - "character_appearances.emotion.confidence": { - "order": "desc", - "nested": { - "path": "character_appearances", - "filter": nested_filter, - }, - }, - }, - ], - track_scores=True, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "video_path", "episode_metadata", "character_appearances", "scene_info"], - ) - - -async def search_by_object(es_client, object_query, season=None, episode=None, limit=20): - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - if episode is not None: - filter_clauses.append({"term": {"episode_metadata.episode_number": episode}}) - - must_clauses = [] - - if ":" in object_query: - object_class, count_filter = object_query.split(":", 1) - object_class = object_class.strip() - - if count_filter.endswith("+"): - min_count = int(count_filter[:-1]) - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"range": {"detected_objects.count": {"gte": min_count}}}, - ], - }, - }, - }, - }) - elif "-" in count_filter: - min_c, max_c = count_filter.split("-") - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"range": {"detected_objects.count": {"gte": int(min_c), "lte": int(max_c)}}}, - ], - }, - }, - }, - }) - else: - exact_count = int(count_filter) - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"term": {"detected_objects.count": exact_count}}, - ], - }, - }, - }, - }) - else: - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "term": {"detected_objects.class": object_query.strip()}, - }, - }, - }) - - query_body = { - 
"bool": { - "must": must_clauses, - "filter": filter_clauses, - }, - } - - object_class = object_query.split(":")[0].strip() if ":" in object_query else object_query.strip() - - return await es_client.search( - index="ranczo_video_frames", - query=query_body, - sort=[ - { - "detected_objects.count": { - "order": "desc", - "nested": { - "path": "detected_objects", - "filter": {"term": {"detected_objects.class": object_class}}, - }, - }, - }, - ], - track_scores=True, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "detected_objects", "character_appearances", "video_path", "episode_metadata", "scene_info"], - ) - - -async def search_perceptual_hash(es_client, phash, limit=10): - return await es_client.search( - index="ranczo_video_frames", - query={"term": {"perceptual_hash": phash}}, - size=limit, - _source=["episode_id", "frame_number", "timestamp", "video_path", "episode_metadata", "perceptual_hash", "scene_info"], - ) - - -async def list_characters(es_client): - result = await es_client.search( - index="ranczo_video_frames", - size=0, - aggs={ - "characters_nested": { - "nested": {"path": "character_appearances"}, - "aggs": { - "character_names": { - "terms": {"field": "character_appearances.name", "size": 1000}, - }, - }, - }, - }, - ) - buckets = result["aggregations"]["characters_nested"]["character_names"]["buckets"] - return [(b["key"], b["doc_count"]) for b in buckets] - - -async def list_objects(es_client): - result = await es_client.search( - index="ranczo_video_frames", - size=0, - aggs={ - "objects_nested": { - "nested": {"path": "detected_objects"}, - "aggs": { - "object_classes": { - "terms": {"field": "detected_objects.class", "size": 1000}, - }, - }, - }, - }, - ) - buckets = result["aggregations"]["objects_nested"]["object_classes"]["buckets"] - return [(b["key"], b["doc_count"]) for b in buckets] - - -async def search_episode_name(es_client, query, season=None, limit=20): - must_clauses = [{ - "multi_match": { - "query": query, 
- "fields": ["title^2", "episode_metadata.title"], - "fuzziness": "AUTO", - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - - query_body = {"bool": {"must": must_clauses}} - - return await es_client.search( - index="ranczo_episode_names", - query=query_body, - size=limit, - _source=["episode_id", "title", "video_path", "episode_metadata"], - ) - - -async def search_episode_name_semantic(es_client, text, season=None, limit=10): - embedding = get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - - knn_query = { - "field": "title_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_episode_names", - knn=knn_query, - size=limit, - _source=["episode_id", "title", "video_path", "episode_metadata"], - ) - - -async def get_stats(es_client): - return { - "segments": (await es_client.count(index="ranczo_segments"))["count"], - "text_embeddings": (await es_client.count(index="ranczo_text_embeddings"))["count"], - "video_embeddings": (await es_client.count(index="ranczo_video_frames"))["count"], - "episode_names": (await es_client.count(index="ranczo_episode_names"))["count"], - } - - -def format_timestamp(seconds): - minutes = int(seconds // 60) - secs = seconds % 60 - return f"{minutes}m {secs:.1f}s" - - -def _format_scene_context(scene_info): - if not scene_info: - return "" - start = format_timestamp(scene_info.get('scene_start_time', 0)) - end = format_timestamp(scene_info.get('scene_end_time', 0)) - return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" - - -def _print_results(result, result_type="text"): # pylint: disable=too-many-locals - total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] - hits = 
result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] - - click.echo(f"\nZnaleziono: {total} wynikow") - click.echo("=" * 80) - - for i, hit in enumerate(hits, 1): - source = hit[ElasticsearchKeys.SOURCE] - score = hit[ElasticsearchKeys.SCORE] - meta = source[EpisodeMetadataKeys.EPISODE_METADATA] - scene_ctx = _format_scene_context(source.get("scene_info")) - - click.echo(f"\n[{i}] Score: {score:.2f}") - season_code = "S00" if meta['season'] == 0 else f"S{meta['season']:02d}" - click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") - - if result_type == "text": - click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") - start_time = format_timestamp(source['start_time']) - end_time = format_timestamp(source['end_time']) - click.echo(f"Time: {start_time} - {end_time}{scene_ctx}") - click.echo(f"Speaker: {source.get('speaker', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == "text_semantic": - click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") - click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == "episode_name": - click.echo(f"Episode Title: {source.get('title', 'N/A')}") - else: - timestamp = format_timestamp(source['timestamp']) - click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}") - if "frame_type" in source: - click.echo(f"Type: {source['frame_type']}") - if "scene_number" in source: - click.echo(f"Scene number: {source['scene_number']}") - if "perceptual_hash" in source: - click.echo(f"Hash: {source['perceptual_hash']}") - if source.get("character_appearances"): - chars_strs = [] - for char in source['character_appearances']: - char_str = char.get('name', 'Unknown') - if char.get('emotion'): - emotion_label = char['emotion'].get('label', '?') - emotion_conf = char['emotion'].get('confidence', 0) - char_str += f" ({emotion_label} {emotion_conf:.2f})" - 
chars_strs.append(char_str) - click.echo(f"Characters: {', '.join(chars_strs)}") - if source.get("detected_objects"): - objects_str = ", ".join([f"{obj['class']}:{obj['count']}" for obj in source['detected_objects']]) - click.echo(f"Objects: {objects_str}") - - click.echo(f"Path: {source['video_path']}") - @click.command(context_settings={"show_default": True}) @click.option("--text", type=str, help="Full-text search po transkrypcjach") @@ -608,7 +34,7 @@ def _print_results(result, result_type="text"): # pylint: disable=too-many-loca @click.option("--stats", is_flag=True, help="Pokaz statystyki indeksow") @click.option("--json-output", is_flag=True, help="Output w formacie JSON") @click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") -def search( # pylint: disable=too-many-locals +def search( text, text_semantic, text_to_video, image, phash, character, emotion, object_query, episode_name, episode_name_semantic, list_chars_flag, list_objects_flag, season, episode, limit, stats, json_output, host, @@ -622,12 +48,16 @@ def search( # pylint: disable=too-many-locals click.echo("Podaj przynajmniej jedna opcje wyszukiwania. 
Uzyj --help", err=True) sys.exit(1) + embedding_service = EmbeddingService() + hash_service = HashService() + queries = ElasticsearchQueries(embedding_service) + hash_value = None if phash: phash_path = Path(phash) if phash_path.exists() and phash_path.is_file(): click.echo(f"Computing perceptual hash from image: {phash}", err=True) - hash_value = _get_perceptual_hash(str(phash_path)) + hash_value = hash_service.get_perceptual_hash(str(phash_path)) if hash_value: click.echo(f"Computed hash: {hash_value}", err=True) else: @@ -636,7 +66,7 @@ def search( # pylint: disable=too-many-locals else: hash_value = phash - async def run(): # pylint: disable=too-many-branches + async def __run(): es_client = AsyncElasticsearch(hosts=[host], verify_certs=False) try: @@ -649,7 +79,7 @@ async def run(): # pylint: disable=too-many-branches try: if stats: - result = await get_stats(es_client) + result = await queries.get_stats(es_client) if json_output: click.echo(json.dumps(result, indent=2)) else: @@ -660,7 +90,7 @@ async def run(): # pylint: disable=too-many-branches click.echo(f" Episode Names: {result['episode_names']:,}") elif list_chars_flag: - chars = await list_characters(es_client) + chars = await queries.list_characters(es_client) if json_output: click.echo(json.dumps(chars, indent=2)) else: @@ -669,7 +99,7 @@ async def run(): # pylint: disable=too-many-branches click.echo(f" {char}: {count:,} wystapien") elif list_objects_flag: - objects = await list_objects(es_client) + objects = await queries.list_objects(es_client) if json_output: click.echo(json.dumps(objects, indent=2)) else: @@ -678,76 +108,76 @@ async def run(): # pylint: disable=too-many-branches click.echo(f" {obj}: {count:,} wystapien") elif text: - result = await search_text_query(es_client, text, season, episode, limit) + result = await queries.search_text_query(es_client, text, season, episode, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "text") + 
ResultFormatter.print_results(result, "text") elif text_semantic: - result = await search_text_semantic(es_client, text_semantic, season, episode, limit) + result = await queries.search_text_semantic(es_client, text_semantic, season, episode, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "text_semantic") + ResultFormatter.print_results(result, "text_semantic") elif text_to_video: - result = await search_text_to_video(es_client, text_to_video, season, episode, character, limit) + result = await queries.search_text_to_video(es_client, text_to_video, season, episode, character, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "video") + ResultFormatter.print_results(result, "video") elif image: - result = await search_video_semantic(es_client, str(image), season, episode, character, limit) + result = await queries.search_video_semantic(es_client, str(image), season, episode, character, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "video") + ResultFormatter.print_results(result, "video") elif emotion: - result = await search_by_emotion(es_client, emotion, season, episode, character, limit) + result = await queries.search_by_emotion(es_client, emotion, season, episode, character, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "video") + ResultFormatter.print_results(result, "video") elif character: - result = await search_by_character(es_client, character, season, episode, limit) + result = await queries.search_by_character(es_client, character, season, episode, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "video") + ResultFormatter.print_results(result, "video") elif object_query: - result = await search_by_object(es_client, object_query, season, episode, limit) + result = await 
queries.search_by_object(es_client, object_query, season, episode, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "video") + ResultFormatter.print_results(result, "video") elif hash_value: - result = await search_perceptual_hash(es_client, hash_value, limit) + result = await queries.search_perceptual_hash(es_client, hash_value, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "video") + ResultFormatter.print_results(result, "video") elif episode_name: - result = await search_episode_name(es_client, episode_name, season, limit) + result = await queries.search_episode_name(es_client, episode_name, season, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "episode_name") + ResultFormatter.print_results(result, "episode_name") elif episode_name_semantic: - result = await search_episode_name_semantic(es_client, episode_name_semantic, season, limit) + result = await queries.search_episode_name_semantic(es_client, episode_name_semantic, season, limit) if json_output: click.echo(json.dumps(result["hits"], indent=2)) else: - _print_results(result, "episode_name") + ResultFormatter.print_results(result, "episode_name") finally: await es_client.close() - asyncio.run(run()) + asyncio.run(__run()) diff --git a/preprocessor/cli/commands/separate_sounds.py b/preprocessor/cli/commands/separate_sounds.py index 709521e5a..0e91bebb4 100644 --- a/preprocessor/cli/commands/separate_sounds.py +++ b/preprocessor/cli/commands/separate_sounds.py @@ -3,9 +3,9 @@ import click -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import settings from preprocessor.transcription.processors.sound_separator import SoundEventSeparator +from preprocessor.utils.resource_scope import ResourceScope @click.command(context_settings={"show_default": True}) diff --git 
a/preprocessor/cli/commands/transcode.py b/preprocessor/cli/commands/transcode.py index a5bcae9de..ceec83c8d 100644 --- a/preprocessor/cli/commands/transcode.py +++ b/preprocessor/cli/commands/transcode.py @@ -4,13 +4,13 @@ import click from preprocessor.cli.helpers import create_state_manager -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import ( TranscodeConfig, settings, ) from preprocessor.processors.video_transcoder import VideoTranscoder from preprocessor.utils.resolution import Resolution +from preprocessor.utils.resource_scope import ResourceScope @click.command(context_settings={"show_default": True}) diff --git a/preprocessor/cli/commands/transcribe.py b/preprocessor/cli/commands/transcribe.py index 37f51d36b..fd6badc4a 100644 --- a/preprocessor/cli/commands/transcribe.py +++ b/preprocessor/cli/commands/transcribe.py @@ -4,12 +4,12 @@ import click -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import ( TranscriptionConfig, settings, ) from preprocessor.processors.transcription_generator import TranscriptionGenerator +from preprocessor.utils.resource_scope import ResourceScope # pylint: disable=duplicate-code diff --git a/preprocessor/cli/pipeline/orchestrator.py b/preprocessor/cli/pipeline/orchestrator.py index 5751ff702..829252d94 100644 --- a/preprocessor/cli/pipeline/orchestrator.py +++ b/preprocessor/cli/pipeline/orchestrator.py @@ -9,7 +9,6 @@ Optional, ) -from preprocessor.utils.resource_scope import ResourceScope from preprocessor.config.config import ( get_output_path, settings, @@ -17,6 +16,7 @@ from preprocessor.core.processing_metadata import ProcessingMetadata from preprocessor.core.state_manager import StateManager from preprocessor.utils.console import console +from preprocessor.utils.resource_scope import ResourceScope ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs diff --git a/preprocessor/cli/pipeline/steps.py 
b/preprocessor/cli/pipeline/steps.py index 0d5cb81ea..9c5757dd9 100644 --- a/preprocessor/cli/pipeline/steps.py +++ b/preprocessor/cli/pipeline/steps.py @@ -9,8 +9,6 @@ from preprocessor.scraping.episode_scraper import EpisodeScraper from preprocessor.utils.console import console from preprocessor.video.frame_processor import FrameProcessor -from preprocessor.video.subprocessors.emotion_detection_subprocessor import EmotionDetectionSubProcessor -from preprocessor.video.subprocessors.face_clustering_subprocessor import FaceClusteringSubProcessor from preprocessor.video.subprocessors import ( CharacterDetectionSubProcessor, CharacterDetectionVisualizationSubProcessor, @@ -19,6 +17,8 @@ ObjectDetectionVisualizationSubProcessor, VideoEmbeddingSubProcessor, ) +from preprocessor.video.subprocessors.emotion_detection_subprocessor import EmotionDetectionSubProcessor +from preprocessor.video.subprocessors.face_clustering_subprocessor import FaceClusteringSubProcessor # pylint: disable=duplicate-code diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index 1787998c5..fc700d385 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -67,7 +67,6 @@ def __init__( self.state_manager: Optional[StateManager] = args.get("state_manager") self.series_name: str = args.get("series_name", "unknown") - from preprocessor.core.path_manager import PathManager # pylint: disable=import-outside-toplevel self.path_manager: PathManager = args.get( "path_manager", PathManager(self.series_name), @@ -298,7 +297,7 @@ def _create_video_processing_items( skip_unparseable: bool = True, subdirectory_filter: Optional[str] = None, ) -> List[ProcessingItem]: - from preprocessor.core.episode_manager import EpisodeManager # pylint: disable=import-outside-toplevel + from preprocessor.episodes import EpisodeManager # pylint: disable=import-outside-toplevel series_name = self.series_name @@ -352,7 +351,7 @@ def 
_create_video_processing_items( return items def _create_transcription_processing_item(self, transcription_file: Path) -> ProcessingItem: - from preprocessor.core.episode_manager import EpisodeManager # pylint: disable=import-outside-toplevel + from preprocessor.episodes import EpisodeManager # pylint: disable=import-outside-toplevel base_name = transcription_file.stem.replace(FILE_SUFFIXES["segmented"], "").replace(FILE_SUFFIXES["simple"], "") diff --git a/preprocessor/core/path_manager.py b/preprocessor/core/path_manager.py index 141b5cd0d..510a44189 100644 --- a/preprocessor/core/path_manager.py +++ b/preprocessor/core/path_manager.py @@ -85,4 +85,4 @@ def add_suffix_to_filename(filename: str, suffix: str) -> str: @staticmethod def get_suffix(suffix_key: str) -> str: - return FILE_SUFFIXES.get(suffix_key, "") \ No newline at end of file + return FILE_SUFFIXES.get(suffix_key, "") diff --git a/preprocessor/core/video_processor.py b/preprocessor/core/video_processor.py new file mode 100644 index 000000000..5d66334e3 --- /dev/null +++ b/preprocessor/core/video_processor.py @@ -0,0 +1,47 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.core.base_processor import ( + BaseProcessor, + ProcessingItem, +) +from preprocessor.episodes import EpisodeManager + + +class VideoProcessor(BaseProcessor): + def __init__( + self, + args: Dict[str, Any], + class_name: str, + error_exit_code: int, + loglevel: int, + ): + super().__init__( + args=args, + class_name=class_name, + error_exit_code=error_exit_code, + loglevel=loglevel, + ) + + self.input_videos: Path = Path(self._args["videos"]) + self.subdirectory_filter: Optional[str] = None + episodes_json_path = self._args.get("episodes_info_json") + self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) + + def _get_processing_items(self) -> List[ProcessingItem]: + return self._create_video_processing_items( + source_path=self.input_videos, + 
extensions=self.get_video_glob_patterns(), + episode_manager=self.episode_manager, + skip_unparseable=True, + subdirectory_filter=self.subdirectory_filter, + ) + + def _validate_videos_required(self, args: Dict[str, Any]) -> None: + if "videos" not in args: + raise ValueError("videos path is required") diff --git a/preprocessor/embeddings/embedding_generator.py b/preprocessor/embeddings/embedding_generator.py deleted file mode 100644 index 0c985e281..000000000 --- a/preprocessor/embeddings/embedding_generator.py +++ /dev/null @@ -1,823 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, -) - -import numpy as np -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import FILE_SUFFIXES -from preprocessor.core.episode_manager import EpisodeManager -from preprocessor.core.processor_registry import register_processor -from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder -from preprocessor.utils.batch_processing_utils import compute_embeddings_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.constants import EpisodeMetadataKeys -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode -from preprocessor.utils.metadata_utils import create_processing_metadata - -# pylint: disable=duplicate-code - - -@register_processor("generate_embeddings") -class EmbeddingGenerator(BaseProcessor): # pylint: disable=too-many-instance-attributes - REQUIRES = ["transcriptions", "frames"] - PRODUCES = ["embeddings"] - PRIORITY = 50 - DESCRIPTION = "Generate multimodal 
embeddings" - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=9, - loglevel=logging.DEBUG, - ) - - self.transcription_jsons: Path = self._args["transcription_jsons"] - self.frames_dir: Path = self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)) - self.output_dir: Path = self._args.get("output_dir", settings.embedding.get_output_dir(self.series_name)) - - self.model_name: str = self._args.get("model", settings.embedding_model.model_name) - self.model_revision: str = self._args.get("model_revision", settings.embedding_model.model_revision) - self.batch_size: int = self._args.get("batch_size", settings.embedding.batch_size) - self.device: str = "cuda" - - self.segments_per_embedding: int = self._args.get("segments_per_embedding", settings.text_chunking.segments_per_embedding) - self.text_sentences_per_chunk: int = self._args.get("text_sentences_per_chunk", settings.text_chunking.text_sentences_per_chunk) - self.text_chunk_overlap: int = self._args.get("text_chunk_overlap", settings.text_chunking.text_chunk_overlap) - self.generate_text: bool = self._args.get("generate_text", True) - self.generate_video: bool = self._args.get("generate_video", True) - self.generate_episode_names: bool = self._args.get("generate_episode_names", True) - self.generate_full_episode: bool = self._args.get("generate_full_episode", settings.embedding.generate_full_episode_embedding) - self.generate_sound_events: bool = self._args.get("generate_sound_events", True) - - self.image_hashes_dir: Path = Path( - self._args.get("image_hashes_dir", settings.image_hash.get_output_dir(self.series_name)), - ) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.model = None - self.processor = None - self.gpu_processor: Optional[GPUBatchProcessor] = None - self.episode_name_embedder: 
Optional[EpisodeNameEmbedder] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "transcription_jsons" not in args: - raise ValueError("transcription_jsons is required") - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. This application requires GPU.") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.embeddings - - def cleanup(self) -> None: - console.print("[cyan]Unloading embedding model...[/cyan]") - self.model = None - self.processor = None - self._cleanup_memory() - console.print("[green]✓ Model unloaded[/green]") - - def _get_processing_items(self) -> List[ProcessingItem]: - all_transcription_files = list(self.transcription_jsons.glob("**/*.json")) - items = [] - seen_episodes = set() - - for trans_file in all_transcription_files: - if "_simple.json" in trans_file.name or "_text_stats.json" in trans_file.name: - continue - - if trans_file.parent.name in {"clean", "sound_events"}: - continue - - if not trans_file.name.endswith("_segmented.json"): - segmented_version = trans_file.parent / f"{trans_file.stem}_segmented.json" - if segmented_version.exists(): - continue - - episode_info = self.episode_manager.parse_filename(trans_file) - if episode_info: - episode_key = (episode_info.season, episode_info.relative_episode) - if episode_key in seen_episodes: - continue - seen_episodes.add(episode_key) - - items.append(self._create_transcription_processing_item(trans_file)) - - return items - - def _should_skip_item(self, item: ProcessingItem): - trans_file = item.input_path - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" - - if 
clean_transcription_file.exists(): - try: - with open(clean_transcription_file, "r", encoding="utf-8") as f: - data = json.load(f) - segments = data.get("segments", []) - if not segments: - episode_id = item.episode_id - self.logger.warning( - f"Empty clean transcription (no text segments) for {episode_id}, " - f"will skip text embeddings but generate other types (sound events, episode names, etc.)", - ) - except Exception as e: - self.logger.error(f"Failed to read {clean_transcription_file}: {e}") - - return super()._should_skip_item(item) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - outputs = [] - episode_info = self.episode_manager.parse_filename(item.input_path) - if not episode_info: - return outputs - - if self.generate_text: - text_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_text", - ) - text_output = self._build_output_path(episode_info, text_filename) - outputs.append(OutputSpec(path=text_output, required=True)) - - if self.generate_episode_names: - episode_name_filename = f"{FILE_SUFFIXES['episode_name']}.json" - episode_name_output = self._build_output_path(episode_info, episode_name_filename) - outputs.append(OutputSpec(path=episode_name_output, required=True)) - - if self.generate_video: - video_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_video", - ) - video_output = self._build_output_path(episode_info, video_filename) - outputs.append(OutputSpec(path=video_output, required=True)) - - if self.generate_full_episode: - full_episode_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_full_episode", - ) - full_episode_output = self._build_output_path(episode_info, full_episode_filename) - outputs.append(OutputSpec(path=full_episode_output, required=True)) - - if self.generate_sound_events: - 
sound_events_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="_embeddings_sound_events", - ) - sound_events_output = self._build_output_path(episode_info, sound_events_filename) - outputs.append(OutputSpec(path=sound_events_output, required=True)) - - return outputs - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - temp_files = [] - expected_outputs = self._get_expected_outputs(item) - for output in expected_outputs: - temp_path = output.path.with_suffix('.json.tmp') - temp_files.append(str(temp_path)) - return temp_files - - def _get_processing_info(self) -> List[str]: - return [ - f"[cyan]Loading model: {self.model_name}[/cyan]", - f"[cyan]Device: {self.device}[/cyan]", - f"[cyan]Batch size: {self.batch_size}[/cyan]", - ] - - def _load_resources(self) -> bool: - self.__load_model() - self.gpu_processor = GPUBatchProcessor( - self.model, - self.batch_size, - self.logger, - self.device, - progress_sub_batch_size=settings.embedding.progress_sub_batch_size, - ) - self.episode_name_embedder = EpisodeNameEmbedder( - model=self.model, - episode_manager=self.episode_manager, - series_name=self.series_name, - logger=self.logger, - ) - return True - - def __load_model(self) -> None: - try: - self.model = Qwen3VLEmbedder( - model_name_or_path=self.model_name, - torch_dtype=torch.bfloat16, - ) - console.print("[green]Qwen3-VL-Embedding model loaded successfully (vLLM)[/green]") - except Exception as e: - self.logger.error(f"Failed to load model: {e}") - raise - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals,too-many-statements - trans_file = item.input_path - - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - 
base_name = self.__remove_all_suffixes(trans_file.stem) - clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" - - if not clean_transcription_file.exists(): - self.logger.warning(f"Clean transcription not found: {clean_transcription_file}, skipping text embeddings generation") - with open(trans_file, "r", encoding="utf-8") as f: - data = json.load(f) - data["segments"] = [] - else: - with open(clean_transcription_file, "r", encoding="utf-8") as f: - data = json.load(f) - - has_segments = bool(data.get("segments")) - segmented_file = trans_file.parent / f"{trans_file.stem}_segmented.json" - - if not has_segments and segmented_file.exists(): - return - - need_text = any("embeddings_text.json" in str(o.path) for o in missing_outputs) - need_video = any("embeddings_video.json" in str(o.path) for o in missing_outputs) - need_episode_name = any("episode_name_embedding.json" in str(o.path) for o in missing_outputs) - need_full_episode = any("embeddings_full_episode.json" in str(o.path) for o in missing_outputs) - need_sound_events = any("embeddings_sound_events.json" in str(o.path) for o in missing_outputs) - - text_embeddings = [] - if need_text: - text_embeddings = self.__generate_text_embeddings(data) - - sound_event_embeddings = [] - if need_sound_events: - sound_event_embeddings = self.__generate_sound_event_embeddings(trans_file) - - video_embeddings = [] - if need_video: - episode_info = data.get("episode_info", {}) - frame_metadata = self.__load_frame_metadata(episode_info) - if frame_metadata: - video_embeddings = self.__generate_video_embeddings(episode_info, frame_metadata) - - if need_episode_name and self.episode_name_embedder: - self.episode_name_embedder.generate_and_save_for_transcription(data) - - full_episode_embedding = None - if need_full_episode: - full_episode_embedding = self.__generate_full_episode_embedding(trans_file) - - episode_dir = self.__get_episode_output_dir(trans_file) - episode_info_dict = 
data.get("episode_info", {}) - season = episode_info_dict.get("season", 0) - episode_num = episode_info_dict.get("episode_number", 0) - - episode_info_temp = self.episode_manager.get_episode_by_season_and_relative(season, episode_num) - if episode_info_temp: - episode_code = episode_info_temp.episode_code() - else: - episode_code = f"S{season:02d}E{episode_num:02d}" - - text_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_text.json" - video_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_video.json" - full_episode_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_full_episode.json" - sound_events_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_sound_events.json" - self.__save_embeddings( - data, - text_embeddings, - video_embeddings, - full_episode_embedding, - sound_event_embeddings, - text_output, - video_output, - full_episode_output, - sound_events_output, - ) - self._cleanup_memory() - - def __generate_text_embeddings(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals - segments = data.get("segments", []) - if not segments: - return [] - - text_chunks = [] - chunk_metadata = [] - - if True: # Always use sentence-based chunking for text # pylint: disable=using-constant-test - full_text = " ".join([seg.get("text", "") for seg in segments]) - sentences = self.__split_into_sentences(full_text) - - sentences_per_chunk = self.text_sentences_per_chunk - overlap = self.text_chunk_overlap - step = sentences_per_chunk - overlap - - for i in range(0, len(sentences), step): - chunk_sentences = sentences[i:i + sentences_per_chunk] - if not chunk_sentences: - continue - - chunk_text = " ".join(chunk_sentences).strip() - if not chunk_text: - continue - - char_start = sum(len(s) + 1 for s in sentences[:i]) - char_end = char_start + len(chunk_text) - - start_seg_id = 
self.__find_segment_at_position(segments, char_start) - end_seg_id = self.__find_segment_at_position(segments, char_end) - - text_chunks.append(chunk_text) - chunk_metadata.append({ - "segment_range": [start_seg_id, end_seg_id], - "text": chunk_text, - }) - else: - for i in range(0, len(segments), self.segments_per_embedding): - chunk = segments[i: i + self.segments_per_embedding] - combined_text = " ".join([seg.get("text", "") for seg in chunk]) - - if combined_text.strip(): - text_chunks.append(combined_text) - chunk_metadata.append({ - "segment_range": [i, i + len(chunk) - 1], - "text": combined_text, - }) - - if not text_chunks: - return [] - - embeddings = [] - text_batch_size = settings.embedding.text_batch_size - - with self.progress.track_operation( - f"Text embeddings ({len(text_chunks)} chunks)", - (len(text_chunks) + text_batch_size - 1) // text_batch_size, - ) as tracker: - for batch_idx in range(0, len(text_chunks), text_batch_size): - batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] - batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] - - try: - batch_embeddings = self.__encode_text_batch(batch_texts) - for meta, embedding in zip(batch_meta, batch_embeddings): - embeddings.append({ - **meta, - "embedding": embedding.tolist(), - }) - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Failed text embedding batch {batch_idx}: {e}") - - tracker.update((batch_idx // text_batch_size) + 1, interval=5) - - return embeddings - - def __generate_sound_event_embeddings(self, trans_file: Path) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - sound_events_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - base_name = self.__remove_all_suffixes(trans_file.stem) - sound_events_file = 
sound_events_dir / f"{base_name}_sound_events.json" - - if not sound_events_file.exists(): - self.logger.warning(f"Sound events file not found: {sound_events_file}, skipping sound event embeddings generation") - return [] - - try: - with open(sound_events_file, "r", encoding="utf-8") as f: - sound_events_data = json.load(f) - except Exception as e: - self.logger.error(f"Failed to load sound events file {sound_events_file}: {e}") - return [] - - segments = sound_events_data.get("segments", []) - if not segments: - return [] - - text_chunks = [] - chunk_metadata = [] - - for i in range(0, len(segments), self.segments_per_embedding): - chunk = segments[i: i + self.segments_per_embedding] - combined_text = " ".join([seg.get("text", "") for seg in chunk]) - - if combined_text.strip(): - sound_types = set() - for seg in chunk: - sound_type = seg.get("sound_type", "sound") - sound_types.add(sound_type) - - start_time = chunk[0].get("start", 0.0) if chunk else 0.0 - end_time = chunk[-1].get("end", 0.0) if chunk else 0.0 - - text_chunks.append(combined_text) - chunk_metadata.append({ - "segment_range": [i, i + len(chunk) - 1], - "text": combined_text, - "sound_types": list(sound_types), - "start_time": start_time, - "end_time": end_time, - }) - - if not text_chunks: - return [] - - embeddings = [] - text_batch_size = settings.embedding.text_batch_size - - with self.progress.track_operation( - f"Sound event embeddings ({len(text_chunks)} chunks)", - (len(text_chunks) + text_batch_size - 1) // text_batch_size, - ) as tracker: - for batch_idx in range(0, len(text_chunks), text_batch_size): - batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] - batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] - - try: - batch_embeddings = self.__encode_text_batch(batch_texts) - for meta, embedding in zip(batch_meta, batch_embeddings): - embeddings.append({ - **meta, - "embedding": embedding.tolist(), - }) - except (RuntimeError, ValueError, OSError) as e: - 
self.logger.error(f"Failed sound event embedding batch {batch_idx}: {e}") - - tracker.update((batch_idx // text_batch_size) + 1, interval=5) - - return embeddings - - @staticmethod - def __remove_all_suffixes(base_name: str) -> str: - suffixes = (FILE_SUFFIXES["segmented"], FILE_SUFFIXES["sound_events"], FILE_SUFFIXES["clean"], FILE_SUFFIXES["clean_alt"]) - while True: - removed = False - for suffix in suffixes: - if base_name.endswith(suffix): - base_name = base_name[:-len(suffix)] - removed = True - break - if not removed: - break - return base_name - - @staticmethod - def __split_into_sentences(text: str) -> List[str]: - normalized_text = re.sub(r'\.{2,}', '.', text) - normalized_text = re.sub(r'!{2,}', '!', normalized_text) - normalized_text = re.sub(r'\?{2,}', '?', normalized_text) - - sentences = re.split(r'([.!?]+(?:\s+|$))', normalized_text) - raw_sentences = [] - for i in range(0, len(sentences) - 1, 2): - sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "") - sentence = sentence.strip() - if sentence: - raw_sentences.append(sentence) - if len(sentences) % 2 == 1 and sentences[-1].strip(): - raw_sentences.append(sentences[-1].strip()) - - result = [] - buffer = "" - min_sentence_length = 30 - - for sentence in raw_sentences: - buffer = (buffer + " " + sentence).strip() if buffer else sentence - - if len(buffer) >= min_sentence_length: - result.append(buffer) - buffer = "" - - if buffer: - if result: - result[-1] = result[-1] + " " + buffer - else: - result.append(buffer) - - return result - - @staticmethod - def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: - cumulative_length = 0 - for idx, seg in enumerate(segments): - seg_text = seg.get("text", "") - seg_length = len(seg_text) + 1 - if cumulative_length <= char_pos < cumulative_length + seg_length: - return idx - cumulative_length += seg_length - return len(segments) - 1 if segments else 0 - - def __encode_text_batch(self, texts: List[str]) 
-> List[np.ndarray]: - inputs = [{"text": text} for text in texts] - embeddings_tensor = self.model.process(inputs, normalize=True) - embeddings = [emb.cpu().numpy() for emb in embeddings_tensor] - del embeddings_tensor - return embeddings - - def __generate_full_episode_embedding(self, trans_file: Path) -> Optional[Dict[str, Any]]: # pylint: disable=too-many-locals,too-many-statements - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_txt_file = clean_dir / f"{base_name}_clean_transcription.txt" - - if not clean_txt_file.exists(): - self.logger.warning(f"Clean transcript file not found: {clean_txt_file}") - return None - - try: # pylint: disable=too-many-try-statements - with open(clean_txt_file, "r", encoding="utf-8") as f: - full_text = f.read().strip() - - if not full_text: - self.logger.warning(f"Empty clean transcript file: {clean_txt_file}") - return None - - console.print(f"[cyan]Generating full episode embedding ({len(full_text)} chars)...[/cyan]") - - max_chars_per_chunk = 6000 - overlap_chars = 4500 - - if len(full_text) > max_chars_per_chunk: - console.print( - f"[yellow]Text too long ({len(full_text)} chars), " - f"using sliding window (chunk={max_chars_per_chunk}, overlap={overlap_chars})...[/yellow]", - ) - - chunks = [] - step_size = max_chars_per_chunk - overlap_chars - - for i in range(0, len(full_text), step_size): - chunk_end = min(i + max_chars_per_chunk, len(full_text)) - chunk = full_text[i:chunk_end] - - if len(chunk.strip()) < 100: - continue - - chunks.append(chunk) - - if chunk_end >= len(full_text): - break - - console.print(f"[cyan]Processing {len(chunks)} overlapping chunks...[/cyan]") - chunk_embeddings = [] - chunk_weights = [] - - for idx, chunk in 
enumerate(chunks): - inputs = [{"text": chunk}] - embeddings_tensor = self.model.process(inputs, normalize=True) - chunk_embedding = embeddings_tensor[0].cpu().numpy() - chunk_embeddings.append(chunk_embedding) - del embeddings_tensor - - weight = len(chunk) / max_chars_per_chunk - chunk_weights.append(weight) - - if (idx + 1) % 5 == 0 or idx == len(chunks) - 1: - console.print(f"[cyan]Processed chunk {idx + 1}/{len(chunks)}[/cyan]") - - chunk_weights_array = np.array(chunk_weights) - chunk_weights_normalized = chunk_weights_array / chunk_weights_array.sum() - - embedding = np.average(chunk_embeddings, axis=0, weights=chunk_weights_normalized) - embedding = embedding / np.linalg.norm(embedding) - - console.print(f"[green]✓ Weighted-averaged {len(chunks)} overlapping chunks[/green]") - else: - inputs = [{"text": full_text}] - embeddings_tensor = self.model.process(inputs, normalize=True) - embedding = embeddings_tensor[0].cpu().numpy() - del embeddings_tensor - - return { - "text": full_text, - "embedding": embedding.tolist(), - "transcript_length": len(full_text), - } - - except Exception as e: - self.logger.error(f"Failed to generate full episode embedding: {e}") - return None - - def __load_frame_metadata(self, episode_info_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]: - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - if season is None or episode is None: - return None - - episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) - if not episode_info_obj: - return None - - frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) - metadata_file = frames_episode_dir / f"{self.episode_manager.series_name}_{episode_info_obj.episode_code()}_frame_metadata.json" - - if not metadata_file.exists(): - self.logger.warning(f"Frame metadata not found: {metadata_file}") - return None - - with open(metadata_file, "r", encoding="utf-8") as f: - 
return json.load(f) - - def __load_image_hashes(self, episode_info_dict: Dict[str, Any]) -> Dict[int, str]: - return load_image_hashes_for_episode(episode_info_dict, self.series_name, self.logger) - - def __generate_video_embeddings(self, episode_info_dict: Dict[str, Any], frame_metadata: Dict[str, Any]) -> List[Dict[str, Any]]: - frame_requests = frame_metadata.get("frames", []) - if not frame_requests: - return [] - - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - - episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) - if not episode_info_obj: - return [] - - frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) - episode_output_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.embeddings) - checkpoint_file = episode_output_dir / "embeddings_video_checkpoint.json" - - image_hashes = self.__load_image_hashes(episode_info_dict) - embeddings = compute_embeddings_in_batches( - frames_episode_dir, - frame_requests, - self.gpu_processor, - self.batch_size, - image_hashes, - checkpoint_file=checkpoint_file, - checkpoint_interval=20, - prefetch_count=settings.embedding.prefetch_chunks, - ) - self._cleanup_memory() - return embeddings - - def __get_episode_output_dir(self, transcription_file: Path) -> Path: - episode_info_from_file = self.episode_manager.parse_filename(transcription_file) - if episode_info_from_file: - return self.path_manager.get_episode_dir(episode_info_from_file, settings.output_subdirs.embeddings) - return self.path_manager.base_output_dir / settings.output_subdirs.embeddings - - def __save_embeddings( - self, - data, - text_embeddings, - video_embeddings, - full_episode_embedding, - sound_event_embeddings, - text_output, - video_output, - full_episode_output, - sound_events_output, - ): - episode_info = data.get(EpisodeMetadataKeys.EPISODE_INFO, {}) - 
text_output.parent.mkdir(parents=True, exist_ok=True) - - if text_embeddings: - text_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "segments_per_embedding": self.segments_per_embedding, - "use_sentence_based_chunking": True, - "text_sentences_per_chunk": self.text_sentences_per_chunk, - "text_chunk_overlap": self.text_chunk_overlap, - "device": self.device, - }, - statistics={ - "total_embeddings": len(text_embeddings), - "embedding_dimension": len(text_embeddings[0]["embedding"]) if text_embeddings else 0, - }, - results_key="text_embeddings", - results_data=text_embeddings, - ) - atomic_write_json(text_output, text_data, indent=2, ensure_ascii=False) - - if video_embeddings: - video_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "batch_size": self.batch_size, - "device": self.device, - }, - statistics={ - "total_embeddings": len(video_embeddings), - "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, - "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), - }, - results_key="video_embeddings", - results_data=video_embeddings, - ) - atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False) - - if full_episode_embedding: - full_episode_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - 
}, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "device": self.device, - }, - statistics={ - "transcript_length": full_episode_embedding.get("transcript_length", 0), - "embedding_dimension": len(full_episode_embedding["embedding"]) if "embedding" in full_episode_embedding else 0, - }, - results_key="full_episode_embedding", - results_data=full_episode_embedding, - ) - atomic_write_json(full_episode_output, full_episode_data, indent=2, ensure_ascii=False) - console.print(f"[green]✓ Saved full episode embedding to: {full_episode_output}[/green]") - - if sound_event_embeddings: - sound_events_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "segments_per_embedding": self.segments_per_embedding, - "use_sentence_based_chunking": True, - "text_sentences_per_chunk": self.text_sentences_per_chunk, - "text_chunk_overlap": self.text_chunk_overlap, - "device": self.device, - }, - statistics={ - "total_embeddings": len(sound_event_embeddings), - "embedding_dimension": len(sound_event_embeddings[0]["embedding"]) if sound_event_embeddings else 0, - }, - results_key="sound_event_embeddings", - results_data=sound_event_embeddings, - ) - atomic_write_json(sound_events_output, sound_events_data, indent=2, ensure_ascii=False) - console.print(f"[green]✓ Saved sound event embeddings to: {sound_events_output}[/green]") - - @staticmethod - def _cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/embeddings/episode_name_embedder.py b/preprocessor/embeddings/episode_name_embedder.py index 43bd1447f..6e76e718f 100644 --- a/preprocessor/embeddings/episode_name_embedder.py +++ 
b/preprocessor/embeddings/episode_name_embedder.py @@ -10,11 +10,11 @@ import numpy as np from preprocessor.config.config import settings -from preprocessor.core.episode_manager import ( +from preprocessor.core.path_manager import PathManager +from preprocessor.episodes import ( EpisodeInfo, EpisodeManager, ) -from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.constants import EmbeddingKeys from preprocessor.utils.file_utils import atomic_write_json diff --git a/preprocessor/embeddings/gpu_batch_processor.py b/preprocessor/embeddings/gpu_batch_processor.py index 15909c423..55715b239 100644 --- a/preprocessor/embeddings/gpu_batch_processor.py +++ b/preprocessor/embeddings/gpu_batch_processor.py @@ -61,6 +61,37 @@ def suggest_optimal_batch_size(self, target_vram_gb: float = 21.0) -> int: return suggested + @staticmethod + def __compute_embeddings(model: Any, batch_pil: List[Image.Image]) -> List[List[float]]: + inputs = [{"image": img} for img in batch_pil] + embeddings_tensor = model.process(inputs, normalize=True) + batch_np = embeddings_tensor.cpu().numpy() + del embeddings_tensor + results = [emb.tolist() for emb in batch_np] + del batch_np + torch.cuda.empty_cache() + return results + + @staticmethod + def __report_batch_progress( + processed_count: int, + total_images: int, + elapsed: float, + current_batch_size: int, + batch_start_time: float, + ) -> None: + rate = current_batch_size / elapsed if elapsed > 0 else 0 + console.print( + f" [dim cyan]→ {processed_count}/{total_images} " + f"({processed_count / total_images * 100:.0f}%) - {elapsed:.1f}s ({rate:.3f} img/s)[/dim cyan]", + ) + + elapsed_total = time.time() - batch_start_time + remaining_images = total_images - processed_count + if processed_count > 0: + eta = remaining_images / (processed_count / elapsed_total) + console.print(f" [dim]Batch ETA: {eta:.0f}s[/dim]") + def process_images_batch( self, pil_images: List[Image.Image], 
@@ -76,30 +107,20 @@ def _process_sub_batch(batch_pil: List[Image.Image]) -> List[List[float]]: sub_batch_start = time.time() try: - inputs = [{"image": img} for img in batch_pil] - embeddings_tensor = self.model.process(inputs, normalize=True) + results = self.__compute_embeddings(self.model, batch_pil) self.__log_vram_usage() - batch_np = embeddings_tensor.cpu().numpy() - del embeddings_tensor - results = [emb.tolist() for emb in batch_np] - del batch_np - torch.cuda.empty_cache() processed_count += current_batch_size if total_images > self.progress_sub_batch_size: elapsed = time.time() - sub_batch_start - rate = current_batch_size / elapsed if elapsed > 0 else 0 - console.print( - f" [dim cyan]→ {processed_count}/{total_images} " - f"({processed_count / total_images * 100:.0f}%) - {elapsed:.1f}s ({rate:.3f} img/s)[/dim cyan]", + self.__report_batch_progress( + processed_count, + total_images, + elapsed, + current_batch_size, + batch_start_time, ) - elapsed_total = time.time() - batch_start_time - remaining_images = total_images - processed_count - if processed_count > 0: - eta = remaining_images / (processed_count / elapsed_total) - console.print(f" [dim]Batch ETA: {eta:.0f}s[/dim]") - return results except RuntimeError as e: if "out of memory" in str(e).lower(): diff --git a/preprocessor/episodes/__init__.py b/preprocessor/episodes/__init__.py new file mode 100644 index 000000000..1254a6ef9 --- /dev/null +++ b/preprocessor/episodes/__init__.py @@ -0,0 +1,8 @@ +from preprocessor.episodes.episode_file_finder import EpisodeFileFinder +from preprocessor.episodes.episode_manager import ( + EpisodeInfo, + EpisodeManager, +) +from preprocessor.episodes.episode_parser import EpisodeInfoParser + +__all__ = ["EpisodeInfo", "EpisodeManager", "EpisodeInfoParser", "EpisodeFileFinder"] diff --git a/preprocessor/core/episode_file_finder.py b/preprocessor/episodes/episode_file_finder.py similarity index 100% rename from preprocessor/core/episode_file_finder.py rename to 
preprocessor/episodes/episode_file_finder.py diff --git a/preprocessor/core/episode_manager.py b/preprocessor/episodes/episode_manager.py similarity index 97% rename from preprocessor/core/episode_manager.py rename to preprocessor/episodes/episode_manager.py index 66ad400e0..21f0baf19 100644 --- a/preprocessor/core/episode_manager.py +++ b/preprocessor/episodes/episode_manager.py @@ -9,9 +9,9 @@ Optional, ) -from preprocessor.core.episode_file_finder import EpisodeFileFinder -from preprocessor.core.episode_parser import EpisodeInfoParser from preprocessor.core.path_manager import PathManager +from preprocessor.episodes.episode_file_finder import EpisodeFileFinder +from preprocessor.episodes.episode_parser import EpisodeInfoParser from preprocessor.utils.constants import ( EpisodeMetadataKeys, EpisodesDataKeys, diff --git a/preprocessor/core/episode_parser.py b/preprocessor/episodes/episode_parser.py similarity index 100% rename from preprocessor/core/episode_parser.py rename to preprocessor/episodes/episode_parser.py diff --git a/preprocessor/processors/archive_generator.py b/preprocessor/processors/archive_generator.py index 09e1b9797..90a947bfc 100644 --- a/preprocessor/processors/archive_generator.py +++ b/preprocessor/processors/archive_generator.py @@ -20,8 +20,8 @@ FILE_EXTENSIONS, FILE_SUFFIXES, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager from preprocessor.utils.console import console ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs diff --git a/preprocessor/processors/character_detector.py b/preprocessor/processors/character_detector.py index 95fb682e1..64de214be 100644 --- a/preprocessor/processors/character_detector.py +++ b/preprocessor/processors/character_detector.py @@ -21,9 +21,9 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from 
preprocessor.core.path_manager import PathManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager from preprocessor.utils.console import console from preprocessor.utils.detection_io import ( process_frames_for_detection, @@ -109,13 +109,10 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) if f.is_file() and "frame_" in f.name ]) - fps = 25.0 - results = process_frames_for_detection( frame_files, self.face_app, self.character_vectors, self.threshold, - fps=fps, ) - save_character_detections(episode_info, results, self.path_manager, fps=fps) + save_character_detections(episode_info, results, self.path_manager) diff --git a/preprocessor/processors/elastic_document_generator.py b/preprocessor/processors/elastic_document_generator.py index ff6cea1d7..16c4b7abe 100644 --- a/preprocessor/processors/elastic_document_generator.py +++ b/preprocessor/processors/elastic_document_generator.py @@ -8,12 +8,6 @@ Optional, ) -from preprocessor.types import ( - CharacterDetectionInFrame, - EpisodeMetadata, - ObjectDetectionInFrame, - SceneTimestampsData, -) from preprocessor.config.config import ( get_base_output_dir, settings, @@ -27,9 +21,15 @@ FILE_EXTENSIONS, FILE_SUFFIXES, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder +from preprocessor.episodes import EpisodeManager +from preprocessor.types import ( + CharacterDetectionInFrame, + EpisodeMetadata, + ObjectDetectionInFrame, + SceneTimestampsData, +) from preprocessor.utils.console import console from preprocessor.utils.constants import ( CharacterDetectionKeys, @@ -106,7 +106,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.text_segments, segments_filename, - self.series_name, ) 
outputs.append(OutputSpec(path=segments_file, required=True)) @@ -114,7 +113,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_code = episode_info.episode_num() trans_dir = self.path_manager.base_output_dir / settings.output_subdirs.transcriptions / season_code / episode_code sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events - sound_events_filename = self.episode_manager.file_naming.build_filename( + sound_events_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="sound_events", @@ -126,7 +125,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.sound_events, sound_events_elastic, - self.series_name, ) outputs.append(OutputSpec(path=sound_events_file, required=False)) else: @@ -155,7 +153,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.text_embeddings, text_embeddings_filename, - self.series_name, ) outputs.append(OutputSpec(path=text_embeddings_file, required=True)) @@ -165,7 +162,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.video_frames, video_frames_filename, - self.series_name, ) outputs.append(OutputSpec(path=video_frames_file, required=True)) @@ -181,7 +177,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.episode_names, episode_name_filename, - self.series_name, ) outputs.append(OutputSpec(path=episode_name_file, required=True)) @@ -197,7 +192,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.text_statistics, text_stats_elastic_filename, - self.series_name, ) outputs.append(OutputSpec(path=text_stats_elastic_file, required=True)) @@ -208,7 +202,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> 
List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.full_episode_embeddings, full_episode_elastic_filename, - self.series_name, ) outputs.append(OutputSpec(path=full_episode_elastic_file, required=True)) @@ -219,7 +212,6 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # py episode_info, ELASTIC_SUBDIRS.sound_event_embeddings, sound_event_elastic_filename, - self.series_name, ) outputs.append(OutputSpec(path=sound_event_elastic_file, required=False)) @@ -272,7 +264,6 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) episode_metadata = self.__build_episode_metadata(episode_info) episode_id = episode_info.episode_code() - from preprocessor.core.constants import FILE_EXTENSIONS filename = f"{self.series_name.lower()}_{episode_info.episode_code()}{FILE_EXTENSIONS['mp4']}" video_path = str(Path("bot") / f"{self.series_name.upper()}-WIDEO" / episode_info.season_code() / filename) diff --git a/preprocessor/processors/elasticsearch_indexer.py b/preprocessor/processors/elasticsearch_indexer.py index b18850650..1369a6425 100644 --- a/preprocessor/processors/elasticsearch_indexer.py +++ b/preprocessor/processors/elasticsearch_indexer.py @@ -21,8 +21,8 @@ settings, ) from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager from preprocessor.processors.elasticsearch_manager import ElasticSearchManager from preprocessor.utils.console import console diff --git a/preprocessor/processors/embedding_generator.py b/preprocessor/processors/embedding_generator.py index 28b71fb1f..389f5a985 100644 --- a/preprocessor/processors/embedding_generator.py +++ b/preprocessor/processors/embedding_generator.py @@ -20,11 +20,11 @@ ProcessingItem, ) from preprocessor.core.constants import FILE_SUFFIXES -from preprocessor.core.episode_manager import 
EpisodeManager from preprocessor.core.processor_registry import register_processor from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder +from preprocessor.episodes import EpisodeManager from preprocessor.utils.batch_processing_utils import compute_embeddings_in_batches from preprocessor.utils.console import console from preprocessor.utils.constants import EpisodeMetadataKeys @@ -51,8 +51,14 @@ def __init__(self, args: Dict[str, Any]): ) self.transcription_jsons: Path = self._args["transcription_jsons"] - self.frames_dir: Path = self._args.get("frames_dir", settings.frame_export.output_dir) - self.output_dir: Path = self._args.get("output_dir", settings.embedding.default_output_dir) + self.frames_dir: Path = self._args.get( + "frames_dir", + settings.frame_export.get_output_dir(self.series_name), + ) + self.output_dir: Path = self._args.get( + "output_dir", + settings.embedding.get_output_dir(self.series_name), + ) self.model_name: str = self._args.get("model", settings.embedding_model.model_name) self.model_revision: str = self._args.get("model_revision", settings.embedding_model.model_revision) @@ -68,7 +74,7 @@ def __init__(self, args: Dict[str, Any]): self.generate_full_episode: bool = self._args.get("generate_full_episode", settings.embedding.generate_full_episode_embedding) self.generate_sound_events: bool = self._args.get("generate_sound_events", True) - self.image_hashes_dir: Path = Path(self._args.get("image_hashes_dir", settings.image_hash.output_dir)) + self.image_hashes_dir: Path = Path(self._args.get("image_hashes_dir", settings.image_hash.get_output_dir(self.series_name))) episodes_info_json = self._args.get("episodes_info_json") self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) @@ -157,7 +163,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> 
List[OutputSpec]: return outputs if self.generate_text: - text_filename = self.episode_manager.file_naming.build_filename( + text_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_embeddings_text", @@ -171,7 +177,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: outputs.append(OutputSpec(path=episode_name_output, required=True)) if self.generate_video: - video_filename = self.episode_manager.file_naming.build_filename( + video_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_embeddings_video", @@ -180,7 +186,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: outputs.append(OutputSpec(path=video_output, required=True)) if self.generate_full_episode: - full_episode_filename = self.episode_manager.file_naming.build_filename( + full_episode_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_embeddings_full_episode", @@ -189,7 +195,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: outputs.append(OutputSpec(path=full_episode_output, required=True)) if self.generate_sound_events: - sound_events_filename = self.episode_manager.file_naming.build_filename( + sound_events_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_embeddings_sound_events", diff --git a/preprocessor/processors/frame_exporter.py b/preprocessor/processors/frame_exporter.py index 4bf0c14bf..790eecf4b 100644 --- a/preprocessor/processors/frame_exporter.py +++ b/preprocessor/processors/frame_exporter.py @@ -1,6 +1,6 @@ -import logging from datetime import datetime import json +import logging from pathlib import Path import shutil import subprocess @@ -14,23 +14,23 @@ from PIL import Image import decord -from preprocessor.types import FrameRequest from preprocessor.config.config import settings from 
preprocessor.core.base_processor import ( - BaseProcessor, OutputSpec, ProcessingItem, ) from preprocessor.core.enums import KeyframeStrategy -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.core.video_processor import VideoProcessor from preprocessor.embeddings.strategies.strategy_factory import KeyframeStrategyFactory +from preprocessor.episodes import EpisodeManager +from preprocessor.types import FrameRequest from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json @register_processor("export_frames") -class FrameExporter(BaseProcessor): +class FrameExporter(VideoProcessor): REQUIRES = ["videos", "scene_timestamps"] PRODUCES = ["frames"] PRIORITY = 30 @@ -45,15 +45,14 @@ def __init__(self, args: Dict[str, Any]) -> None: ) decord.bridge.set_bridge('native') - self.input_videos: Path = Path(self._args["videos"]) - self.subdirectory_filter: Optional[str] = None - episodes_json_path = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) - - self.output_frames: Path = Path(self._args.get("output_frames", settings.frame_export.output_dir)) + self.output_frames: Path = Path( + self._args.get("output_frames", settings.frame_export.get_output_dir(self.series_name)), + ) self.output_frames.mkdir(parents=True, exist_ok=True) - self.scene_timestamps_dir: Path = Path(self._args.get("scene_timestamps_dir", settings.scene_detection.output_dir)) + self.scene_timestamps_dir: Path = Path( + self._args.get("scene_timestamps_dir", settings.scene_detection.get_output_dir(self.series_name)), + ) resolution = self._args.get("resolution", settings.frame_export.resolution) self.resize_width: int = resolution.width @@ -68,18 +67,8 @@ def __init__(self, args: Dict[str, Any]) -> None: self.frames_per_scene, ) - def _get_processing_items(self) -> List[ProcessingItem]: - return 
self._create_video_processing_items( - source_path=self.input_videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=True, - subdirectory_filter=self.subdirectory_filter, - ) - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") + self._validate_videos_required(args) if "scene_timestamps_dir" in args: scene_path = Path(args["scene_timestamps_dir"]) @@ -92,7 +81,7 @@ def get_output_subdir(self) -> str: def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - metadata_filename = self.episode_manager.file_naming.build_filename( + metadata_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_frame_metadata", @@ -112,7 +101,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) episode_dir = self.__get_episode_dir(episode_info) if episode_dir.exists(): - metadata_filename = self.episode_manager.file_naming.build_filename( + metadata_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_frame_metadata", @@ -174,7 +163,7 @@ def __extract_and_save_frame(self, vr, frame_num: int, episode_dir: Path, episod frame_pil = Image.fromarray(frame_np) resized = self.__resize_frame(frame_pil, self.current_video_dar) - base_filename = self.episode_manager.file_naming.build_base_filename(episode_info) + base_filename = self.episode_manager.path_manager.build_base_filename(episode_info) filename = f"{base_filename}_frame_{frame_num:06d}.jpg" resized.save(episode_dir / filename, quality=90) @@ -253,7 +242,7 @@ def __write_metadata(self, episode_dir: Path, frame_requests: List[FrameRequest] frame_with_path = frame.copy() frame_num = frame["frame_number"] - base_filename = self.episode_manager.file_naming.build_base_filename(episode_info) + base_filename = 
self.episode_manager.path_manager.build_base_filename(episode_info) frame_with_path["frame_path"] = f"{base_filename}_frame_{frame_num:06d}.jpg" frames_with_paths.append(frame_with_path) @@ -282,7 +271,7 @@ def __write_metadata(self, episode_dir: Path, frame_requests: List[FrameRequest] }, "frames": frames_with_paths, } - metadata_filename = self.episode_manager.file_naming.build_filename( + metadata_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_frame_metadata", diff --git a/preprocessor/processors/image_hash_processor.py b/preprocessor/processors/image_hash_processor.py index bc891ea98..bb1da9909 100644 --- a/preprocessor/processors/image_hash_processor.py +++ b/preprocessor/processors/image_hash_processor.py @@ -17,12 +17,12 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor -from preprocessor.utils.image_hasher import PerceptualHasher +from preprocessor.episodes import EpisodeManager from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches from preprocessor.utils.console import console -from preprocessor.utils.metadata_utils import create_processing_metadata +from preprocessor.utils.hash_save_utils import save_image_hashes_to_json +from preprocessor.utils.image_hasher import PerceptualHasher # pylint: disable=duplicate-code @@ -42,8 +42,12 @@ def __init__(self, args: Dict[str, Any]) -> None: loglevel=logging.DEBUG, ) - self.frames_dir: Path = Path(self._args.get("frames_dir", settings.frame_export.output_dir)) - self.output_dir: Path = Path(self._args.get("output_dir", settings.image_hash.output_dir)) + self.frames_dir: Path = Path( + self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)), + ) + self.output_dir: Path = Path( + self._args.get("output_dir", settings.image_hash.get_output_dir(self.series_name)), + ) self.batch_size: int = 
self._args.get("batch_size", settings.embedding.batch_size) self.device: str = "cuda" @@ -75,7 +79,7 @@ def _get_processing_items(self) -> List[ProcessingItem]: def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] - hash_filename = self.episode_manager.file_naming.build_filename( + hash_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="image_hashes", @@ -98,50 +102,33 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) metadata_file = item.input_path episode_info = item.metadata["episode_info"] - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") + frame_requests = self.__load_frame_requests(metadata_file) + if frame_requests is None: return frames_dir = metadata_file.parent hash_results = compute_hashes_in_batches(frames_dir, frame_requests, self.hasher, self.batch_size) - episode_dir = self.__get_episode_output_dir(episode_info) - self.__save_hashes(episode_dir, episode_info, hash_results) + save_image_hashes_to_json( + episode_info=episode_info, + hash_results=hash_results, + series_name=self.series_name, + device=self.device, + batch_size=self.batch_size, + ) self.__cleanup_memory() - def __get_episode_output_dir(self, episode_info) -> Path: - return self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.image_hashes) - - def __save_hashes(self, episode_dir: Path, episode_info, hash_results: List[Dict[str, Any]]) -> None: - episode_dir.mkdir(parents=True, exist_ok=True) + @staticmethod + def __load_frame_requests(metadata_file: Path) -> Optional[List[Dict[str, Any]]]: + with open(metadata_file, "r", encoding="utf-8") as f: + metadata = json.load(f) - hash_data = create_processing_metadata( - episode_info=episode_info, - 
processing_params={ - "device": self.device, - "batch_size": self.batch_size, - "hash_size": 8, - }, - statistics={ - "total_hashes": len(hash_results), - "unique_hashes": len(set(h.get("perceptual_hash") for h in hash_results if "perceptual_hash" in h)), - }, - results_key="image_hashes", - results_data=hash_results, - ) + frame_requests = metadata.get("frames", []) + if not frame_requests: + console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") + return None - hash_filename = self.episode_manager.file_naming.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - with open(hash_output, "w", encoding="utf-8") as f: - json.dump(hash_data, f, indent=2, ensure_ascii=False) + return frame_requests @staticmethod def __cleanup_memory() -> None: diff --git a/preprocessor/processors/scene_detector.py b/preprocessor/processors/scene_detector.py index 7c58a7a09..c2d0ab9eb 100644 --- a/preprocessor/processors/scene_detector.py +++ b/preprocessor/processors/scene_detector.py @@ -13,15 +13,15 @@ import torch from transnetv2_pytorch import TransNetV2 -from preprocessor.types import SceneDict from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager +from preprocessor.types import SceneDict from preprocessor.utils.console import console from preprocessor.utils.file_utils import atomic_write_json @@ -42,7 +42,10 @@ def __init__(self, args: Dict[str, Any]): ) self.videos: Path = self._args["videos"] - self.output_dir: Path = self._args.get("output_dir", settings.scene_detection.output_dir) + self.output_dir: Path = self._args.get( + "output_dir", + settings.scene_detection.get_output_dir(self.series_name), + ) self.threshold: float 
= self._args.get("threshold", settings.scene_detection.threshold) self.min_scene_len: int = self._args.get("min_scene_len", settings.scene_detection.min_scene_len) @@ -82,7 +85,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata.get("episode_info") if episode_info: - output_filename = self.episode_manager.file_naming.build_filename( + output_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="scenes", diff --git a/preprocessor/processors/text_analyzer.py b/preprocessor/processors/text_analyzer.py index 7333a8c8d..e950db5b1 100644 --- a/preprocessor/processors/text_analyzer.py +++ b/preprocessor/processors/text_analyzer.py @@ -15,8 +15,8 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager from preprocessor.text_analysis.text_statistics import TextStatistics from preprocessor.utils.file_utils import atomic_write_json @@ -95,7 +95,7 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - output_filename = self.episode_manager.file_naming.build_filename( + output_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="text_stats", @@ -110,7 +110,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) episode_info = item.metadata["episode_info"] clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - output_filename = self.episode_manager.file_naming.build_filename( + output_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="text_stats", diff --git a/preprocessor/processors/transcription_generator.py 
b/preprocessor/processors/transcription_generator.py index 2f7bed833..13d914d84 100644 --- a/preprocessor/processors/transcription_generator.py +++ b/preprocessor/processors/transcription_generator.py @@ -13,8 +13,8 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager from preprocessor.transcription.generators.multi_format_generator import MultiFormatGenerator from preprocessor.transcription.processors.audio_normalizer import AudioNormalizer from preprocessor.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor @@ -84,13 +84,13 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: if not episode_info: continue - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_code = episode_info.season_code() episode_code = episode_info.episode_num() expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename expected_file.parent.mkdir(parents=True, exist_ok=True) - segmented_filename = self.episode_manager.file_naming.build_filename( + segmented_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_segmented", @@ -153,13 +153,13 @@ def __check_all_transcriptions_exist(self) -> bool: if not episode_info: continue - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_code = episode_info.season_code() episode_code = episode_info.episode_num() expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename 
expected_file.parent.mkdir(parents=True, exist_ok=True) - segmented_filename = self.episode_manager.file_naming.build_filename( + segmented_filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="_segmented", @@ -189,7 +189,7 @@ def __get_missing_video_files(self, missing_outputs: List[OutputSpec]) -> List[P if not episode_info: continue - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_code = episode_info.season_code() episode_code = episode_info.episode_num() expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename diff --git a/preprocessor/processors/transcription_importer.py b/preprocessor/processors/transcription_importer.py index 8646d8dcb..4eda52515 100644 --- a/preprocessor/processors/transcription_importer.py +++ b/preprocessor/processors/transcription_importer.py @@ -10,9 +10,10 @@ Tuple, ) +from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.episodes import EpisodeManager from preprocessor.utils.console import ( console, create_progress, @@ -139,7 +140,7 @@ def __import_single_file(self, json_file: Path) -> None: if episode_info: converted_data["episode_info"] = EpisodeManager.get_metadata(episode_info) - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_dir = self.output_dir / episode_info.season_code() output_file = season_dir / filename output_file.parent.mkdir(parents=True, exist_ok=True) diff --git a/preprocessor/processors/video_transcoder.py 
b/preprocessor/processors/video_transcoder.py index ab3470f41..21070a2e4 100644 --- a/preprocessor/processors/video_transcoder.py +++ b/preprocessor/processors/video_transcoder.py @@ -12,13 +12,12 @@ from preprocessor.config.config import settings from preprocessor.core.base_processor import ( - BaseProcessor, OutputSpec, ProcessingItem, ) from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.processor_registry import register_processor +from preprocessor.core.video_processor import VideoProcessor from preprocessor.utils.constants import ( FfprobeKeys, FfprobeStreamKeys, @@ -27,7 +26,7 @@ @register_processor("transcode") -class VideoTranscoder(BaseProcessor): +class VideoTranscoder(VideoProcessor): REQUIRES = ["videos"] PRODUCES = ["transcoded_videos"] PRIORITY = 10 @@ -41,11 +40,6 @@ def __init__(self, args: Dict[str, Any]) -> None: loglevel=logging.DEBUG, ) - self.input_videos: Path = Path(self._args["videos"]) - self.subdirectory_filter: Optional[str] = None - episodes_json_path = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) - self.resolution: Resolution = self._args["resolution"] self.codec: str = str(self._args["codec"]) self.preset: str = "p7" @@ -56,18 +50,8 @@ def __init__(self, args: Dict[str, Any]) -> None: self.audio_bitrate_kbps: int = int(self._args.get("audio_bitrate_kbps", 128)) self.gop_size: float = float(self._args["gop_size"]) - def _get_processing_items(self) -> List[ProcessingItem]: - return self._create_video_processing_items( - source_path=self.input_videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=True, - subdirectory_filter=self.subdirectory_filter, - ) - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") + 
self._validate_videos_required(args) if "resolution" not in args: raise ValueError("resolution is required") if "codec" not in args: diff --git a/preprocessor/scraping/base_scraper.py b/preprocessor/scraping/base_scraper.py index 5f911b9c8..6949dd3e2 100644 --- a/preprocessor/scraping/base_scraper.py +++ b/preprocessor/scraping/base_scraper.py @@ -11,12 +11,12 @@ from rich.progress import Progress from preprocessor.config.config import settings +from preprocessor.config.llm_provider import LLMProvider from preprocessor.core.base_processor import BaseProcessor from preprocessor.core.enums import ( ParserMode, ScraperMethod, ) -from preprocessor.config.llm_provider import LLMProvider from preprocessor.scraping.clipboard import ScraperClipboard from preprocessor.scraping.crawl4ai import ScraperCrawl4AI from preprocessor.utils.console import ( diff --git a/preprocessor/scripts_temp/import_transcriptions.py b/preprocessor/scripts_temp/import_transcriptions.py deleted file mode 100644 index dbe01f8a5..000000000 --- a/preprocessor/scripts_temp/import_transcriptions.py +++ /dev/null @@ -1,94 +0,0 @@ -import json -from pathlib import Path -import re -import shutil -from typing import ( - Optional, - Tuple, -) - -SOURCE_DIR = Path("/mnt/c/GIT_REPO/RANCZO_KLIPY/sceny-trans") -OUTPUT_DIR = Path("/mnt/c/GIT_REPO/RANCZO_KLIPY/preprocessor/output_data/transcriptions") -SERIES_NAME = "ranczo" - - -def parse_filename(filename: str) -> Optional[Tuple[int, int]]: - match = re.search(r"S(\d{2})E(\d{2})", filename, re.IGNORECASE) - if match: - return int(match.group(1)), int(match.group(2)) - return None - - -def _copy_and_fix_file(source_dir: Path, filename_base: str, season: int, episode: int) -> bool: - raw_dir = OUTPUT_DIR / f"S{season:02d}" / f"E{episode:02d}" / "raw" - raw_dir.mkdir(parents=True, exist_ok=True) - - episode_info = { - "season": season, - "episode_number": episode, - } - - segmented_src = source_dir / "segmented_json" / f"{filename_base}_segmented.json" - 
simple_src = source_dir / "simple_json" / f"{filename_base}_simple.json" - srt_src = source_dir / "srt" / f"{filename_base}.srt" - txt_src = source_dir / "txt" / f"{filename_base}.txt" - - if not segmented_src.exists(): - print(f" ERROR: Missing {segmented_src.name}") - return False - - try: # pylint: disable=too-many-try-statements - with open(segmented_src, "r", encoding="utf-8") as f: - segmented_data = json.load(f) - segmented_data["episode_info"] = episode_info - segmented_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}_segmented.json" - with open(segmented_dst, "w", encoding="utf-8") as f: - json.dump(segmented_data, f, indent=2, ensure_ascii=False) - print(f" Created: {segmented_dst}") - - with open(simple_src, "r", encoding="utf-8") as f: - simple_data = json.load(f) - simple_data["episode_info"] = episode_info - simple_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}_simple.json" - with open(simple_dst, "w", encoding="utf-8") as f: - json.dump(simple_data, f, indent=2, ensure_ascii=False) - print(f" Created: {simple_dst}") - - srt_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}.srt" - shutil.copy2(srt_src, srt_dst) - print(f" Created: {srt_dst}") - - txt_dst = raw_dir / f"{SERIES_NAME}_S{season:02d}E{episode:02d}.txt" - shutil.copy2(txt_src, txt_dst) - print(f" Created: {txt_dst}") - - return True - except Exception as e: - print(f" ERROR: {e}") - return False - - -def main() -> None: - print(f"Source: {SOURCE_DIR}") - print(f"Output: {OUTPUT_DIR}") - - segmented_dir = SOURCE_DIR / "segmented_json" - if not segmented_dir.exists(): - print(f"ERROR: {segmented_dir} does not exist") - return - - for segmented_file in sorted(segmented_dir.glob("*_segmented.json")): - filename_base = segmented_file.stem.replace("_segmented", "") - - parsed = parse_filename(filename_base) - if not parsed: - print(f"Skipping (cannot parse): {filename_base}") - continue - - season, episode = parsed - print(f"{filename_base} -> 
S{season:02d}E{episode:02d}") - _copy_and_fix_file(SOURCE_DIR, filename_base, season, episode) - - -if __name__ == "__main__": - main() diff --git a/preprocessor/search/__init__.py b/preprocessor/search/__init__.py new file mode 100644 index 000000000..f3bba3a4d --- /dev/null +++ b/preprocessor/search/__init__.py @@ -0,0 +1,11 @@ +from preprocessor.search.elasticsearch_queries import ElasticsearchQueries +from preprocessor.search.embedding_service import EmbeddingService +from preprocessor.search.hash_service import HashService +from preprocessor.search.result_formatters import ResultFormatter + +__all__ = [ + "ElasticsearchQueries", + "EmbeddingService", + "HashService", + "ResultFormatter", +] diff --git a/preprocessor/search/elasticsearch_queries.py b/preprocessor/search/elasticsearch_queries.py new file mode 100644 index 000000000..2acf7e122 --- /dev/null +++ b/preprocessor/search/elasticsearch_queries.py @@ -0,0 +1,467 @@ +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from elasticsearch import AsyncElasticsearch + +from preprocessor.search.embedding_service import EmbeddingService + + +class ElasticsearchQueries: + def __init__(self, embedding_service: EmbeddingService) -> None: + self._embedding_service = embedding_service + + @staticmethod + def _build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: + filters = [] + if season is not None: + filters.append({"term": {"episode_metadata.season": season}}) + if episode is not None: + filters.append({"term": {"episode_metadata.episode_number": episode}}) + return filters + + async def search_text_query( + self, + es_client: AsyncElasticsearch, + query: str, + season: Optional[int] = None, + episode: Optional[int] = None, + limit: int = 20, + ) -> Dict[str, Any]: + must_clauses = [{ + "multi_match": { + "query": query, + "fields": ["text^2", "episode_metadata.title"], + "fuzziness": "AUTO", + }, + }] + + 
must_clauses.extend(self._build_episode_filters(season, episode)) + + query_body = {"bool": {"must": must_clauses}} + + return await es_client.search( + index="ranczo_segments", + query=query_body, + size=limit, + _source=[ + "episode_id", "segment_id", "text", "start_time", "end_time", + "speaker", "video_path", "episode_metadata", "scene_info", + ], + ) + + async def search_text_semantic( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int] = None, + episode: Optional[int] = None, + limit: int = 10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + + filter_clauses = self._build_episode_filters(season, episode) + + knn_query: Dict[str, Any] = { + "field": "text_embedding", + "query_vector": embedding, + "k": limit, + "num_candidates": limit * 10, + } + if filter_clauses: + knn_query["filter"] = filter_clauses + + return await es_client.search( + index="ranczo_text_embeddings", + knn=knn_query, + size=limit, + _source=[ + "episode_id", "embedding_id", "text", "segment_range", + "video_path", "episode_metadata", "scene_info", + ], + ) + + async def search_video_semantic( + self, + es_client: AsyncElasticsearch, + image_path: str, + season: Optional[int] = None, + episode: Optional[int] = None, + character: Optional[str] = None, + limit: int = 10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_image_embedding(image_path) + + filter_clauses = self._build_episode_filters(season, episode) + if character: + filter_clauses.append({ + "nested": { + "path": "character_appearances", + "query": {"term": {"character_appearances.name": character}}, + }, + }) + + knn_query: Dict[str, Any] = { + "field": "video_embedding", + "query_vector": embedding, + "k": limit, + "num_candidates": limit * 10, + } + if filter_clauses: + knn_query["filter"] = filter_clauses + + return await es_client.search( + index="ranczo_video_frames", + knn=knn_query, + size=limit, + _source=[ + "episode_id", "frame_number", 
"timestamp", "frame_type", "scene_number", + "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", + ], + ) + + async def search_text_to_video( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int] = None, + episode: Optional[int] = None, + character: Optional[str] = None, + limit: int = 10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + + filter_clauses = self._build_episode_filters(season, episode) + if character: + filter_clauses.append({ + "nested": { + "path": "character_appearances", + "query": {"term": {"character_appearances.name": character}}, + }, + }) + + knn_query: Dict[str, Any] = { + "field": "video_embedding", + "query_vector": embedding, + "k": limit, + "num_candidates": limit * 10, + } + if filter_clauses: + knn_query["filter"] = filter_clauses + + return await es_client.search( + index="ranczo_video_frames", + knn=knn_query, + size=limit, + _source=[ + "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", + "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", + ], + ) + + @staticmethod + async def search_by_character( + es_client: AsyncElasticsearch, + character: str, + season: Optional[int] = None, + episode: Optional[int] = None, + limit: int = 20, + ) -> Dict[str, Any]: + must_clauses = [{ + "nested": { + "path": "character_appearances", + "query": {"term": {"character_appearances.name": character}}, + }, + }] + + must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) + + return await es_client.search( + index="ranczo_video_frames", + query={"bool": {"must": must_clauses}}, + size=limit, + _source=[ + "episode_id", "frame_number", "timestamp", "video_path", + "episode_metadata", "character_appearances", "scene_info", + ], + ) + + @staticmethod + async def search_by_emotion( + es_client: AsyncElasticsearch, + emotion: str, + season: Optional[int] = None, + 
episode: Optional[int] = None, + character: Optional[str] = None, + limit: int = 20, + ) -> Dict[str, Any]: + nested_must = [{"term": {"character_appearances.emotion.label": emotion}}] + if character: + nested_must.append({"term": {"character_appearances.name": character}}) + + must_clauses = [{ + "nested": { + "path": "character_appearances", + "query": {"bool": {"must": nested_must}}, + }, + }] + + must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) + + nested_filter: Dict[str, Any] = {"term": {"character_appearances.emotion.label": emotion}} + if character: + nested_filter = { + "bool": { + "must": [ + {"term": {"character_appearances.emotion.label": emotion}}, + {"term": {"character_appearances.name": character}}, + ], + }, + } + + return await es_client.search( + index="ranczo_video_frames", + query={"bool": {"must": must_clauses}}, + sort=[ + { + "character_appearances.emotion.confidence": { + "order": "desc", + "nested": { + "path": "character_appearances", + "filter": nested_filter, + }, + }, + }, + ], + track_scores=True, + size=limit, + _source=[ + "episode_id", "frame_number", "timestamp", "video_path", + "episode_metadata", "character_appearances", "scene_info", + ], + ) + + @staticmethod + async def search_by_object( + es_client: AsyncElasticsearch, + object_query: str, + season: Optional[int] = None, + episode: Optional[int] = None, + limit: int = 20, + ) -> Dict[str, Any]: + filter_clauses = ElasticsearchQueries._build_episode_filters(season, episode) + + must_clauses: List[Dict[str, Any]] = [] + + if ":" in object_query: + object_class, count_filter = object_query.split(":", 1) + object_class = object_class.strip() + + if count_filter.endswith("+"): + min_count = int(count_filter[:-1]) + must_clauses.append({ + "nested": { + "path": "detected_objects", + "query": { + "bool": { + "must": [ + {"term": {"detected_objects.class": object_class}}, + {"range": {"detected_objects.count": {"gte": min_count}}}, + ], + }, + }, + 
}, + }) + elif "-" in count_filter: + min_c, max_c = count_filter.split("-") + must_clauses.append({ + "nested": { + "path": "detected_objects", + "query": { + "bool": { + "must": [ + {"term": {"detected_objects.class": object_class}}, + {"range": {"detected_objects.count": {"gte": int(min_c), "lte": int(max_c)}}}, + ], + }, + }, + }, + }) + else: + exact_count = int(count_filter) + must_clauses.append({ + "nested": { + "path": "detected_objects", + "query": { + "bool": { + "must": [ + {"term": {"detected_objects.class": object_class}}, + {"term": {"detected_objects.count": exact_count}}, + ], + }, + }, + }, + }) + else: + must_clauses.append({ + "nested": { + "path": "detected_objects", + "query": { + "term": {"detected_objects.class": object_query.strip()}, + }, + }, + }) + + query_body = { + "bool": { + "must": must_clauses, + "filter": filter_clauses, + }, + } + + object_class = object_query.split(":")[0].strip() if ":" in object_query else object_query.strip() + + return await es_client.search( + index="ranczo_video_frames", + query=query_body, + sort=[ + { + "detected_objects.count": { + "order": "desc", + "nested": { + "path": "detected_objects", + "filter": {"term": {"detected_objects.class": object_class}}, + }, + }, + }, + ], + track_scores=True, + size=limit, + _source=[ + "episode_id", "frame_number", "timestamp", "detected_objects", + "character_appearances", "video_path", "episode_metadata", "scene_info", + ], + ) + + @staticmethod + async def search_perceptual_hash( + es_client: AsyncElasticsearch, + phash: str, + limit: int = 10, + ) -> Dict[str, Any]: + return await es_client.search( + index="ranczo_video_frames", + query={"term": {"perceptual_hash": phash}}, + size=limit, + _source=[ + "episode_id", "frame_number", "timestamp", "video_path", + "episode_metadata", "perceptual_hash", "scene_info", + ], + ) + + @staticmethod + async def list_characters(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + result = await es_client.search( + 
@staticmethod
async def search_episode_name(
    es_client: AsyncElasticsearch,
    query: str,
    season: Optional[int] = None,
    limit: int = 20,
) -> Dict[str, Any]:
    """Fuzzy-search episode titles in the ``ranczo_episode_names`` index.

    Args:
        es_client: Elasticsearch client used to run the search.
        query: Free-text title query.
        season: When given, only episodes of that season are matched.
        limit: Maximum number of hits to return.

    Returns:
        The raw response of ``es_client.search``.
    """
    # "title" is boosted (^2) over "episode_metadata.title".
    title_match = {
        "multi_match": {
            "query": query,
            "fields": ["title^2", "episode_metadata.title"],
            "fuzziness": "AUTO",
        },
    }
    season_terms = [] if season is None else [{"term": {"episode_metadata.season": season}}]
    conditions = [title_match, *season_terms]

    return await es_client.search(
        index="ranczo_episode_names",
        query={"bool": {"must": conditions}},
        size=limit,
        _source=["episode_id", "title", "video_path", "episode_metadata"],
    )
@staticmethod
async def get_stats(es_client: AsyncElasticsearch) -> Dict[str, int]:
    """Return the document count of each search index, keyed by a short label.

    The counts are requested one after another, in the order listed below.
    """
    # (label in the result, index that is counted). Note that the
    # "video_embeddings" label reports the size of ``ranczo_video_frames``.
    labelled_indices = (
        ("segments", "ranczo_segments"),
        ("text_embeddings", "ranczo_text_embeddings"),
        ("video_embeddings", "ranczo_video_frames"),
        ("episode_names", "ranczo_episode_names"),
    )

    counts: Dict[str, int] = {}
    for label, index_name in labelled_indices:
        response = await es_client.count(index=index_name)
        counts[label] = response["count"]
    return counts
def get_text_embedding(self, text: str) -> List[float]:
    """Embed ``text`` as an L2-normalised vector of floats.

    The text is wrapped in a single user chat message, tokenised through the
    processor's chat template and run through the model; the embedding is the
    last hidden layer's state at the final sequence position.
    """
    model, processor, device = self._load_model()

    conversation = [{
        "role": "user",
        "content": [{"type": "text", "text": text}],
    }]

    token_ids = processor.apply_chat_template(
        conversation,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
    )
    token_ids = token_ids.to(device)

    with torch.no_grad():
        outputs = model(input_ids=token_ids, output_hidden_states=True)
        # Last layer, last position; squeeze(0) drops the leading dimension
        # (assumes a batch of one - a single conversation is passed in).
        last_token_state = outputs.hidden_states[-1][:, -1, :].squeeze(0)
        unit_vector = torch.nn.functional.normalize(last_token_state, p=2, dim=0)

    return unit_vector.float().cpu().numpy().tolist()
class ResultFormatter:
    """Render Elasticsearch search responses as human-readable console output."""

    @staticmethod
    def format_timestamp(seconds: float) -> str:
        """Format a duration in seconds as ``"<minutes>m <seconds>s"``.

        The seconds part is shown with one decimal. It is rounded *before*
        being printed so that a value such as 59.96 carries into the minute
        ("1m 0.0s") instead of being displayed as "0m 60.0s".
        """
        minutes = int(seconds // 60)
        secs = round(seconds % 60, 1)
        if secs >= 60:
            # Rounding pushed the remainder up to a full minute.
            minutes += 1
            secs -= 60
        return f"{minutes}m {secs:.1f}s"

    @staticmethod
    def _format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str:
        """Return a ``" [Scene N: start - end]"`` suffix, or ``""`` without scene info."""
        if not scene_info:
            return ""
        start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0))
        end = ResultFormatter.format_timestamp(scene_info.get('scene_end_time', 0))
        return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]"

    @staticmethod
    def _format_characters(appearances: Any) -> str:
        """Join character appearances as ``"name (emotion confidence)"`` entries.

        ``appearances`` is iterated as a sequence of dicts; the emotion part is
        appended only for entries that carry a non-empty ``emotion`` value.
        """
        labels = []
        for char in appearances:
            label = char.get('name', 'Unknown')
            emotion = char.get('emotion')
            if emotion:
                emotion_label = emotion.get('label', '?')
                emotion_conf = emotion.get('confidence', 0)
                label += f" ({emotion_label} {emotion_conf:.2f})"
            labels.append(label)
        return ", ".join(labels)

    @staticmethod
    def print_results(result: Dict[str, Any], result_type: str = "text") -> None:  # pylint: disable=too-many-locals
        """Print every hit of a search response to stdout via ``click.echo``.

        Args:
            result: Search response; the total and the hits are read from it
                using the ``ElasticsearchKeys`` constants.
            result_type: ``"text"``, ``"text_semantic"`` or ``"episode_name"``
                select the matching layout; any other value is printed as a
                video-frame hit.
        """
        total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE]
        hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS]

        click.echo(f"\nZnaleziono: {total} wynikow")
        click.echo("=" * 80)

        for i, hit in enumerate(hits, 1):
            source = hit[ElasticsearchKeys.SOURCE]
            score = hit[ElasticsearchKeys.SCORE]
            meta = source[EpisodeMetadataKeys.EPISODE_METADATA]
            scene_ctx = ResultFormatter._format_scene_context(source.get("scene_info"))

            # A hit may carry no score (None), e.g. for sorted searches that
            # do not track scores; formatting None with ".2f" would raise.
            score_text = f"{score:.2f}" if score is not None else "N/A"
            click.echo(f"\n[{i}] Score: {score_text}")
            # The 02d format already renders season 0 as "S00".
            season_code = f"S{meta['season']:02d}"
            click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}")

            if result_type == "text":
                click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}")
                start_time = ResultFormatter.format_timestamp(source['start_time'])
                end_time = ResultFormatter.format_timestamp(source['end_time'])
                click.echo(f"Time: {start_time} - {end_time}{scene_ctx}")
                click.echo(f"Speaker: {source.get('speaker', 'N/A')}")
                click.echo(f"Text: {source['text']}")
            elif result_type == "text_semantic":
                click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}")
                click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}")
                click.echo(f"Text: {source['text']}")
            elif result_type == "episode_name":
                click.echo(f"Episode Title: {source.get('title', 'N/A')}")
            else:
                timestamp = ResultFormatter.format_timestamp(source['timestamp'])
                click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}")
                if "frame_type" in source:
                    click.echo(f"Type: {source['frame_type']}")
                if "scene_number" in source:
                    click.echo(f"Scene number: {source['scene_number']}")
                if "perceptual_hash" in source:
                    click.echo(f"Hash: {source['perceptual_hash']}")
                if source.get("character_appearances"):
                    characters = ResultFormatter._format_characters(source['character_appearances'])
                    click.echo(f"Characters: {characters}")
                if source.get("detected_objects"):
                    objects_str = ", ".join(f"{obj['class']}:{obj['count']}" for obj in source['detected_objects'])
                    click.echo(f"Objects: {objects_str}")

            click.echo(f"Path: {source['video_path']}")
self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_dir = json_dir / episode_info.season_code() output_file = season_dir / filename output_file.parent.mkdir(parents=True, exist_ok=True) diff --git a/preprocessor/transcription/generators/json_generator.py b/preprocessor/transcription/generators/json_generator.py index b6f5c6c08..c785f5237 100644 --- a/preprocessor/transcription/generators/json_generator.py +++ b/preprocessor/transcription/generators/json_generator.py @@ -30,12 +30,11 @@ def _get_output_filename(self, json_file: Path) -> str: def convert(self, data: Dict[str, Any]) -> Dict[str, Any]: if self.format_type == "full": return self.convert_to_full_format(data) - elif self.format_type == "simple": + if self.format_type == "simple": return self.convert_to_simple_format(data) - elif self.format_type == "segmented": + if self.format_type == "segmented": return self.convert_to_segmented_format(data) - else: - raise ValueError(f"Unknown format type: {self.format_type}") + raise ValueError(f"Unknown format type: {self.format_type}") @staticmethod def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: @@ -92,4 +91,4 @@ def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: "words": convert_words_list(seg_words), }) - return {"segments": result_segments} \ No newline at end of file + return {"segments": result_segments} diff --git a/preprocessor/transcription/generators/multi_format_generator.py b/preprocessor/transcription/generators/multi_format_generator.py index c1552ca00..6e2283079 100644 --- a/preprocessor/transcription/generators/multi_format_generator.py +++ b/preprocessor/transcription/generators/multi_format_generator.py @@ -9,7 +9,7 @@ get_base_output_dir, settings, ) -from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.episodes import EpisodeManager from preprocessor.transcription.generators.json_generator import JsonGenerator from 
preprocessor.transcription.generators.srt_generator import SrtGenerator from preprocessor.transcription.generators.txt_generator import TxtGenerator @@ -49,7 +49,7 @@ def __process_file(self, transcription_file: Path) -> None: self.logger.error(f"Cannot extract episode info from {transcription_file.name}") return - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_code = episode_info.season_code() episode_code = episode_info.episode_num() main_output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename @@ -74,7 +74,7 @@ def __process_file(self, transcription_file: Path) -> None: self.logger.error(f"Error processing file {transcription_file}: {e}") def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="json") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") season_code = episode_info.season_code() episode_code = episode_info.episode_num() output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename @@ -90,7 +90,7 @@ def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: self.logger.info(f"Generated full JSON: {output_file}") def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename( + filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="segmented", @@ -114,7 +114,7 @@ def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: self.logger.info(f"Generated segmented JSON: {output_file}") def __generate_simple_json(self, data: Dict[str, Any], 
episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename( + filename = self.episode_manager.path_manager.build_filename( episode_info, extension="json", suffix="simple", @@ -138,7 +138,7 @@ def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: self.logger.info(f"Generated simple JSON: {output_file}") def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="srt") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="srt") season_code = episode_info.season_code() episode_code = episode_info.episode_num() output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename @@ -154,7 +154,7 @@ def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: self.logger.info(f"Generated SRT: {output_file}") def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.file_naming.build_filename(episode_info, extension="txt") + filename = self.episode_manager.path_manager.build_filename(episode_info, extension="txt") season_code = episode_info.season_code() episode_code = episode_info.episode_num() output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename diff --git a/preprocessor/transcription/processors/episode_info_processor.py b/preprocessor/transcription/processors/episode_info_processor.py index 5beec534f..866044576 100644 --- a/preprocessor/transcription/processors/episode_info_processor.py +++ b/preprocessor/transcription/processors/episode_info_processor.py @@ -6,7 +6,7 @@ Tuple, ) -from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.episodes import EpisodeManager from preprocessor.utils.error_handling_logger import ErrorHandlingLogger @@ -58,7 +58,7 @@ def 
__load_transcription(path: Path) -> Dict[str, Any]: return json.load(f) def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: - new_json_name = self.__episode_manager.file_naming.build_filename(episode_info, extension="json") + new_json_name = self.__episode_manager.path_manager.build_filename(episode_info, extension="json") season_dir = self.__output_path / episode_info.season_code() output_path = season_dir / new_json_name output_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/preprocessor/transcription/processors/sound_separator.py b/preprocessor/transcription/processors/sound_separator.py index 5a31653c4..e958b5dca 100644 --- a/preprocessor/transcription/processors/sound_separator.py +++ b/preprocessor/transcription/processors/sound_separator.py @@ -18,7 +18,7 @@ FILE_EXTENSIONS, FILE_SUFFIXES, ) -from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.episodes import EpisodeManager from preprocessor.utils.constants import ( WordKeys, WordTypeValues, diff --git a/preprocessor/transcription/processors/unicode_fixer.py b/preprocessor/transcription/processors/unicode_fixer.py index c616ee96b..3dfea589b 100644 --- a/preprocessor/transcription/processors/unicode_fixer.py +++ b/preprocessor/transcription/processors/unicode_fixer.py @@ -11,7 +11,7 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.episodes import EpisodeManager from preprocessor.utils.transcription_utils import fix_transcription_file_unicode diff --git a/preprocessor/types/__init__.py b/preprocessor/types/__init__.py index 6d44a654a..b8eb211e8 100644 --- a/preprocessor/types/__init__.py +++ b/preprocessor/types/__init__.py @@ -1,3 +1,9 @@ +from .clip import ClipSegment +from .detection import ( + CharacterDetectionInFrame, + Detection, + ObjectDetectionInFrame, +) from .episode import ( EpisodeInfo, EpisodeMetadata, @@ -11,23 +17,6 @@ SceneTimestampPoint, 
SceneTimestampsData, ) -from .clip import ClipSegment -from .detection import ( - CharacterDetectionInFrame, - Detection, - ObjectDetectionInFrame, -) -from .video import ( - HashResult, - VideoMetadata, -) -from .transcription import ( - BaseSegment, - ElasticsearchSegment, - SegmentWithScore, - SegmentWithTimes, - TranscriptionContext, -) from .search import ( ElasticsearchAggregations, ElasticsearchHit, @@ -37,6 +26,17 @@ SearchSegment, SeasonBucket, ) +from .transcription import ( + BaseSegment, + ElasticsearchSegment, + SegmentWithScore, + SegmentWithTimes, + TranscriptionContext, +) +from .video import ( + HashResult, + VideoMetadata, +) __all__ = [ "EpisodeInfo", @@ -66,4 +66,4 @@ "EpisodeBucket", "SearchSegment", "SeasonBucket", -] \ No newline at end of file +] diff --git a/preprocessor/types/clip.py b/preprocessor/types/clip.py index 4385f79e3..a1dac0191 100644 --- a/preprocessor/types/clip.py +++ b/preprocessor/types/clip.py @@ -4,6 +4,7 @@ Union, ) + class ClipSegment(TypedDict): video_path: Union[str, Any] start_time: float diff --git a/preprocessor/types/detection.py b/preprocessor/types/detection.py index 2250d4b78..2360eafaf 100644 --- a/preprocessor/types/detection.py +++ b/preprocessor/types/detection.py @@ -4,6 +4,7 @@ TypedDict, ) + class CharacterDetectionInFrame(TypedDict): name: str confidence: float diff --git a/preprocessor/types/episode.py b/preprocessor/types/episode.py index 9ef0f3174..8dc88f45c 100644 --- a/preprocessor/types/episode.py +++ b/preprocessor/types/episode.py @@ -4,6 +4,7 @@ Union, ) + class EpisodeInfo(TypedDict): episode_number: int title: str diff --git a/preprocessor/types/frame.py b/preprocessor/types/frame.py index 4009acd38..7d9c59ebe 100644 --- a/preprocessor/types/frame.py +++ b/preprocessor/types/frame.py @@ -3,6 +3,7 @@ TypedDict, ) + class FrameRequest(TypedDict): frame: int time: float diff --git a/preprocessor/types/scene.py b/preprocessor/types/scene.py index 2f92d5dcf..9fd92f181 100644 --- 
a/preprocessor/types/scene.py +++ b/preprocessor/types/scene.py @@ -4,6 +4,7 @@ TypedDict, ) + class SceneDict(TypedDict): scene_number: int start_frame: int diff --git a/preprocessor/types/search.py b/preprocessor/types/search.py index 18c67fe33..2d0f5719c 100644 --- a/preprocessor/types/search.py +++ b/preprocessor/types/search.py @@ -6,8 +6,10 @@ TypedDict, Union, ) + from .transcription import ElasticsearchSegment + class SearchSegment(TypedDict): season: int episode_number: int diff --git a/preprocessor/types/transcription.py b/preprocessor/types/transcription.py index 24a448b5d..222bfb9a0 100644 --- a/preprocessor/types/transcription.py +++ b/preprocessor/types/transcription.py @@ -3,8 +3,10 @@ NotRequired, TypedDict, ) + from .episode import EpisodeMetadata + class BaseSegment(TypedDict): id: int text: str diff --git a/preprocessor/types/video.py b/preprocessor/types/video.py index 000aface2..7d4d620ab 100644 --- a/preprocessor/types/video.py +++ b/preprocessor/types/video.py @@ -1,9 +1,9 @@ from typing import ( - List, NotRequired, TypedDict, ) + class HashResult(TypedDict): frame_number: int timestamp: float diff --git a/preprocessor/utils/batch_processing_utils.py b/preprocessor/utils/batch_processing_utils.py index f4d04d52c..fbc753796 100644 --- a/preprocessor/utils/batch_processing_utils.py +++ b/preprocessor/utils/batch_processing_utils.py @@ -14,11 +14,11 @@ from PIL import Image from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.utils.image_hasher import PerceptualHasher from preprocessor.utils.batch_processor import BatchProcessor from preprocessor.utils.console import console -from preprocessor.video.frame_utils import load_frames_from_requests +from preprocessor.utils.image_hasher import PerceptualHasher from preprocessor.utils.time_utils import format_time_hms +from preprocessor.video.frame_utils import load_frames_from_requests def _prefetch_batches( diff --git a/preprocessor/utils/hash_save_utils.py 
def save_image_hashes_to_json(
    episode_info: EpisodeInfo,
    hash_results: List[Dict[str, Any]],
    series_name: str,
    device: str,
    batch_size: int,
) -> Path:
    """Write an episode's image hashes, with processing metadata, to a JSON file.

    The file is placed in the episode's image-hash directory (created when
    missing) and written through ``atomic_write_json``.

    Returns:
        Path of the written JSON file.
    """
    paths = PathManager(series_name)
    target_dir = paths.get_episode_dir(episode_info, settings.output_subdirs.image_hashes)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Entries without a "perceptual_hash" key are left out of the unique count.
    distinct_hashes = {
        entry.get("perceptual_hash")
        for entry in hash_results
        if "perceptual_hash" in entry
    }
    processing_params = {
        "device": device,
        "batch_size": batch_size,
        "hash_size": 8,
    }
    statistics = {
        "total_hashes": len(hash_results),
        "unique_hashes": len(distinct_hashes),
    }

    payload = create_processing_metadata(
        episode_info=episode_info,
        processing_params=processing_params,
        statistics=statistics,
        results_key="image_hashes",
        results_data=hash_results,
    )

    json_name = paths.build_filename(
        episode_info,
        extension="json",
        suffix="image_hashes",
    )
    destination = target_dir / json_name
    atomic_write_json(destination, payload)

    return destination
diff --git a/preprocessor/validation/episode_stats.py b/preprocessor/validation/episode_stats.py index 62ff86764..a683c4b48 100644 --- a/preprocessor/validation/episode_stats.py +++ b/preprocessor/validation/episode_stats.py @@ -20,8 +20,8 @@ OUTPUT_FILE_NAMES, OUTPUT_FILE_PATTERNS, ) -from preprocessor.core.episode_manager import EpisodeInfo from preprocessor.core.path_manager import PathManager +from preprocessor.episodes import EpisodeInfo from preprocessor.validation.base_result import ValidationStatusMixin from preprocessor.validation.file_validators import ( validate_image_file, diff --git a/preprocessor/validation/validator.py b/preprocessor/validation/validator.py index 4f1984cd0..6f5c04cef 100644 --- a/preprocessor/validation/validator.py +++ b/preprocessor/validation/validator.py @@ -9,8 +9,8 @@ from rich.progress import track from preprocessor.config.config import settings -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.path_manager import PathManager +from preprocessor.episodes import EpisodeManager from preprocessor.utils.file_utils import atomic_write_json from preprocessor.validation.episode_stats import EpisodeStats from preprocessor.validation.report_generator import ReportGenerator diff --git a/preprocessor/video/frame_processor.py b/preprocessor/video/frame_processor.py index b6ac68ee6..24f6b561d 100644 --- a/preprocessor/video/frame_processor.py +++ b/preprocessor/video/frame_processor.py @@ -1,3 +1,4 @@ +import json import logging from pathlib import Path import shutil @@ -5,17 +6,23 @@ Any, Dict, List, + Optional, + Tuple, ) +import cv2 + from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager +from preprocessor.episodes import EpisodeManager from preprocessor.utils.console import console +# pylint: disable=duplicate-code + class FrameProcessor(BaseProcessor): def 
__init__(self, args: Dict[str, Any]): @@ -135,7 +142,8 @@ def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: raise NotImplementedError def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - raise NotImplementedError + expected = self.get_expected_outputs(item) + return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) def needs_ramdisk(self) -> bool: return True @@ -143,5 +151,66 @@ def needs_ramdisk(self) -> bool: def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: raise NotImplementedError + @staticmethod + def _load_frame_files_from_ramdisk(ramdisk_frames_dir: Path) -> List[Path]: + return sorted([ + f for f in ramdisk_frames_dir.glob("*.jpg") + if f.is_file() and "frame_" in f.name + ]) + + def _load_frames_with_warning(self, ramdisk_frames_dir: Path) -> Optional[List[Path]]: + frame_files = self._load_frame_files_from_ramdisk(ramdisk_frames_dir) + if not frame_files: + console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") + return None + return frame_files + + @staticmethod + def _load_detection_file( + detection_dir: Path, + ramdisk_frames_dir: Path, + glob_pattern: str, + ) -> Optional[Dict[str, Any]]: + detection_files = list(detection_dir.glob(glob_pattern)) + detection_file = detection_files[0] if detection_files else None + + if not detection_file or not detection_file.exists(): + console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") + return None + + if not ramdisk_frames_dir.exists(): + console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") + return None + + with open(detection_file, 'r', encoding='utf-8') as f: + return json.load(f) + + @staticmethod + def _load_frame_requests_from_metadata(metadata_file: Path) -> Optional[List[Dict[str, Any]]]: + with open(metadata_file, "r", encoding="utf-8") as f: + metadata = json.load(f) + + frame_requests = 
@staticmethod
def _draw_label_on_bbox(
    img,
    label: str,
    x1: int,
    y1: int,
    color: Tuple[int, int, int],
) -> None:
    """Draw ``label`` on a ``color`` background box near the point ``(x1, y1)``.

    The label is placed 10 px above ``y1``, but never higher than one text
    height from the top of the image. ``img`` is modified in place.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    (text_width, text_height), _ = cv2.getTextSize(label, font, 0.5, 1)

    # Clamp so the label stays inside the image when the box touches the top.
    label_bottom = max(y1 - 10, text_height)
    box_top_left = (x1, label_bottom - text_height - 5)
    box_bottom_right = (x1 + text_width, label_bottom)

    # Thickness -1 asks OpenCV for a filled rectangle.
    cv2.rectangle(img, box_top_left, box_bottom_right, color, -1)
    cv2.putText(img, label, (x1, label_bottom - 2), font, 0.5, (255, 255, 255), 1)
"EmotionDetectionSubProcessor", "FaceClusteringSubProcessor", -] \ No newline at end of file +] diff --git a/preprocessor/video/subprocessors/character_detection_subprocessor.py b/preprocessor/video/subprocessors/character_detection_subprocessor.py index 8412f9e27..fdd508f47 100644 --- a/preprocessor/video/subprocessors/character_detection_subprocessor.py +++ b/preprocessor/video/subprocessors/character_detection_subprocessor.py @@ -1,7 +1,6 @@ import logging from pathlib import Path from typing import ( - Any, Dict, List, Optional, @@ -91,13 +90,10 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: console.print(f"[cyan]Detecting characters in {len(frame_files)} frames[/cyan]") - fps = 25.0 - results = process_frames_for_detection( frame_files, self.face_app, self.character_vectors, self.threshold, - fps=fps, ) - save_character_detections(episode_info, results, fps=fps) + save_character_detections(episode_info, results) diff --git a/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py b/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py index 37fc17770..3f0d0046a 100644 --- a/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py +++ b/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py @@ -1,4 +1,3 @@ -import json import logging from pathlib import Path from typing import ( @@ -9,6 +8,7 @@ Tuple, ) +import cv2 import numpy as np from preprocessor.config.config import settings @@ -33,10 +33,6 @@ def initialize(self) -> None: def cleanup(self) -> None: pass - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - def needs_ramdisk(self) -> bool: return False @@ -59,25 +55,18 @@ def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> expected = self.get_expected_outputs(item) return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) 
- def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals - import cv2 # pylint: disable=import-outside-toplevel - + def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: episode_info = item.metadata["episode_info"] detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - detection_files = list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") - return - if not ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") + detection_data = self._load_detection_file( + detection_dir, + ramdisk_frames_dir, + "*_character_detections.json", + ) + if detection_data is None: return - with open(detection_file, 'r', encoding='utf-8') as f: - detection_data = json.load(f) - frames_with_detections = [f for f in detection_data.get("detections", []) if f.get('characters')] if not frames_with_detections: console.print(f"[yellow]No frames with character detections for {episode_info.episode_code()}[/yellow]") @@ -120,8 +109,6 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # py @staticmethod def __draw_characters_on_frame(img, characters: List[Dict[str, Any]], colors: Dict[str, Tuple[int, int, int]]) -> None: - import cv2 # pylint: disable=import-outside-toplevel - for character in characters: name = character['name'] confidence = character['confidence'] @@ -139,11 +126,7 @@ def __draw_characters_on_frame(img, characters: List[Dict[str, Any]], colors: Di emotion_conf = character["emotion"]["confidence"] label += f" | {emotion_label} {emotion_conf:.2f}" - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 
= max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + FrameSubProcessor._draw_label_on_bbox(img, label, x1, y1, color) @staticmethod def __generate_character_colors(character_names: Set[str]) -> Dict[str, Tuple[int, int, int]]: diff --git a/preprocessor/video/subprocessors/emotion_detection_subprocessor.py b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py index bd5490a40..3076a7042 100644 --- a/preprocessor/video/subprocessors/emotion_detection_subprocessor.py +++ b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py @@ -14,16 +14,15 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console +from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.utils.file_utils import atomic_write_json from preprocessor.video.emotion_utils import ( crop_face_from_frame, detect_emotions_batch, init_emotion_model, ) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json from preprocessor.video.frame_processor import FrameSubProcessor diff --git a/preprocessor/video/subprocessors/face_clustering_subprocessor.py b/preprocessor/video/subprocessors/face_clustering_subprocessor.py index d67c17753..8b3b61299 100644 --- a/preprocessor/video/subprocessors/face_clustering_subprocessor.py +++ b/preprocessor/video/subprocessors/face_clustering_subprocessor.py @@ -22,7 +22,6 @@ OutputSpec, ProcessingItem, ) -from preprocessor.core.episode_manager import EpisodeManager from preprocessor.core.path_manager import PathManager from preprocessor.utils.console import console from preprocessor.utils.error_handling_logger import 
ErrorHandlingLogger @@ -77,22 +76,13 @@ def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: metadata_output = episode_dir / metadata_filename return [OutputSpec(path=metadata_output, required=True)] - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: self.initialize() episode_info = item.metadata["episode_info"] - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - if not frame_files: - console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") + frame_files = self._load_frames_with_warning(ramdisk_frames_dir) + if frame_files is None: return console.print(f"[cyan]Extracting faces and vectors from {len(frame_files)} frames[/cyan]") diff --git a/preprocessor/video/subprocessors/image_hash_subprocessor.py b/preprocessor/video/subprocessors/image_hash_subprocessor.py index 661dcb00a..a0f4ae577 100644 --- a/preprocessor/video/subprocessors/image_hash_subprocessor.py +++ b/preprocessor/video/subprocessors/image_hash_subprocessor.py @@ -2,8 +2,6 @@ import logging from pathlib import Path from typing import ( - Any, - Dict, List, Optional, ) @@ -16,14 +14,14 @@ ProcessingItem, ) from preprocessor.core.path_manager import PathManager -from preprocessor.utils.image_hasher import PerceptualHasher from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches from preprocessor.utils.console import console from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_processing_metadata +from preprocessor.utils.hash_save_utils import save_image_hashes_to_json +from 
preprocessor.utils.image_hasher import PerceptualHasher from preprocessor.video.frame_processor import FrameSubProcessor -import json + +# pylint: disable=duplicate-code class ImageHashSubProcessor(FrameSubProcessor): @@ -42,10 +40,6 @@ def cleanup(self) -> None: self.hasher = None self.__cleanup_memory() - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) @@ -59,57 +53,27 @@ def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: hash_output = episode_dir / hash_filename return [OutputSpec(path=hash_output, required=True)] - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: self.initialize() metadata_file = item.input_path episode_info = item.metadata["episode_info"] - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") + frame_requests = self._load_frame_requests_from_metadata(metadata_file) + if frame_requests is None: return hash_results = compute_hashes_in_batches(ramdisk_frames_dir, frame_requests, self.hasher, self.batch_size) series_name = item.metadata["series_name"] - self.__save_hashes(episode_info, hash_results, series_name) - - def __save_hashes(self, episode_info, hash_results: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = PathManager(episode_info.series_name or 
"unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) - episode_dir.mkdir(parents=True, exist_ok=True) - hash_data = create_processing_metadata( + output_path = save_image_hashes_to_json( episode_info=episode_info, - processing_params={ - "device": self.device, - "batch_size": self.batch_size, - "hash_size": 8, - }, - statistics={ - "total_hashes": len(hash_results), - "unique_hashes": len(set(h.get("perceptual_hash") for h in hash_results if "perceptual_hash" in h)), - }, - results_key="image_hashes", - results_data=hash_results, + hash_results=hash_results, + series_name=series_name, + device=self.device, + batch_size=self.batch_size, ) - - path_manager = PathManager(series_name) - hash_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - atomic_write_json(hash_output, hash_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved hashes to: {hash_output}[/green]") + console.print(f"[green]✓ Saved hashes to: {output_path}[/green]") @staticmethod def __cleanup_memory() -> None: diff --git a/preprocessor/video/subprocessors/object_detection_subprocessor.py b/preprocessor/video/subprocessors/object_detection_subprocessor.py index 03308cf75..d31a87793 100644 --- a/preprocessor/video/subprocessors/object_detection_subprocessor.py +++ b/preprocessor/video/subprocessors/object_detection_subprocessor.py @@ -8,8 +8,8 @@ Optional, ) -import torch from PIL import Image +import torch from preprocessor.config.config import settings from preprocessor.core.base_processor import ( @@ -73,24 +73,13 @@ def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: detections_output = episode_dir / detections_filename return [OutputSpec(path=detections_output, required=True)] - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in 
str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals + def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: self.initialize() - from PIL import Image # pylint: disable=import-outside-toplevel - episode_info = item.metadata["episode_info"] - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - if not frame_files: - console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") + frame_files = self._load_frames_with_warning(ramdisk_frames_dir) + if frame_files is None: return console.print(f"[cyan]Detecting objects in {len(frame_files)} frames[/cyan]") @@ -204,4 +193,4 @@ def __save_detections(self, episode_info, detections_data: Dict[str, Any], serie def __cleanup_memory() -> None: gc.collect() if torch.cuda.is_available(): - torch.cuda.empty_cache() \ No newline at end of file + torch.cuda.empty_cache() diff --git a/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py b/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py index 4e6ed5ed6..60c1b0ebd 100644 --- a/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py +++ b/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py @@ -1,4 +1,3 @@ -import json import logging from pathlib import Path from typing import ( @@ -8,6 +7,7 @@ Tuple, ) +import cv2 import numpy as np from preprocessor.config.config import settings @@ -32,10 +32,6 @@ def initialize(self) -> None: def cleanup(self) -> None: pass - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - def needs_ramdisk(self) -> bool: return False @@ -59,24 +55,17 @@ def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> return any(str(exp.path) in str(miss.path) for exp in expected for miss 
in missing_outputs) def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - import cv2 # pylint: disable=import-outside-toplevel - episode_info = item.metadata["episode_info"] detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") - return - if not ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") + detection_data = self._load_detection_file( + detection_dir, + ramdisk_frames_dir, + "*_object_detections.json", + ) + if detection_data is None: return - with open(detection_file, 'r', encoding='utf-8') as f: - detection_data = json.load(f) - frames_with_detections = [f for f in detection_data.get("detections", []) if f['detection_count'] > 0] if not frames_with_detections: console.print(f"[yellow]No frames with detections for {episode_info.episode_code()}[/yellow]") @@ -111,8 +100,6 @@ def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: @staticmethod def __draw_detections_on_frame(img, detections: List[Dict[str, Any]], colors: Dict[int, Tuple[int, int, int]], conf_threshold: float) -> None: - import cv2 # pylint: disable=import-outside-toplevel - for detection in detections: if detection['confidence'] < conf_threshold: continue @@ -126,11 +113,7 @@ def __draw_detections_on_frame(img, detections: List[Dict[str, Any]], colors: Di cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) label = f"{detection['class_name']} {detection['confidence']:.2f}" - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 = max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - 
label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) + FrameSubProcessor._draw_label_on_bbox(img, label, x1, y1, color) @staticmethod def __generate_colors(num_colors: int = 80) -> Dict[int, Tuple[int, int, int]]: diff --git a/preprocessor/video/subprocessors/video_embedding_subprocessor.py b/preprocessor/video/subprocessors/video_embedding_subprocessor.py index 0267cb500..b6463a732 100644 --- a/preprocessor/video/subprocessors/video_embedding_subprocessor.py +++ b/preprocessor/video/subprocessors/video_embedding_subprocessor.py @@ -1,5 +1,4 @@ import gc -import json import logging from pathlib import Path from typing import ( @@ -26,6 +25,8 @@ from preprocessor.utils.metadata_utils import create_processing_metadata from preprocessor.video.frame_processor import FrameSubProcessor +# pylint: disable=duplicate-code + class VideoEmbeddingSubProcessor(FrameSubProcessor): def __init__(self, device: str, batch_size: int, model_name: str, model_revision: str): @@ -60,10 +61,6 @@ def cleanup(self) -> None: self.gpu_processor = None self.__cleanup_memory() - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: episode_info = item.metadata["episode_info"] episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) @@ -77,22 +74,14 @@ def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: video_output = episode_dir / video_filename return [OutputSpec(path=video_output, required=True)] - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> 
None: self.initialize() metadata_file = item.input_path episode_info = item.metadata["episode_info"] - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") + frame_requests = self._load_frame_requests_from_metadata(metadata_file) + if frame_requests is None: return episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) From e622921c9920c8573eff01f87de357bf18438c4b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 10 Feb 2026 21:38:10 +0100 Subject: [PATCH 06/89] Restructure preprocessor and add ES mappings Large refactor: reorganize the preprocessor package into new app, lib and modules subpackages (many files moved/renamed and legacy CLI/processor files removed). Add a pipeline system (Pipeline, StepBuilder, pipeline_factory, pipeline_builder, config_defaults) under preprocessor.app to build/visualize processing pipelines. Introduce many Elasticsearch index mapping constants in ElasticSearchManager (segments, text/video embeddings, episode names, full-episode embeddings, sound events and embeddings). Update usages/imports: reindex_service now imports bot.search.elastic_search_manager.ElasticSearchManager and adjusts the connect call, bot/types imports from preprocessor.config.types, and preprocessor.__main__ uses the relocated console. Overall this commit prepares the codebase for modular pipeline execution and richer ES indexing schemas. 
--- bot/search/elastic_search_manager.py | 122 +++ bot/services/reindex/reindex_service.py | 6 +- bot/types.py | 2 +- preprocessor/__main__.py | 7 +- preprocessor/app/__init__.py | 6 + preprocessor/app/config_defaults.py | 68 ++ preprocessor/app/pipeline.py | 153 +++ preprocessor/app/pipeline_builder.py | 76 ++ preprocessor/app/pipeline_factory.py | 317 ++++++ preprocessor/app/step_builder.py | 73 ++ preprocessor/characters/face_detection.py | 183 ---- .../characters/google_image_search.py | 41 - .../characters/reference_downloader.py | 301 ------ preprocessor/cli/__init__.py | 73 +- preprocessor/cli/__main__.py | 2 +- preprocessor/cli/cli_main.py | 108 ++ preprocessor/cli/commands/__init__.py | 41 - preprocessor/cli/commands/analyze_text.py | 59 -- preprocessor/cli/commands/detect_scenes.py | 49 - preprocessor/cli/commands/export_frames.py | 72 -- preprocessor/cli/commands/fix_unicode.py | 48 - .../cli/commands/generate_archives.py | 86 -- .../commands/generate_elastic_documents.py | 78 -- .../cli/commands/generate_embeddings.py | 146 --- preprocessor/cli/commands/image_hashing.py | 68 -- .../cli/commands/import_transcriptions.py | 69 -- preprocessor/cli/commands/index.py | 35 - .../commands/process_character_references.py | 65 -- preprocessor/cli/commands/run_all.py | 304 ------ preprocessor/cli/commands/scrape_episodes.py | 65 -- preprocessor/cli/commands/search.py | 183 ---- preprocessor/cli/commands/separate_sounds.py | 48 - preprocessor/cli/commands/transcode.py | 99 -- preprocessor/cli/commands/transcribe.py | 82 -- .../cli/commands/transcribe_elevenlabs.py | 84 -- preprocessor/cli/commands/validate.py | 48 - preprocessor/cli/helpers.py | 70 +- preprocessor/cli/options/common.py | 35 - preprocessor/cli/pipeline/orchestrator.py | 160 --- preprocessor/cli/pipeline/steps.py | 608 ------------ preprocessor/config/config.py | 303 ++---- preprocessor/config/constants.py | 66 ++ preprocessor/config/enums.py | 30 + preprocessor/{ => config}/prompts/__init__.py | 20 
+- preprocessor/config/prompts/common_schemas.py | 14 + .../prompts/extract_all_seasons_system.py | 64 ++ .../prompts/extract_all_seasons_user.py | 7 + .../prompts/extract_characters_system.py | 120 +++ .../config/prompts/extract_characters_user.py | 15 + .../extract_episode_metadata_system.py | 17 + .../prompts/extract_episode_metadata_user.py | 2 + .../config/prompts/extract_season_system.py | 31 + .../config/prompts/extract_season_user.py | 2 + .../prompts/merge_episode_data_system.py | 14 + .../config/prompts/merge_episode_data_user.py | 2 + preprocessor/config/step_configs.py | 135 +++ preprocessor/config/types/__init__.py | 81 ++ preprocessor/{ => config}/types/clip.py | 0 preprocessor/{ => config}/types/detection.py | 2 - preprocessor/{ => config}/types/episode.py | 4 - preprocessor/{ => config}/types/frame.py | 0 preprocessor/config/types/keys.py | 178 ++++ preprocessor/{ => config}/types/scene.py | 3 - preprocessor/{ => config}/types/search.py | 6 - .../{ => config}/types/transcription.py | 4 - preprocessor/{ => config}/types/video.py | 1 - preprocessor/core/artifacts.py | 110 +++ preprocessor/core/base_processor.py | 255 ++--- preprocessor/core/base_step.py | 39 + preprocessor/core/constants.py | 50 - preprocessor/core/context.py | 78 ++ preprocessor/core/enums.py | 35 - preprocessor/core/path_manager.py | 91 +- preprocessor/core/processing_metadata.py | 88 +- preprocessor/core/processor_factory.py | 64 -- preprocessor/core/processor_registry.py | 48 - preprocessor/core/state_manager.py | 100 +- preprocessor/core/video_processor.py | 47 - .../embeddings/episode_name_embedder.py | 166 ---- .../embeddings/gpu_batch_processor.py | 137 --- preprocessor/embeddings/qwen3_vl_embedding.py | 112 --- .../embeddings/strategies/__init__.py | 7 - .../embeddings/strategies/strategy_factory.py | 14 - preprocessor/episodes/__init__.py | 8 - preprocessor/episodes/episode_file_finder.py | 107 -- preprocessor/episodes/episode_manager.py | 170 ---- 
preprocessor/episodes/episode_parser.py | 29 - preprocessor/{characters => lib}/__init__.py | 0 preprocessor/lib/ai/__init__.py | 8 + .../{config => lib/ai}/llm_provider.py | 147 ++- preprocessor/lib/characters/__init__.py | 9 + preprocessor/lib/characters/face_detection.py | 143 +++ .../lib/characters/image_search/__init__.py | 5 + .../image_search}/duckduckgo_image_search.py | 5 +- .../image_search/google_image_search.py | 30 + .../characters/image_search}/image_search.py | 3 +- preprocessor/lib/characters/models.py | 18 + .../lib/characters/reference_downloader.py | 283 ++++++ preprocessor/lib/core/__init__.py | 7 + .../core/logging.py} | 35 +- preprocessor/lib/core/time.py | 20 + preprocessor/lib/episodes/__init__.py | 6 + preprocessor/lib/episodes/episode_manager.py | 204 ++++ preprocessor/lib/io/__init__.py | 9 + preprocessor/lib/io/detection_io.py | 30 + preprocessor/lib/io/files.py | 48 + preprocessor/lib/io/hashing.py | 48 + preprocessor/lib/io/metadata.py | 51 + preprocessor/lib/media/__init__.py | 9 + preprocessor/lib/media/ffmpeg.py | 152 +++ .../{utils => lib/media}/resolution.py | 19 +- preprocessor/lib/media/scene_detection.py | 119 +++ .../{cli/options => lib/scraping}/__init__.py | 0 preprocessor/lib/scraping/clipboard.py | 30 + preprocessor/lib/scraping/crawl4ai.py | 54 + preprocessor/lib/search/__init__.py | 8 + preprocessor/lib/search/elasticsearch.py | 61 ++ preprocessor/lib/search/embedding_model.py | 23 + preprocessor/lib/text/__init__.py | 8 + preprocessor/lib/text/language_config.py | 28 + preprocessor/lib/text/text_statistics.py | 159 +++ preprocessor/lib/transcription/__init__.py | 25 + .../{ => lib}/transcription/elevenlabs.py | 172 ++-- .../transcription/engines}/__init__.py | 0 .../transcription/engines/base_engine.py | 5 +- .../engines/elevenlabs_engine.py | 117 +-- .../transcription/engines/whisper_engine.py | 49 + .../transcription/generators}/__init__.py | 0 .../generators/base_generator.py | 23 +- .../generators/json_generator.py 
| 74 ++ .../generators/multi_format_generator.py | 156 +++ .../transcription/generators/srt_generator.py | 44 + .../transcription/generators/txt_generator.py | 17 +- .../lib/transcription/processors/__init__.py | 7 + .../processors/audio_normalizer.py | 67 +- .../processors/episode_info_processor.py | 55 +- .../processors/normalized_audio_processor.py | 61 +- .../processors/sound_separator.py | 267 +++++ .../transcription/processors/unicode_fixer.py | 35 +- .../lib/transcription/sound_classification.py | 37 + preprocessor/lib/transcription/utils.py | 95 ++ preprocessor/lib/transcription/whisper.py | 61 ++ preprocessor/lib/ui/__init__.py | 10 + preprocessor/{utils => lib/ui}/console.py | 41 +- .../ui/progress.py} | 50 +- preprocessor/lib/validation/__init__.py | 3 + .../{ => lib}/validation/base_result.py | 14 +- .../lib/validation/file_validators.py | 173 ++++ preprocessor/lib/video/__init__.py | 9 + preprocessor/lib/video/emotion_utils.py | 112 +++ preprocessor/lib/video/frame_utils.py | 32 + preprocessor/lib/video/image_hasher.py | 36 + .../{processors => modules}/__init__.py | 0 .../{scraping => modules/audio}/__init__.py | 0 preprocessor/modules/audio/extraction.py | 62 ++ preprocessor/modules/audio/separation.py | 244 +++++ preprocessor/modules/packaging/__init__.py | 3 + preprocessor/modules/packaging/archives.py | 28 + preprocessor/modules/scraping/__init__.py | 6 + preprocessor/modules/scraping/base_scraper.py | 91 ++ .../modules/scraping/character_scraper.py | 25 + .../scraping/character_scraper_step.py | 55 ++ .../modules/scraping/episode_scraper.py | 86 ++ .../modules/scraping/episode_scraper_step.py | 56 ++ .../scraping}/reference_processor.py | 479 +++------ .../scraping/reference_processor_step.py | 63 ++ .../engines => modules/search}/__init__.py | 0 .../modules/search/clients/__init__.py | 6 + .../search/clients/elasticsearch_queries.py | 411 ++++++++ .../search/clients}/embedding_service.py | 60 +- .../search/clients}/hash_service.py | 17 +- 
.../search/clients/result_formatters.py | 102 ++ .../modules/search/document_generation.py | 88 ++ preprocessor/modules/search/indexing.py | 116 +++ preprocessor/modules/text/__init__.py | 6 + preprocessor/modules/text/analysis.py | 50 + preprocessor/modules/text/embeddings.py | 165 ++++ preprocessor/modules/text/import_step.py | 154 +++ preprocessor/modules/text/transcription.py | 90 ++ preprocessor/modules/validation/__init__.py | 7 + .../modules/validation/episode_stats.py | 471 +++++++++ .../modules/validation/global_validator.py | 90 ++ .../validation/report_generator.py | 22 +- .../validation/season_comparator.py | 57 +- preprocessor/modules/validation/validator.py | 123 +++ .../generators => modules/video}/__init__.py | 0 preprocessor/modules/video/frame_export.py | 221 +++++ preprocessor/modules/video/scene_detection.py | 79 ++ .../modules/video/strategies/__init__.py | 4 + .../video}/strategies/base_strategy.py | 7 +- .../strategies/scene_changes_strategy.py | 47 +- .../video/strategies/strategy_factory.py | 12 + preprocessor/modules/video/transcoding.py | 104 ++ preprocessor/modules/vision/__init__.py | 8 + .../modules/vision/character_detection.py | 118 +++ preprocessor/modules/vision/embeddings.py | 129 +++ .../modules/vision/emotion_detection.py | 28 + .../modules/vision/face_clustering.py | 28 + preprocessor/modules/vision/image_hashing.py | 113 +++ .../modules/vision/object_detection.py | 28 + preprocessor/processors/archive_generator.py | 182 ---- preprocessor/processors/character_detector.py | 118 --- .../processors/elastic_document_generator.py | 929 ------------------ .../processors/elasticsearch_indexer.py | 291 ------ .../processors/elasticsearch_manager.py | 379 ------- .../processors/embedding_generator.py | 827 ---------------- preprocessor/processors/frame_exporter.py | 285 ------ .../processors/image_hash_processor.py | 137 --- preprocessor/processors/scene_detector.py | 217 ---- preprocessor/processors/text_analyzer.py | 145 --- 
.../processors/transcription_generator.py | 241 ----- .../processors/transcription_importer.py | 238 ----- preprocessor/processors/video_transcoder.py | 268 ----- .../prompts/extract_all_seasons_system.py | 54 - .../prompts/extract_all_seasons_user.py | 7 - .../prompts/extract_characters_system.py | 120 --- .../prompts/extract_characters_user.py | 14 - .../extract_episode_metadata_system.py | 21 - .../prompts/extract_episode_metadata_user.py | 7 - preprocessor/prompts/extract_season_system.py | 26 - preprocessor/prompts/extract_season_user.py | 7 - .../prompts/merge_episode_data_system.py | 19 - .../prompts/merge_episode_data_user.py | 6 - preprocessor/scraping/base_scraper.py | 112 --- preprocessor/scraping/character_scraper.py | 36 - preprocessor/scraping/clipboard.py | 37 - preprocessor/scraping/crawl4ai.py | 64 -- preprocessor/scraping/episode_scraper.py | 87 -- preprocessor/search/__init__.py | 11 - preprocessor/search/elasticsearch_queries.py | 467 --------- preprocessor/search/result_formatters.py | 85 -- preprocessor/text_analysis/__init__.py | 3 - preprocessor/text_analysis/text_statistics.py | 207 ---- preprocessor/transcription/__init__.py | 4 - .../transcription/engines/whisper_engine.py | 73 -- .../generators/json_generator.py | 94 -- .../generators/multi_format_generator.py | 170 ---- .../transcription/generators/srt_generator.py | 50 - .../transcription/processors/__init__.py | 0 .../processors/sound_separator.py | 391 -------- preprocessor/transcription/whisper_utils.py | 58 -- preprocessor/types/__init__.py | 69 -- preprocessor/utils/__init__.py | 0 preprocessor/utils/batch_processing_utils.py | 221 ----- preprocessor/utils/batch_processor.py | 24 - preprocessor/utils/constants.py | 200 ---- preprocessor/utils/detection_io.py | 92 -- preprocessor/utils/file_utils.py | 18 - preprocessor/utils/hash_save_utils.py | 50 - preprocessor/utils/image_hash_utils.py | 55 -- preprocessor/utils/image_hasher.py | 76 -- preprocessor/utils/metadata_utils.py | 29 - 
preprocessor/utils/resource_scope.py | 19 - preprocessor/utils/time_utils.py | 17 - preprocessor/utils/transcription_utils.py | 57 -- preprocessor/utils/video_utils.py | 29 - preprocessor/validation/__init__.py | 0 preprocessor/validation/episode_stats.py | 511 ---------- preprocessor/validation/file_validators.py | 178 ---- preprocessor/validation/global_validator.py | 117 --- preprocessor/validation/validator.py | 152 --- preprocessor/video/__init__.py | 0 preprocessor/video/emotion_utils.py | 127 --- preprocessor/video/frame_processor.py | 216 ---- preprocessor/video/frame_utils.py | 40 - preprocessor/video/subprocessors/__init__.py | 19 - .../character_detection_subprocessor.py | 99 -- ...er_detection_visualization_subprocessor.py | 138 --- .../emotion_detection_subprocessor.py | 167 ---- .../face_clustering_subprocessor.py | 277 ------ .../subprocessors/image_hash_subprocessor.py | 82 -- .../object_detection_subprocessor.py | 196 ---- ...ct_detection_visualization_subprocessor.py | 124 --- .../video_embedding_subprocessor.py | 144 --- 274 files changed, 9115 insertions(+), 15716 deletions(-) create mode 100644 preprocessor/app/__init__.py create mode 100644 preprocessor/app/config_defaults.py create mode 100644 preprocessor/app/pipeline.py create mode 100644 preprocessor/app/pipeline_builder.py create mode 100644 preprocessor/app/pipeline_factory.py create mode 100644 preprocessor/app/step_builder.py delete mode 100644 preprocessor/characters/face_detection.py delete mode 100644 preprocessor/characters/google_image_search.py delete mode 100644 preprocessor/characters/reference_downloader.py create mode 100644 preprocessor/cli/cli_main.py delete mode 100644 preprocessor/cli/commands/__init__.py delete mode 100644 preprocessor/cli/commands/analyze_text.py delete mode 100644 preprocessor/cli/commands/detect_scenes.py delete mode 100644 preprocessor/cli/commands/export_frames.py delete mode 100644 preprocessor/cli/commands/fix_unicode.py delete mode 100644 
preprocessor/cli/commands/generate_archives.py delete mode 100644 preprocessor/cli/commands/generate_elastic_documents.py delete mode 100644 preprocessor/cli/commands/generate_embeddings.py delete mode 100644 preprocessor/cli/commands/image_hashing.py delete mode 100644 preprocessor/cli/commands/import_transcriptions.py delete mode 100644 preprocessor/cli/commands/index.py delete mode 100644 preprocessor/cli/commands/process_character_references.py delete mode 100644 preprocessor/cli/commands/run_all.py delete mode 100644 preprocessor/cli/commands/scrape_episodes.py delete mode 100644 preprocessor/cli/commands/search.py delete mode 100644 preprocessor/cli/commands/separate_sounds.py delete mode 100644 preprocessor/cli/commands/transcode.py delete mode 100644 preprocessor/cli/commands/transcribe.py delete mode 100644 preprocessor/cli/commands/transcribe_elevenlabs.py delete mode 100644 preprocessor/cli/commands/validate.py delete mode 100644 preprocessor/cli/options/common.py delete mode 100644 preprocessor/cli/pipeline/orchestrator.py delete mode 100644 preprocessor/cli/pipeline/steps.py create mode 100644 preprocessor/config/constants.py create mode 100644 preprocessor/config/enums.py rename preprocessor/{ => config}/prompts/__init__.py (50%) create mode 100644 preprocessor/config/prompts/common_schemas.py create mode 100644 preprocessor/config/prompts/extract_all_seasons_system.py create mode 100644 preprocessor/config/prompts/extract_all_seasons_user.py create mode 100644 preprocessor/config/prompts/extract_characters_system.py create mode 100644 preprocessor/config/prompts/extract_characters_user.py create mode 100644 preprocessor/config/prompts/extract_episode_metadata_system.py create mode 100644 preprocessor/config/prompts/extract_episode_metadata_user.py create mode 100644 preprocessor/config/prompts/extract_season_system.py create mode 100644 preprocessor/config/prompts/extract_season_user.py create mode 100644 
preprocessor/config/prompts/merge_episode_data_system.py create mode 100644 preprocessor/config/prompts/merge_episode_data_user.py create mode 100644 preprocessor/config/step_configs.py create mode 100644 preprocessor/config/types/__init__.py rename preprocessor/{ => config}/types/clip.py (100%) rename preprocessor/{ => config}/types/detection.py (99%) rename preprocessor/{ => config}/types/episode.py (99%) rename preprocessor/{ => config}/types/frame.py (100%) create mode 100644 preprocessor/config/types/keys.py rename preprocessor/{ => config}/types/scene.py (99%) rename preprocessor/{ => config}/types/search.py (99%) rename preprocessor/{ => config}/types/transcription.py (99%) rename preprocessor/{ => config}/types/video.py (99%) create mode 100644 preprocessor/core/artifacts.py create mode 100644 preprocessor/core/base_step.py delete mode 100644 preprocessor/core/constants.py create mode 100644 preprocessor/core/context.py delete mode 100644 preprocessor/core/enums.py delete mode 100644 preprocessor/core/processor_factory.py delete mode 100644 preprocessor/core/processor_registry.py delete mode 100644 preprocessor/core/video_processor.py delete mode 100644 preprocessor/embeddings/episode_name_embedder.py delete mode 100644 preprocessor/embeddings/gpu_batch_processor.py delete mode 100644 preprocessor/embeddings/qwen3_vl_embedding.py delete mode 100644 preprocessor/embeddings/strategies/__init__.py delete mode 100644 preprocessor/embeddings/strategies/strategy_factory.py delete mode 100644 preprocessor/episodes/__init__.py delete mode 100644 preprocessor/episodes/episode_file_finder.py delete mode 100644 preprocessor/episodes/episode_manager.py delete mode 100644 preprocessor/episodes/episode_parser.py rename preprocessor/{characters => lib}/__init__.py (100%) create mode 100644 preprocessor/lib/ai/__init__.py rename preprocessor/{config => lib/ai}/llm_provider.py (67%) create mode 100644 preprocessor/lib/characters/__init__.py create mode 100644 
preprocessor/lib/characters/face_detection.py create mode 100644 preprocessor/lib/characters/image_search/__init__.py rename preprocessor/{characters => lib/characters/image_search}/duckduckgo_image_search.py (76%) create mode 100644 preprocessor/lib/characters/image_search/google_image_search.py rename preprocessor/{characters => lib/characters/image_search}/image_search.py (87%) create mode 100644 preprocessor/lib/characters/models.py create mode 100644 preprocessor/lib/characters/reference_downloader.py create mode 100644 preprocessor/lib/core/__init__.py rename preprocessor/{utils/error_handling_logger.py => lib/core/logging.py} (75%) create mode 100644 preprocessor/lib/core/time.py create mode 100644 preprocessor/lib/episodes/__init__.py create mode 100644 preprocessor/lib/episodes/episode_manager.py create mode 100644 preprocessor/lib/io/__init__.py create mode 100644 preprocessor/lib/io/detection_io.py create mode 100644 preprocessor/lib/io/files.py create mode 100644 preprocessor/lib/io/hashing.py create mode 100644 preprocessor/lib/io/metadata.py create mode 100644 preprocessor/lib/media/__init__.py create mode 100644 preprocessor/lib/media/ffmpeg.py rename preprocessor/{utils => lib/media}/resolution.py (70%) create mode 100644 preprocessor/lib/media/scene_detection.py rename preprocessor/{cli/options => lib/scraping}/__init__.py (100%) create mode 100644 preprocessor/lib/scraping/clipboard.py create mode 100644 preprocessor/lib/scraping/crawl4ai.py create mode 100644 preprocessor/lib/search/__init__.py create mode 100644 preprocessor/lib/search/elasticsearch.py create mode 100644 preprocessor/lib/search/embedding_model.py create mode 100644 preprocessor/lib/text/__init__.py create mode 100644 preprocessor/lib/text/language_config.py create mode 100644 preprocessor/lib/text/text_statistics.py create mode 100644 preprocessor/lib/transcription/__init__.py rename preprocessor/{ => lib}/transcription/elevenlabs.py (52%) rename preprocessor/{cli/pipeline => 
lib/transcription/engines}/__init__.py (100%) rename preprocessor/{ => lib}/transcription/engines/base_engine.py (91%) rename preprocessor/{ => lib}/transcription/engines/elevenlabs_engine.py (51%) create mode 100644 preprocessor/lib/transcription/engines/whisper_engine.py rename preprocessor/{embeddings => lib/transcription/generators}/__init__.py (100%) rename preprocessor/{ => lib}/transcription/generators/base_generator.py (61%) create mode 100644 preprocessor/lib/transcription/generators/json_generator.py create mode 100644 preprocessor/lib/transcription/generators/multi_format_generator.py create mode 100644 preprocessor/lib/transcription/generators/srt_generator.py rename preprocessor/{ => lib}/transcription/generators/txt_generator.py (54%) create mode 100644 preprocessor/lib/transcription/processors/__init__.py rename preprocessor/{ => lib}/transcription/processors/audio_normalizer.py (54%) rename preprocessor/{ => lib}/transcription/processors/episode_info_processor.py (56%) rename preprocessor/{ => lib}/transcription/processors/normalized_audio_processor.py (59%) create mode 100644 preprocessor/lib/transcription/processors/sound_separator.py rename preprocessor/{ => lib}/transcription/processors/unicode_fixer.py (60%) create mode 100644 preprocessor/lib/transcription/sound_classification.py create mode 100644 preprocessor/lib/transcription/utils.py create mode 100644 preprocessor/lib/transcription/whisper.py create mode 100644 preprocessor/lib/ui/__init__.py rename preprocessor/{utils => lib/ui}/console.py (72%) rename preprocessor/{utils/progress_tracker.py => lib/ui/progress.py} (53%) create mode 100644 preprocessor/lib/validation/__init__.py rename preprocessor/{ => lib}/validation/base_result.py (68%) create mode 100644 preprocessor/lib/validation/file_validators.py create mode 100644 preprocessor/lib/video/__init__.py create mode 100644 preprocessor/lib/video/emotion_utils.py create mode 100644 preprocessor/lib/video/frame_utils.py create mode 
100644 preprocessor/lib/video/image_hasher.py rename preprocessor/{processors => modules}/__init__.py (100%) rename preprocessor/{scraping => modules/audio}/__init__.py (100%) create mode 100644 preprocessor/modules/audio/extraction.py create mode 100644 preprocessor/modules/audio/separation.py create mode 100644 preprocessor/modules/packaging/__init__.py create mode 100644 preprocessor/modules/packaging/archives.py create mode 100644 preprocessor/modules/scraping/__init__.py create mode 100644 preprocessor/modules/scraping/base_scraper.py create mode 100644 preprocessor/modules/scraping/character_scraper.py create mode 100644 preprocessor/modules/scraping/character_scraper_step.py create mode 100644 preprocessor/modules/scraping/episode_scraper.py create mode 100644 preprocessor/modules/scraping/episode_scraper_step.py rename preprocessor/{characters => modules/scraping}/reference_processor.py (61%) create mode 100644 preprocessor/modules/scraping/reference_processor_step.py rename preprocessor/{transcription/engines => modules/search}/__init__.py (100%) create mode 100644 preprocessor/modules/search/clients/__init__.py create mode 100644 preprocessor/modules/search/clients/elasticsearch_queries.py rename preprocessor/{search => modules/search/clients}/embedding_service.py (62%) rename preprocessor/{search => modules/search/clients}/hash_service.py (60%) create mode 100644 preprocessor/modules/search/clients/result_formatters.py create mode 100644 preprocessor/modules/search/document_generation.py create mode 100644 preprocessor/modules/search/indexing.py create mode 100644 preprocessor/modules/text/__init__.py create mode 100644 preprocessor/modules/text/analysis.py create mode 100644 preprocessor/modules/text/embeddings.py create mode 100644 preprocessor/modules/text/import_step.py create mode 100644 preprocessor/modules/text/transcription.py create mode 100644 preprocessor/modules/validation/__init__.py create mode 100644 
preprocessor/modules/validation/episode_stats.py create mode 100644 preprocessor/modules/validation/global_validator.py rename preprocessor/{ => modules}/validation/report_generator.py (58%) rename preprocessor/{ => modules}/validation/season_comparator.py (71%) create mode 100644 preprocessor/modules/validation/validator.py rename preprocessor/{transcription/generators => modules/video}/__init__.py (100%) create mode 100644 preprocessor/modules/video/frame_export.py create mode 100644 preprocessor/modules/video/scene_detection.py create mode 100644 preprocessor/modules/video/strategies/__init__.py rename preprocessor/{embeddings => modules/video}/strategies/base_strategy.py (59%) rename preprocessor/{embeddings => modules/video}/strategies/scene_changes_strategy.py (56%) create mode 100644 preprocessor/modules/video/strategies/strategy_factory.py create mode 100644 preprocessor/modules/video/transcoding.py create mode 100644 preprocessor/modules/vision/__init__.py create mode 100644 preprocessor/modules/vision/character_detection.py create mode 100644 preprocessor/modules/vision/embeddings.py create mode 100644 preprocessor/modules/vision/emotion_detection.py create mode 100644 preprocessor/modules/vision/face_clustering.py create mode 100644 preprocessor/modules/vision/image_hashing.py create mode 100644 preprocessor/modules/vision/object_detection.py delete mode 100644 preprocessor/processors/archive_generator.py delete mode 100644 preprocessor/processors/character_detector.py delete mode 100644 preprocessor/processors/elastic_document_generator.py delete mode 100644 preprocessor/processors/elasticsearch_indexer.py delete mode 100644 preprocessor/processors/elasticsearch_manager.py delete mode 100644 preprocessor/processors/embedding_generator.py delete mode 100644 preprocessor/processors/frame_exporter.py delete mode 100644 preprocessor/processors/image_hash_processor.py delete mode 100644 preprocessor/processors/scene_detector.py delete mode 100644 
preprocessor/processors/text_analyzer.py delete mode 100644 preprocessor/processors/transcription_generator.py delete mode 100644 preprocessor/processors/transcription_importer.py delete mode 100644 preprocessor/processors/video_transcoder.py delete mode 100644 preprocessor/prompts/extract_all_seasons_system.py delete mode 100644 preprocessor/prompts/extract_all_seasons_user.py delete mode 100644 preprocessor/prompts/extract_characters_system.py delete mode 100644 preprocessor/prompts/extract_characters_user.py delete mode 100644 preprocessor/prompts/extract_episode_metadata_system.py delete mode 100644 preprocessor/prompts/extract_episode_metadata_user.py delete mode 100644 preprocessor/prompts/extract_season_system.py delete mode 100644 preprocessor/prompts/extract_season_user.py delete mode 100644 preprocessor/prompts/merge_episode_data_system.py delete mode 100644 preprocessor/prompts/merge_episode_data_user.py delete mode 100644 preprocessor/scraping/base_scraper.py delete mode 100644 preprocessor/scraping/character_scraper.py delete mode 100644 preprocessor/scraping/clipboard.py delete mode 100644 preprocessor/scraping/crawl4ai.py delete mode 100644 preprocessor/scraping/episode_scraper.py delete mode 100644 preprocessor/search/__init__.py delete mode 100644 preprocessor/search/elasticsearch_queries.py delete mode 100644 preprocessor/search/result_formatters.py delete mode 100644 preprocessor/text_analysis/__init__.py delete mode 100644 preprocessor/text_analysis/text_statistics.py delete mode 100644 preprocessor/transcription/__init__.py delete mode 100644 preprocessor/transcription/engines/whisper_engine.py delete mode 100644 preprocessor/transcription/generators/json_generator.py delete mode 100644 preprocessor/transcription/generators/multi_format_generator.py delete mode 100644 preprocessor/transcription/generators/srt_generator.py delete mode 100644 preprocessor/transcription/processors/__init__.py delete mode 100644 
preprocessor/transcription/processors/sound_separator.py delete mode 100644 preprocessor/transcription/whisper_utils.py delete mode 100644 preprocessor/types/__init__.py delete mode 100644 preprocessor/utils/__init__.py delete mode 100644 preprocessor/utils/batch_processing_utils.py delete mode 100644 preprocessor/utils/batch_processor.py delete mode 100644 preprocessor/utils/constants.py delete mode 100644 preprocessor/utils/detection_io.py delete mode 100644 preprocessor/utils/file_utils.py delete mode 100644 preprocessor/utils/hash_save_utils.py delete mode 100644 preprocessor/utils/image_hash_utils.py delete mode 100644 preprocessor/utils/image_hasher.py delete mode 100644 preprocessor/utils/metadata_utils.py delete mode 100644 preprocessor/utils/resource_scope.py delete mode 100644 preprocessor/utils/time_utils.py delete mode 100644 preprocessor/utils/transcription_utils.py delete mode 100644 preprocessor/utils/video_utils.py delete mode 100644 preprocessor/validation/__init__.py delete mode 100644 preprocessor/validation/episode_stats.py delete mode 100644 preprocessor/validation/file_validators.py delete mode 100644 preprocessor/validation/global_validator.py delete mode 100644 preprocessor/validation/validator.py delete mode 100644 preprocessor/video/__init__.py delete mode 100644 preprocessor/video/emotion_utils.py delete mode 100644 preprocessor/video/frame_processor.py delete mode 100644 preprocessor/video/frame_utils.py delete mode 100644 preprocessor/video/subprocessors/__init__.py delete mode 100644 preprocessor/video/subprocessors/character_detection_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/emotion_detection_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/face_clustering_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/image_hash_subprocessor.py delete mode 100644 
preprocessor/video/subprocessors/object_detection_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py delete mode 100644 preprocessor/video/subprocessors/video_embedding_subprocessor.py diff --git a/bot/search/elastic_search_manager.py b/bot/search/elastic_search_manager.py index 10d3aac14..6b8d205e1 100644 --- a/bot/search/elastic_search_manager.py +++ b/bot/search/elastic_search_manager.py @@ -27,6 +27,128 @@ class ElasticSearchManager: + SEGMENTS_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "segment_id": {"type": "integer"}, + "text": {"type": "text"}, + "start_time": {"type": "float"}, + "end_time": {"type": "float"}, + "speaker": {"type": "keyword"}, + "video_path": {"type": "keyword"}, + "scene_info": {"type": "object"}, + }, + }, + } + + TEXT_EMBEDDINGS_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "embedding_id": {"type": "integer"}, + "segment_range": {"type": "integer"}, + "text": {"type": "text"}, + "text_embedding": { + "type": "dense_vector", + "dims": 4096, + "index": True, + "similarity": "cosine", + }, + }, + }, + } + + VIDEO_EMBEDDINGS_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "frame_number": {"type": "integer"}, + "timestamp": {"type": "float"}, + "frame_type": {"type": "keyword"}, + "video_path": {"type": "keyword"}, + "video_embedding": { + "type": "dense_vector", + "dims": 4096, + "index": True, + "similarity": "cosine", + }, + }, + }, + } + + EPISODE_NAMES_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "title": {"type": "text"}, + "title_embedding": { + "type": "dense_vector", + "dims": 4096, + "index": True, + "similarity": "cosine", + 
}, + }, + }, + } + + FULL_EPISODE_EMBEDDINGS_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "full_transcript": {"type": "text"}, + "full_episode_embedding": { + "type": "dense_vector", + "dims": 4096, + "index": True, + "similarity": "cosine", + }, + }, + }, + } + + SOUND_EVENTS_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "segment_id": {"type": "integer"}, + "text": {"type": "text"}, + "sound_type": {"type": "keyword"}, + "start_time": {"type": "float"}, + "end_time": {"type": "float"}, + "video_path": {"type": "keyword"}, + "scene_info": {"type": "object"}, + }, + }, + } + + SOUND_EVENT_EMBEDDINGS_INDEX_MAPPING = { + "mappings": { + "properties": { + "episode_id": {"type": "keyword"}, + "episode_metadata": {"type": "object"}, + "embedding_id": {"type": "integer"}, + "segment_range": {"type": "object"}, + "text": {"type": "text"}, + "sound_types": {"type": "keyword"}, + "start_time": {"type": "float"}, + "end_time": {"type": "float"}, + "sound_event_embedding": { + "type": "dense_vector", + "dims": 4096, + "index": True, + "similarity": "cosine", + }, + }, + }, + } + @staticmethod async def connect_to_elasticsearch(logger: logging.Logger) -> AsyncElasticsearch: es = AsyncElasticsearch( diff --git a/bot/services/reindex/reindex_service.py b/bot/services/reindex/reindex_service.py index 017c5ac8a..c06b688b9 100644 --- a/bot/services/reindex/reindex_service.py +++ b/bot/services/reindex/reindex_service.py @@ -18,11 +18,10 @@ async_bulk, ) +from bot.search.elastic_search_manager import ElasticSearchManager from bot.services.reindex.series_scanner import SeriesScanner from bot.services.reindex.video_path_transformer import VideoPathTransformer from bot.services.reindex.zip_extractor import ZipExtractor -from bot.settings import settings -from preprocessor.processors.elasticsearch_manager import 
ElasticSearchManager @dataclass @@ -188,9 +187,6 @@ async def reindex_series( async def __init_elasticsearch(self) -> None: if self.__es_manager is None: self.__es_manager = await ElasticSearchManager.connect_to_elasticsearch( - settings.ES_HOST, - settings.ES_USER, - settings.ES_PASS.get_secret_value(), self.__logger, ) diff --git a/bot/types.py b/bot/types.py index 56d463b6e..a1520ef5a 100644 --- a/bot/types.py +++ b/bot/types.py @@ -1,4 +1,4 @@ -from preprocessor.types import ( +from preprocessor.config.types import ( BaseSegment, CharacterDetectionInFrame, ClipSegment, diff --git a/preprocessor/__main__.py b/preprocessor/__main__.py index 5b961d0ed..99b90fa10 100644 --- a/preprocessor/__main__.py +++ b/preprocessor/__main__.py @@ -2,14 +2,13 @@ import sys from preprocessor.cli import cli -from preprocessor.utils.console import console +from preprocessor.lib.ui.console import console logging.getLogger('matplotlib').setLevel(logging.ERROR) logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) - -if __name__ == "__main__": +if __name__ == '__main__': try: cli() except KeyboardInterrupt: - console.print("\n[yellow]Operation cancelled by user[/yellow]") + console.print('\n[yellow]Operation cancelled by user[/yellow]') sys.exit(130) diff --git a/preprocessor/app/__init__.py b/preprocessor/app/__init__.py new file mode 100644 index 000000000..6d5ad1b34 --- /dev/null +++ b/preprocessor/app/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.app.pipeline_factory import ( + build_pipeline, + visualize, +) + +__all__ = ["build_pipeline", "visualize"] diff --git a/preprocessor/app/config_defaults.py b/preprocessor/app/config_defaults.py new file mode 100644 index 000000000..288e97028 --- /dev/null +++ b/preprocessor/app/config_defaults.py @@ -0,0 +1,68 @@ +from typing import Dict + +from preprocessor.config.step_configs import ( + ArchiveConfig, + CharacterDetectionConfig, + DocumentGenerationConfig, + ElasticsearchConfig, + EmotionDetectionConfig, + 
FaceClusteringConfig, + FrameExportConfig, + ImageHashConfig, + ObjectDetectionConfig, + SceneDetectionConfig, + SoundSeparationConfig, + TextAnalysisConfig, + TextEmbeddingConfig, + TranscodeConfig, + VideoEmbeddingConfig, + WhisperTranscriptionConfig, +) + + +def get_default_step_configs(series_name: str) -> Dict[str, object]: + return { + 'transcode': TranscodeConfig( + video_bitrate_mbps=2.5, + minrate_mbps=1.5, + maxrate_mbps=3.5, + bufsize_mbps=5.0, + gop_size=2.0, + ), + 'transcribe': WhisperTranscriptionConfig( + model='large-v3-turbo', + language='pl', + device='cuda', + beam_size=5, + temperature=0.0, + ), + 'separate_sounds': SoundSeparationConfig(), + 'analyze_text': TextAnalysisConfig(language='pl'), + 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=15), + 'export_frames': FrameExportConfig(frames_per_scene=3), + 'text_embeddings': TextEmbeddingConfig( + model_name='Qwen/Qwen2-VL-8B-Instruct', + batch_size=8, + device='cuda', + text_sentences_per_chunk=5, + text_chunk_overlap=1, + ), + 'image_hashing': ImageHashConfig(batch_size=32), + 'video_embeddings': VideoEmbeddingConfig( + model_name='Qwen/Qwen2-VL-8B-Instruct', + batch_size=8, + device='cuda', + ), + 'character_detection': CharacterDetectionConfig(threshold=0.7), + 'emotion_detection': EmotionDetectionConfig(), + 'face_clustering': FaceClusteringConfig(), + 'object_detection': ObjectDetectionConfig(), + 'generate_elastic_documents': DocumentGenerationConfig(generate_segments=True), + 'generate_archives': ArchiveConfig(), + 'index': ElasticsearchConfig( + index_name=f'{series_name}_clips', + host='localhost:9200', + dry_run=False, + append=False, + ), + } diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py new file mode 100644 index 000000000..88735ad24 --- /dev/null +++ b/preprocessor/app/pipeline.py @@ -0,0 +1,153 @@ +from typing import ( + Dict, + List, + Optional, + Set, +) + +import networkx as nx + +from preprocessor.app.step_builder import 
StepBuilder + + +class Pipeline: + def __init__(self, name: str) -> None: + self.name: str = name + self._steps: Dict[str, StepBuilder] = {} + self._graph: Optional[nx.DiGraph] = None + + def register(self, step: StepBuilder) -> None: + if step.id in self._steps: + raise ValueError( + f"❌ DUPLIKAT KROKU:\n" + f" Krok '{step.id}' jest już zarejestrowany w pipeline!\n" + f" Sprawdź build_pipeline() w pipeline_factory.py", + ) + self._steps[step.id] = step + + def validate(self) -> None: + self._graph = nx.DiGraph() + + for step_id, step in self._steps.items(): + self._graph.add_node(step_id, step=step) + + for step_id, step in self._steps.items(): + for dep_id in step.dependency_ids: + if dep_id not in self._steps: + self._raise_missing_dependency_error(step_id, dep_id) + self._graph.add_edge(dep_id, step_id) + + if not nx.is_directed_acyclic_graph(self._graph): + self._raise_cycle_error() + + print( + f"✅ Pipeline '{self.name}' validated successfully:\n" + f" - {len(self._steps)} steps registered\n" + f" - DAG structure confirmed\n" + f" - No cyclic dependencies", + ) + + def _raise_missing_dependency_error( + self, step_id: str, missing_dep_id: str, + ) -> None: + raise ValueError( + f"\n{'=' * 80}\n" + f"❌ BŁĄD ZALEŻNOŚCI W PIPELINE\n" + f"{'=' * 80}\n\n" + f"Krok: '{step_id}'\n" + f"Potrzebuje: '{missing_dep_id}'\n" + f"Problem: Krok '{missing_dep_id}' nie jest zarejestrowany!\n\n" + f"Rozwiązanie:\n" + f" 1. Sprawdź build_pipeline() w preprocessor/app/pipeline_factory.py\n" + f" 2. Upewnij się że '{missing_dep_id}' jest dodany przez pipeline.register()\n" + f" 3. Lub usuń '{missing_dep_id}' z needs=[...] 
w definicji '{step_id}'\n" + f"\n{'=' * 80}\n", + ) + + def _raise_cycle_error(self) -> None: + cycles: List[List[str]] = list(nx.simple_cycles(self._graph)) + cycle_path: str = " → ".join(cycles[0]) + f" → {cycles[0][0]}" + + raise ValueError( + f"\n{'=' * 80}\n" + f"❌ CYKL W ZALEŻNOŚCIACH PIPELINE\n" + f"{'=' * 80}\n\n" + f"Wykryto cykliczną zależność:\n" + f" {cycle_path}\n\n" + f"Kroki w cyklu: {', '.join(cycles[0])}\n\n" + f"Pipeline musi być DAG (Directed Acyclic Graph).\n" + f"Usuń jedną z zależności aby przerwać cykl.\n" + f"\n{'=' * 80}\n", + ) + + def get_execution_order( + self, targets: Optional[List[str]] = None, skip: Optional[List[str]] = None, + ) -> List[str]: + if not self._graph: + raise RuntimeError( + "Pipeline not validated! Call pipeline.validate() first.", + ) + + full_order: List[str] = list(nx.topological_sort(self._graph)) + + if targets: + required: Set[str] = set() + for target in targets: + if target not in self._steps: + raise ValueError( + f"Target step '{target}' does not exist in pipeline", + ) + required.add(target) + required.update(nx.ancestors(self._graph, target)) + full_order = [s for s in full_order if s in required] + + skip_set: Set[str] = set(skip or []) + return [s for s in full_order if s not in skip_set] + + def get_step(self, step_id: str) -> StepBuilder: + if step_id not in self._steps: + raise KeyError( + f"Step '{step_id}' not found. 
Available: {list(self._steps.keys())}", + ) + return self._steps[step_id] + + def to_ascii_art(self) -> str: + if not self._graph: + self.validate() + + lines: List[str] = [ + "=" * 80, + f"PIPELINE: {self.name}", + "=" * 80, + "", + ] + + phases: Dict[str, List[StepBuilder]] = {} + for _, step in self._steps.items(): + phase_name: str = step.phase.name + if phase_name not in phases: + phases[phase_name] = [] + phases[phase_name].append(step) + + for phase_name in ("SCRAPING", "PROCESSING", "INDEXING"): + if phase_name not in phases: + continue + + lines.append(f"[{phase_name}]") + lines.append("-" * 80) + + for step in phases[phase_name]: + deps_str: str = "" + if step.dependency_ids: + deps_str = f" ← needs: {', '.join(step.dependency_ids)}" + + lines.append(f" {step.id}{deps_str}") + lines.append(f" → produces: {', '.join(step.produces)}") + lines.append(f" → {step.description}") + lines.append("") + + lines.append("=" * 80) + return "\n".join(lines) + + def __repr__(self) -> str: + return f"Pipeline(name='{self.name}', steps={len(self._steps)})" diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py new file mode 100644 index 000000000..237ccd391 --- /dev/null +++ b/preprocessor/app/pipeline_builder.py @@ -0,0 +1,76 @@ +from pathlib import Path +from typing import ( + Any, + List, +) + +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.episodes.episode_manager import EpisodeManager + + +class Pipeline: + def __init__(self, context: ExecutionContext): + self.context = context + self.steps: List[PipelineStep] = [] + + def add_step(self, step: PipelineStep) -> "Pipeline": + self.steps.append(step) + return self + + def run_for_episodes( + self, source_path: Path, episode_manager: EpisodeManager, + ) -> None: + video_files = self.__discover_videos(source_path) + self.context.logger.info( + 
f"Discovered {len(video_files)} video files in {source_path}", + ) + + current_artifacts: List[Any] = [] + for video_file in video_files: + episode_info = episode_manager.parse_filename(video_file) + if not episode_info: + self.context.logger.warning(f"Cannot parse: {video_file}") + continue + + episode_id = episode_manager.get_episode_id_for_state(episode_info) + current_artifacts.append( + SourceVideo( + path=video_file, + episode_id=episode_id, + episode_info=episode_info, + ), + ) + + for step in self.steps: + self.context.logger.info(f"=== Running Step: {step.name} ===") + next_artifacts = [] + + for artifact in current_artifacts: + try: + result = step.execute(artifact, self.context) + if result: + next_artifacts.append(result) + except Exception as e: + self.context.logger.error( + f"Step {step.name} failed for {artifact.episode_id}: {e}", + ) + + current_artifacts = next_artifacts + + @staticmethod + def __discover_videos(source_path: Path) -> List[Path]: + extensions = ["*.mp4", "*.mkv", "*.avi"] + videos = [] + for ext in extensions: + videos.extend(source_path.glob(f"**/{ext}")) + return sorted(videos) + + def cleanup(self) -> None: + for step in self.steps: + if hasattr(step, "cleanup"): + try: + step.cleanup() + except Exception as e: + self.context.logger.error(f"Cleanup failed for step {step.name}: {e}") diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py new file mode 100644 index 000000000..87352468c --- /dev/null +++ b/preprocessor/app/pipeline_factory.py @@ -0,0 +1,317 @@ +from typing import Dict + +from preprocessor.app.pipeline import Pipeline +from preprocessor.app.step_builder import ( + Phase, + StepBuilder, +) +from preprocessor.config.step_configs import ( + ArchiveConfig, + CharacterDetectionConfig, + CharacterReferenceConfig, + CharacterScraperConfig, + DocumentGenerationConfig, + ElasticsearchConfig, + EmotionDetectionConfig, + EpisodeScraperConfig, + FaceClusteringConfig, + FrameExportConfig, + 
ImageHashConfig, + ObjectDetectionConfig, + SceneDetectionConfig, + SoundSeparationConfig, + TextAnalysisConfig, + TextEmbeddingConfig, + TranscodeConfig, + VideoEmbeddingConfig, + WhisperTranscriptionConfig, +) + +SCRAPING = Phase("SCRAPING", color="blue") +PROCESSING = Phase("PROCESSING", color="green") +INDEXING = Phase("INDEXING", color="yellow") + +episodes_metadata = StepBuilder( + id="scrape_episodes", + phase=SCRAPING, + module="preprocessor.modules.scraping.episode_scraper_step:EpisodeScraperStep", + description="Scrapes episode metadata from wiki", + produces=["episodes.json"], + needs=[], + config=EpisodeScraperConfig( + urls=["https://ranczo.fandom.com/pl/wiki/Lista_odcinków"], + output_file="preprocessor/scraped_data/episodes.json", + headless=True, + merge_sources=True, + scraper_method="crawl4ai", + parser_mode="normal", + ), +) + +characters_metadata = StepBuilder( + id="scrape_characters", + phase=SCRAPING, + module="preprocessor.modules.scraping.character_scraper_step:CharacterScraperStep", + description="Scrapes character data from wiki", + produces=["characters.json"], + needs=[], + config=CharacterScraperConfig( + urls=["https://ranczo.fandom.com/pl/wiki/Postacie"], + output_file="preprocessor/scraped_data/characters.json", + headless=True, + scraper_method="crawl4ai", + parser_mode="normal", + ), +) + +character_references = StepBuilder( + id="process_references", + phase=SCRAPING, + module="preprocessor.modules.scraping.reference_processor_step:CharacterReferenceStep", + description="Downloads and processes character reference images", + produces=["character_faces/{character}/*.jpg"], + needs=[characters_metadata], + config=CharacterReferenceConfig( + characters_file="preprocessor/scraped_data/characters.json", + output_dir="preprocessor/character_faces", + search_engine="duckduckgo", + images_per_character=5, + ), +) + +transcoded_videos = StepBuilder( + id="transcode", + phase=PROCESSING, + 
module="preprocessor.modules.video.transcoding:VideoTranscoderStep", + description="Konwersja do h264_nvenc 720p 30fps z adaptacyjnym bitrate", + produces=["transcoded_videos/{season}/{episode}.mp4"], + needs=[], + config=TranscodeConfig( + video_bitrate_mbps=2.5, + minrate_mbps=1.5, + maxrate_mbps=3.5, + bufsize_mbps=5.0, + gop_size=2.0, + ), +) + +scene_data = StepBuilder( + id="detect_scenes", + phase=PROCESSING, + module="preprocessor.modules.video.scene_detection:SceneDetectorStep", + description="Wykrywa zmiany scen używając TransNetV2", + produces=["scene_detections/{season}/{episode}.json"], + needs=[transcoded_videos], + config=SceneDetectionConfig(threshold=0.5, min_scene_len=15), +) + +exported_frames = StepBuilder( + id="export_frames", + phase=PROCESSING, + module="preprocessor.modules.video.frame_export:FrameExporterStep", + description="Eksportuje klatki (PNG) na granicach scen", + produces=["frames/{season}/{episode}/*.png"], + needs=[scene_data], + config=FrameExportConfig(frames_per_scene=3), +) + +transcription_data = StepBuilder( + id="transcribe", + phase=PROCESSING, + module="preprocessor.modules.text.transcription:TranscriptionStep", + description="Transkrypcja audio używając Whisper large-v3-turbo", + produces=["transcriptions/{season}/{episode}.json"], + needs=[transcoded_videos], + config=WhisperTranscriptionConfig( + model="large-v3-turbo", + language="pl", + device="cuda", + beam_size=5, + temperature=0.0, + ), +) + +separated_audio = StepBuilder( + id="separate_sounds", + phase=PROCESSING, + module="preprocessor.modules.audio.separation:AudioSeparationStep", + description="Rozdziela dialogi od efektów dźwiękowych", + produces=["separated_audio/{season}/{episode}/"], + needs=[transcription_data], + config=SoundSeparationConfig(), +) + +text_stats = StepBuilder( + id="analyze_text", + phase=PROCESSING, + module="preprocessor.modules.text.analysis:TextAnalysisStep", + description="Analiza statystyk tekstu (częstotliwość słów, sentiment)", 
+ produces=["text_analysis/{season}/{episode}.json"], + needs=[transcription_data], + config=TextAnalysisConfig(language="pl"), +) + +text_embeddings = StepBuilder( + id="text_embeddings", + phase=PROCESSING, + module="preprocessor.modules.text.embeddings:TextEmbeddingStep", + description="Generuje embeddingi tekstowe używając Qwen2-VL", + produces=["embeddings/text/{season}/{episode}.npy"], + needs=[text_stats], + config=TextEmbeddingConfig( + model_name="Qwen/Qwen2-VL-8B-Instruct", + batch_size=8, + device="cuda", + text_sentences_per_chunk=5, + text_chunk_overlap=1, + ), +) + +image_hashes = StepBuilder( + id="image_hashing", + phase=PROCESSING, + module="preprocessor.modules.vision.image_hashing:ImageHashStep", + description="Perceptual hashing klatek (phash, dhash, wavelet)", + produces=["hashes/{season}/{episode}.json"], + needs=[exported_frames], + config=ImageHashConfig(batch_size=32), +) + +video_embeddings = StepBuilder( + id="video_embeddings", + phase=PROCESSING, + module="preprocessor.modules.vision.embeddings:VideoEmbeddingStep", + description="Embeddingi wizualne używając Qwen2-VL", + produces=["embeddings/vision/{season}/{episode}.npy"], + needs=[exported_frames, image_hashes], + config=VideoEmbeddingConfig( + model_name="Qwen/Qwen2-VL-8B-Instruct", + batch_size=8, + device="cuda", + ), +) + +character_detections = StepBuilder( + id="detect_characters", + phase=PROCESSING, + module="preprocessor.modules.vision.character_detection:CharacterDetectorStep", + description="Rozpoznaje postacie na klatkach używając InsightFace", + produces=["detections/characters/{season}/{episode}.json"], + needs=[exported_frames], + config=CharacterDetectionConfig(threshold=0.7), +) + +emotion_data = StepBuilder( + id="detect_emotions", + phase=PROCESSING, + module="preprocessor.modules.vision.emotion_detection:EmotionDetectionStep", + description="Detekcja emocji na twarzach używając EmoNet", + produces=["detections/emotions/{season}/{episode}.json"], + 
needs=[exported_frames], + config=EmotionDetectionConfig(), +) + +face_clusters = StepBuilder( + id="cluster_faces", + phase=PROCESSING, + module="preprocessor.modules.vision.face_clustering:FaceClusteringStep", + description="Klasteryzacja twarzy używając HDBSCAN", + produces=["clusters/faces/{season}/{episode}.json"], + needs=[exported_frames], + config=FaceClusteringConfig(), +) + +object_detections = StepBuilder( + id="detect_objects", + phase=PROCESSING, + module="preprocessor.modules.vision.object_detection:ObjectDetectionStep", + description="Detekcja obiektów ogólnych używając D-FINE", + produces=["detections/objects/{season}/{episode}.json"], + needs=[exported_frames], + config=ObjectDetectionConfig(), +) + +elastic_documents = StepBuilder( + id="generate_elastic_docs", + phase=INDEXING, + module="preprocessor.modules.search.document_generation:DocumentGeneratorStep", + description="Łączy wszystkie dane w dokumenty Elasticsearch", + produces=["elastic_documents/{season}/{episode}.ndjson"], + needs=[ + text_embeddings, + video_embeddings, + character_detections, + emotion_data, + face_clusters, + object_detections, + ], + config=DocumentGenerationConfig(generate_segments=True), +) + +episode_archives = StepBuilder( + id="generate_archives", + phase=INDEXING, + module="preprocessor.modules.packaging.archives:ArchiveGenerationStep", + description="Tworzy archiwa ZIP per odcinek (wszystkie artefakty)", + produces=["archives/{season}/{episode}.zip"], + needs=[elastic_documents], + config=ArchiveConfig(), +) + +indexed_data = StepBuilder( + id="index_to_elasticsearch", + phase=INDEXING, + module="preprocessor.modules.search.indexing:ElasticsearchIndexerStep", + description="Wrzuca dokumenty do Elasticsearch", + produces=[""], + needs=[elastic_documents], + config=ElasticsearchConfig( + index_name="ranczo_clips", + host="localhost:9200", + dry_run=False, + append=False, + ), +) + + +def build_pipeline() -> Pipeline: + pipeline = Pipeline(name="ranczo_processing") 
+ + pipeline.register(episodes_metadata) + pipeline.register(characters_metadata) + pipeline.register(character_references) + + pipeline.register(transcoded_videos) + pipeline.register(scene_data) + pipeline.register(exported_frames) + + pipeline.register(transcription_data) + pipeline.register(separated_audio) + pipeline.register(text_stats) + + pipeline.register(text_embeddings) + pipeline.register(image_hashes) + pipeline.register(video_embeddings) + + pipeline.register(character_detections) + pipeline.register(emotion_data) + pipeline.register(face_clusters) + pipeline.register(object_detections) + + pipeline.register(elastic_documents) + pipeline.register(episode_archives) + pipeline.register(indexed_data) + + pipeline.validate() + + return pipeline + + +def visualize() -> None: + pipeline = build_pipeline() + print(pipeline.to_ascii_art()) + + +def get_step_configs() -> Dict[str, object]: + pipeline = build_pipeline() + return {step_id: step.config for step_id, step in pipeline._steps.items()} diff --git a/preprocessor/app/step_builder.py b/preprocessor/app/step_builder.py new file mode 100644 index 000000000..e1456435a --- /dev/null +++ b/preprocessor/app/step_builder.py @@ -0,0 +1,73 @@ +from dataclasses import ( + dataclass, + field, +) +import importlib +from typing import ( + TYPE_CHECKING, + Any, + List, +) + +if TYPE_CHECKING: + from preprocessor.core.base_step import PipelineStep + + +@dataclass +class Phase: + name: str + color: str + + +@dataclass +class StepBuilder: + id: str + phase: Phase + module: str + description: str + produces: List[str] + needs: List["StepBuilder"] = field(default_factory=list) + config: Any = None + + def __post_init__(self) -> None: + if not self.id.replace("_", "").replace("-", "").isalnum(): + raise ValueError( + f"Invalid step_id: '{self.id}'. Use only alphanumeric and underscores.", + ) + if not self.module or ":" not in self.module: + raise ValueError( + f"Invalid module format for '{self.id}'. 
Expected 'package.module:ClassName'", + ) + + @property + def dependency_ids(self) -> List[str]: + return [step.id for step in self.needs] + + def load_class(self) -> type: + module_path, class_name = self.module.split(":") + + try: + mod = importlib.import_module(module_path) + except ImportError as e: + raise ImportError( + f"Cannot load module '{module_path}' for step '{self.id}': {e}", + ) from e + + try: + return getattr(mod, class_name) + except AttributeError as e: + raise AttributeError( + f"Class '{class_name}' not found in module '{module_path}' for step '{self.id}': {e}", + ) from e + + def __repr__(self) -> str: + deps = f", needs={self.dependency_ids}" if self.needs else "" + return f"StepBuilder(id='{self.id}'{deps})" + + def __hash__(self) -> int: + return hash(self.id) + + def __eq__(self, other: object) -> bool: + if not isinstance(other, StepBuilder): + return False + return self.id == other.id diff --git a/preprocessor/characters/face_detection.py b/preprocessor/characters/face_detection.py deleted file mode 100644 index fff55a70a..000000000 --- a/preprocessor/characters/face_detection.py +++ /dev/null @@ -1,183 +0,0 @@ -import os -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) -import warnings - -import cv2 -from insightface.app import FaceAnalysis -import numpy as np -from numpy.linalg import norm -import onnxruntime as ort - -from preprocessor.config.config import settings -from preprocessor.utils.console import console - -# Suppress insightface warnings -warnings.filterwarnings( - "ignore", - message=".*estimate.*is deprecated.*", - category=FutureWarning, - module="insightface", -) - - -def init_face_detection() -> FaceAnalysis: - model_root = os.getenv("INSIGHTFACE_HOME", os.path.expanduser("~/.insightface")) - - available_providers = ort.get_available_providers() - console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") - - if 'CUDAExecutionProvider' not in 
available_providers: - console.print("[red]✗ CUDAExecutionProvider not available in onnxruntime[/red]") - console.print("[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]") - raise RuntimeError("CUDA provider not available in onnxruntime") - - providers = [ - ( - 'CUDAExecutionProvider', { - 'device_id': 0, - 'arena_extend_strategy': 'kNextPowerOfTwo', - 'gpu_mem_limit': 8 * 1024 * 1024 * 1024, - 'cudnn_conv_algo_search': 'EXHAUSTIVE', - 'do_copy_in_default_stream': True, - }, - ), - ] - - with warnings.catch_warnings(): - warnings.filterwarnings("ignore", category=UserWarning, module="onnxruntime") - warnings.filterwarnings("ignore", category=FutureWarning, module="insightface") - - console.print(f"[cyan]Loading {settings.face_recognition.model_name} face detection model (GPU-only)...[/cyan]") - - try: - face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) - face_app.prepare( - ctx_id=0, - det_size=settings.face_recognition.detection_size, - det_thresh=settings.character.face_detection_threshold, - ) - except Exception as e: - console.print("[red]✗ Failed to initialize face detection on GPU[/red]") - console.print(f"[red] Error: {e}[/red]") - console.print("[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]") - raise RuntimeError("GPU required but face detection initialization failed") from e - - actual_providers = face_app.models['detection'].session.get_providers() - - if 'CUDAExecutionProvider' not in actual_providers: - console.print("[red]✗ CUDA provider not active after initialization[/red]") - console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") - raise RuntimeError("CUDA required but not available for face detection") - - console.print(f"[green]✓ Face detection initialized ({settings.face_recognition.model_name})[/green]") - console.print("[dim] Device: GPU (CUDA)[/dim]") - console.print(f"[dim] Detection size: 
{settings.face_recognition.detection_size}[/dim]") - console.print(f"[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]") - console.print(f"[dim] Model cache: {model_root}[/dim]") - - return face_app - - -def load_character_references( - characters_dir: Path, - face_app: FaceAnalysis, -) -> Dict[str, np.ndarray]: - console.print("[blue]Loading character references...[/blue]") - character_vectors = {} - - for char_dir in characters_dir.iterdir(): - if not char_dir.is_dir(): - continue - - char_name = char_dir.name.replace("_", " ").title() - vector_file = char_dir / "face_vector.npy" - - if vector_file.exists(): - character_vectors[char_name] = np.load(vector_file) - console.print(f"[dim] ✓ {char_name}: loaded from face_vector.npy[/dim]") - continue - - images = list(char_dir.glob("*.jpg")) - - if not images: - continue - - embeddings = [] - for img_path in images: - emb = _get_face_embedding(str(img_path), face_app) - if emb is not None: - embeddings.append(emb) - - if embeddings: - mean_emb = np.mean(embeddings, axis=0) - centroid = mean_emb / norm(mean_emb) - character_vectors[char_name] = centroid - console.print(f"[green] ✓ {char_name}: {len(embeddings)} reference images[/green]") - - console.print(f"[green]✓ Loaded {len(character_vectors)} characters[/green]") - return character_vectors - - -def _get_face_embedding(img_path: str, face_app: FaceAnalysis) -> Optional[np.ndarray]: - img = cv2.imread(img_path) - if img is None: - return None - - faces = face_app.get(img) - if not faces: - return None - - faces.sort(key=lambda x: (x.bbox[2]-x.bbox[0]) * (x.bbox[3]-x.bbox[1]), reverse=True) - return faces[0].normed_embedding - - -def detect_characters_in_frame( - frame_path: Path, - face_app: FaceAnalysis, - character_vectors: Dict[str, np.ndarray], - threshold: float, -) -> List[Dict[str, Any]]: - img = cv2.imread(str(frame_path)) - if img is None: - return [] - - faces = face_app.get(img) - if not faces: - return [] - - detected = 
[] - - for face in faces: - face_embedding = face.normed_embedding - bbox = face.bbox.astype(int) - - best_match = None - best_similarity = threshold - - for char_name, char_vector in character_vectors.items(): - similarity = np.dot(face_embedding, char_vector) - - if similarity > best_similarity: - best_similarity = similarity - best_match = char_name - - if best_match is not None: - detected.append({ - "name": best_match, - "confidence": float(best_similarity), - "bbox": { - "x1": int(bbox[0]), - "y1": int(bbox[1]), - "x2": int(bbox[2]), - "y2": int(bbox[3]), - }, - }) - - detected.sort(key=lambda x: x["confidence"], reverse=True) - return detected diff --git a/preprocessor/characters/google_image_search.py b/preprocessor/characters/google_image_search.py deleted file mode 100644 index e2b74e6c9..000000000 --- a/preprocessor/characters/google_image_search.py +++ /dev/null @@ -1,41 +0,0 @@ -from typing import ( - Dict, - List, -) - -from serpapi import GoogleSearch - -from preprocessor.characters.image_search import BaseImageSearch - - -class GoogleImageSearch(BaseImageSearch): - def __init__(self, api_key: str, max_results: int = 50): - super().__init__(max_results) - if not api_key: - raise ValueError("SerpAPI key is required for Google Image Search") - self.api_key = api_key - - @property - def name(self) -> str: - return "Google Images API" - - def search(self, query: str) -> List[Dict[str, str]]: - params = { - "engine": "google_images", - "q": query, - "hl": "pl", - "gl": "pl", - "api_key": self.api_key, - } - - search = GoogleSearch(params) - results = search.get_dict() - - images = [] - for img_result in results.get("images_results", [])[:self.max_results]: - images.append({ - "image": img_result.get("original"), - "thumbnail": img_result.get("thumbnail"), - }) - - return images diff --git a/preprocessor/characters/reference_downloader.py b/preprocessor/characters/reference_downloader.py deleted file mode 100644 index af392e165..000000000 --- 
a/preprocessor/characters/reference_downloader.py +++ /dev/null @@ -1,301 +0,0 @@ -from __future__ import annotations - -import json -import logging -from pathlib import Path -import random -import time -from typing import ( - Any, - Dict, - List, - Optional, -) - -import cv2 -from insightface.app import FaceAnalysis -import numpy as np -from patchright.sync_api import ( - BrowserContext, - Page, - sync_playwright, -) - -from preprocessor.characters.duckduckgo_image_search import DuckDuckGoImageSearch -from preprocessor.characters.face_detection import init_face_detection -from preprocessor.characters.google_image_search import GoogleImageSearch -from preprocessor.characters.image_search import BaseImageSearch -from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.utils.console import ( - console, - create_progress, -) - - -class CharacterReferenceDownloader(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=8, - loglevel=logging.DEBUG, - ) - - self.characters_json: Path = self._args["characters_json"] - self.series_name: str = self._args["series_name"] - self.output_dir: Path = self._args.get("output_dir", settings.character.get_output_dir(self.series_name)) - self.images_per_character: int = self._args.get( - "images_per_character", - settings.character.reference_images_per_character, - ) - self.max_results: int = settings.image_scraper.max_results_to_scrape - self.min_width: int = settings.image_scraper.min_image_width - self.min_height: int = settings.image_scraper.min_image_height - self.use_gpu: bool = True - self.search_mode: str = self._args.get("search_mode", "normal") - - self.search_engine: BaseImageSearch = self.__create_search_engine() - self.face_app: FaceAnalysis = None - self.browser_context: Optional[BrowserContext] = None - - def __create_search_engine(self) -> BaseImageSearch: - 
if self.search_mode == "premium": - serpapi_key = settings.image_scraper.serpapi_key - return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) - return DuckDuckGoImageSearch(max_results=self.max_results) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "characters_json" not in args: - raise ValueError("characters_json is required") - - def get_output_subdir(self) -> str: - return "character_references" - - def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool: - for char in characters: - char_name = char["name"] - output_folder = self.output_dir / char_name.replace(" ", "_").lower() - existing_images = list(output_folder.glob("*.jpg")) - if len(existing_images) < self.images_per_character: - return False - return True - - def _execute(self) -> None: - if not self.characters_json.exists(): - console.print(f"[red]Characters JSON not found: {self.characters_json}[/red]") - return - - with open(self.characters_json, encoding="utf-8") as f: - data = json.load(f) - - characters = data.get("characters", []) - if not characters: - console.print("[yellow]No characters found in JSON[/yellow]") - return - - if self.__all_references_exist(characters): - console.print(f"[green]✓ All reference images already exist for {len(characters)} characters (skipping)[/green]") - return - - self.face_app = init_face_detection() - - console.print(f"[blue]Downloading reference images for {len(characters)} characters...[/blue]") - - with sync_playwright() as p: - self.browser_context = p.chromium.launch_persistent_context( - user_data_dir="/tmp/patchright_profile", - headless=True, - args=[ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ], - ignore_default_args=['--enable-automation'], - ) - - with create_progress() as progress: - task = progress.add_task("Downloading references", total=len(characters)) - - for i, char in enumerate(characters): - char_name = char["name"] - downloaded = False - try: - downloaded = 
self.__download_character_references(char_name, progress) - except Exception as e: - self.logger.error(f"Failed to download references for {char_name}: {e}") - finally: - progress.advance(task) - - if downloaded and i < len(characters) - 1: - delay = random.uniform( - settings.image_scraper.request_delay_min, - settings.image_scraper.request_delay_max, - ) - time.sleep(delay) - - self.browser_context.close() - - console.print("[green]✓ Reference download completed[/green]") - - def __count_faces(self, img) -> int: - faces = self.face_app.get(img) - return len(faces) - - def _validate_and_decode_image(self, img_bytes: bytes, img_url: str) -> np.ndarray | None: - if not img_bytes: - return None - - img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - - if img is None or img.size == 0: - self.logger.debug(f"Failed to decode image from {img_url}") - return None - - if len(img.shape) != 3 or img.shape[2] != 3: - self.logger.debug(f"Image has unexpected shape {img.shape} from {img_url}") - return None - - return img - - def __download_image_with_browser(self, img_url: str, page: Page) -> np.ndarray | None: - try: # pylint: disable=too-many-try-statements - response = page.goto( - img_url, - timeout=settings.image_scraper.page_navigation_timeout, - wait_until="domcontentloaded", - ) - if not response or response.status != 200: - return None - - content_type = response.headers.get("content-type", "") - if "image" not in content_type: - return None - - img_bytes = response.body() - if not img_bytes: - return None - - img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - - if img is None or img.size == 0: - self.logger.debug(f"Failed to decode image from {img_url}") - return None - - if len(img.shape) != 3 or img.shape[2] != 3: - self.logger.debug(f"Image has unexpected shape {img.shape} from {img_url}") - return None - - return img - - except TimeoutError: 
- self.logger.debug(f"Timeout downloading image {img_url}") - return None - except Exception as e: - if "net::ERR_CONNECTION_CLOSED" in str(e) or "Navigation" in str(e): - self.logger.debug(f"Connection/navigation error for {img_url}: {e}") - else: - self.logger.debug(f"Failed to download image {img_url}: {e}") - return None - - def __download_character_references(self, char_name: str, progress) -> bool: # pylint: disable=too-many-locals,too-many-statements - search_query = f"Serial {self.series_name} {char_name} postać" - output_folder = self.output_dir / char_name.replace(" ", "_").lower() - output_folder.mkdir(parents=True, exist_ok=True) - - existing_images = list(output_folder.glob("*.jpg")) - if len(existing_images) >= self.images_per_character: - progress.console.print( - f"[green]✓ {char_name}: {len(existing_images)} images already exist (skipping)[/green]", - ) - return False - - progress.console.print(f"[cyan]Searching [{self.search_engine.name}]: {search_query}[/cyan]") - - saved_count = len(existing_images) - processed = 0 - - for attempt in range(settings.image_scraper.retry_attempts): # pylint: disable=too-many-nested-blocks - try: - results = self.search_engine.search(search_query) - - sorted_results = sorted( - results, - key=lambda x: ( - 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, - 1 if x.get('image', '').lower().endswith('.png') else 2, - ), - ) - - page = self.browser_context.new_page() - - try: - for res in sorted_results: - if saved_count >= self.images_per_character: - break - - img_url = res['image'] - processed += 1 - - try: - img = self.__download_image_with_browser(img_url, page) - - if img is None: - continue - - if not isinstance(img, np.ndarray) or img.size == 0: - self.logger.debug(f"Invalid image array from {img_url}") - continue - - h, w = img.shape[:2] - if w < self.min_width or h < self.min_height: - continue - - try: - face_count = self.__count_faces(img) - except Exception as face_err: - 
self.logger.debug(f"Face detection failed for {img_url}: {face_err}") - continue - - if face_count == 1: - filename = f"{saved_count:02d}.jpg" - path = output_folder / filename - cv2.imwrite(str(path), img) - saved_count += 1 - - except Exception as e: - self.logger.debug(f"Error processing image: {e}") - continue - - finally: - page.close() - - break - - except KeyboardInterrupt: - progress.console.print("\n[yellow]Download interrupted[/yellow]") - raise - except Exception as e: - if attempt < settings.image_scraper.retry_attempts - 1: - delay = settings.image_scraper.retry_delay * (2 ** attempt) - self.logger.warning( - f"Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {e}", - ) - time.sleep(delay) - else: - self.logger.error(f"All retry attempts failed for {char_name}: {e}") - - if saved_count >= self.images_per_character: - progress.console.print( - f"[green]✓[/green] {char_name}: {saved_count}/{self.images_per_character} images", - ) - elif saved_count > 0: - progress.console.print( - f"[yellow]⚠[/yellow] {char_name}: {saved_count}/{self.images_per_character} images (incomplete)", - ) - else: - progress.console.print(f"[red]✗[/red] {char_name}: No suitable images found") - - return True diff --git a/preprocessor/cli/__init__.py b/preprocessor/cli/__init__.py index ee99d0a5d..f17535929 100644 --- a/preprocessor/cli/__init__.py +++ b/preprocessor/cli/__init__.py @@ -1,72 +1,3 @@ -import click +from preprocessor.cli.cli_main import cli -from preprocessor.cli.commands import ( - analyze_text, - detect_scenes, - export_frames, - fix_unicode, - generate_archives, - generate_elastic_documents, - generate_embeddings, - image_hashing, - import_transcriptions, - index, - process_character_references, - run_all, - scrape_episodes, - search, - separate_sounds, - transcode, - transcribe, - transcribe_elevenlabs, - validate, -) - - -@click.group() -@click.help_option("-h", "--help") -def cli(): - """Preprocessor CLI for video processing pipeline.""" - - 
-# noinspection PyTypeChecker -cli.add_command(transcode) -# noinspection PyTypeChecker -cli.add_command(transcribe) -# noinspection PyTypeChecker -cli.add_command(index) -# noinspection PyTypeChecker -cli.add_command(import_transcriptions) -# noinspection PyTypeChecker -cli.add_command(transcribe_elevenlabs) -# noinspection PyTypeChecker -cli.add_command(scrape_episodes) -# noinspection PyTypeChecker -cli.add_command(detect_scenes) -# noinspection PyTypeChecker -cli.add_command(export_frames) -# noinspection PyTypeChecker -cli.add_command(image_hashing) -# noinspection PyTypeChecker -cli.add_command(generate_embeddings) -# noinspection PyTypeChecker -cli.add_command(generate_elastic_documents) -# noinspection PyTypeChecker -cli.add_command(generate_archives) -# noinspection PyTypeChecker -cli.add_command(search) -# noinspection PyTypeChecker -cli.add_command(run_all) -# noinspection PyTypeChecker -cli.add_command(validate) -# noinspection PyTypeChecker -cli.add_command(analyze_text) -# noinspection PyTypeChecker -cli.add_command(fix_unicode) -# noinspection PyTypeChecker -cli.add_command(separate_sounds) -# noinspection PyTypeChecker -cli.add_command(process_character_references) - - -__all__ = ["cli"] +__all__ = ['cli'] diff --git a/preprocessor/cli/__main__.py b/preprocessor/cli/__main__.py index 3386182ec..8a28a7810 100644 --- a/preprocessor/cli/__main__.py +++ b/preprocessor/cli/__main__.py @@ -1,4 +1,4 @@ from preprocessor.cli import cli -if __name__ == "__main__": +if __name__ == '__main__': cli() diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py new file mode 100644 index 000000000..431498812 --- /dev/null +++ b/preprocessor/cli/cli_main.py @@ -0,0 +1,108 @@ +from pathlib import Path +from typing import Callable + +import click + +from preprocessor.app.pipeline_builder import Pipeline as PipelineRunner +from preprocessor.app.pipeline_factory import ( + build_pipeline, + visualize, +) +from preprocessor.cli.helpers import 
setup_pipeline_context + + +@click.group() +@click.help_option("-h", "--help") +def cli() -> None: + pass + + +@cli.command(name="visualize") +def visualize_command() -> None: + visualize() + + +@cli.command(name="run-all") +@click.option("--series", required=True, help="Series name (e.g., ranczo)") +@click.option("--force-rerun", is_flag=True, help="Force rerun even if cached") +@click.option( + "--skip", + multiple=True, + help="Step IDs to skip (e.g., --skip transcode --skip detect_scenes)", +) +def run_all(series: str, force_rerun: bool, skip: tuple) -> None: + pipeline = build_pipeline() + setup = setup_pipeline_context(series, "run_all", force_rerun, with_episode_manager=True) + + plan = pipeline.get_execution_order(skip=list(skip)) + + setup.logger.info(f"📋 Execution plan: {' → '.join(plan)}") + setup.logger.info(f"📂 Source: preprocessor/input_data/{series}") + + source_path = Path("preprocessor/input_data") / series + + for step_id in plan: + step = pipeline.get_step(step_id) + setup.logger.info(f"{'=' * 80}") + setup.logger.info(f"🔧 Step: {step_id}") + setup.logger.info(f"📝 {step.description}") + + StepClass = step.load_class() + instance = StepClass(step.config) + + runner = PipelineRunner(setup.context) + runner.add_step(instance) + runner.run_for_episodes(source_path, setup.episode_manager) + + setup.logger.info(f"✅ Step '{step_id}' completed") + + setup.logger.info("=" * 80) + setup.logger.info("🎉 Pipeline completed successfully!") + + +def _create_step_command(step_id: str, step_description: str) -> Callable: + @click.command(name=step_id.replace("_", "-"), help=f"{step_description}") + @click.option("--series", required=True, help="Series name (e.g., ranczo)") + @click.option("--force-rerun", is_flag=True, help="Force rerun even if cached") + def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> None: + pipeline = build_pipeline() + setup = setup_pipeline_context(series, _step_id, force_rerun, with_episode_manager=True) + + 
step = pipeline.get_step(_step_id) + + deps = step.dependency_ids + if deps: + setup.logger.info(f"📦 Dependencies: {', '.join(deps)}") + for dep_id in deps: + if not setup.context.state_manager.is_step_completed(dep_id, "*"): + setup.logger.warning( + f"⚠️ Dependency '{dep_id}' may not be completed. " + f"Run it first or use --force-rerun.", + ) + + setup.logger.info(f"🔧 Running: {_step_id}") + setup.logger.info(f"📝 {step.description}") + + StepClass = step.load_class() + instance = StepClass(step.config) + + source_path = Path("preprocessor/input_data") / series + + runner = PipelineRunner(setup.context) + runner.add_step(instance) + runner.run_for_episodes(source_path, setup.episode_manager) + + setup.logger.info(f"✅ Step '{_step_id}' completed successfully") + + return step_command + + +_cli_pipeline = build_pipeline() + +for _step_id, _step in _cli_pipeline._steps.items(): + command_func = _create_step_command(_step_id, _step.description) + cli.add_command(command_func) + + +if __name__ == "__main__": + cli() diff --git a/preprocessor/cli/commands/__init__.py b/preprocessor/cli/commands/__init__.py deleted file mode 100644 index 842404862..000000000 --- a/preprocessor/cli/commands/__init__.py +++ /dev/null @@ -1,41 +0,0 @@ -from preprocessor.cli.commands.analyze_text import analyze_text -from preprocessor.cli.commands.detect_scenes import detect_scenes -from preprocessor.cli.commands.export_frames import export_frames -from preprocessor.cli.commands.fix_unicode import fix_unicode -from preprocessor.cli.commands.generate_archives import generate_archives -from preprocessor.cli.commands.generate_elastic_documents import generate_elastic_documents -from preprocessor.cli.commands.generate_embeddings import generate_embeddings -from preprocessor.cli.commands.image_hashing import image_hashing -from preprocessor.cli.commands.import_transcriptions import import_transcriptions -from preprocessor.cli.commands.index import index -from 
preprocessor.cli.commands.process_character_references import process_character_references -from preprocessor.cli.commands.run_all import run_all -from preprocessor.cli.commands.scrape_episodes import scrape_episodes -from preprocessor.cli.commands.search import search -from preprocessor.cli.commands.separate_sounds import separate_sounds -from preprocessor.cli.commands.transcode import transcode -from preprocessor.cli.commands.transcribe import transcribe -from preprocessor.cli.commands.transcribe_elevenlabs import transcribe_elevenlabs -from preprocessor.cli.commands.validate import validate - -__all__ = [ - "analyze_text", - "detect_scenes", - "export_frames", - "fix_unicode", - "generate_archives", - "generate_elastic_documents", - "generate_embeddings", - "image_hashing", - "import_transcriptions", - "index", - "process_character_references", - "run_all", - "scrape_episodes", - "search", - "separate_sounds", - "transcode", - "transcribe", - "transcribe_elevenlabs", - "validate", -] diff --git a/preprocessor/cli/commands/analyze_text.py b/preprocessor/cli/commands/analyze_text.py deleted file mode 100644 index cda8240be..000000000 --- a/preprocessor/cli/commands/analyze_text.py +++ /dev/null @@ -1,59 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.processors.text_analyzer import TextAnalyzer - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--season", - type=str, - help="Season to analyze (e.g., S10). If not provided, analyzes all seasons", -) -@click.option( - "--episode", - type=str, - help="Episode to analyze (e.g., E01). Requires --season. 
If not provided, analyzes all episodes in season", -) -@click.option( - "--language", - type=str, - default="pl", - help="Language code for analysis (pl or en)", -) -@click.option( - "--series-name", - type=str, - default="ranczo", - help="Series name for file naming", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata (optional)", -) -def analyze_text( - season: str, - episode: str, - language: str, - series_name: str, - episodes_info_json: Path, -): - """Analyze transcription texts and generate linguistic statistics.""" - if episode and not season: - click.echo("Error: --episode requires --season to be specified") - sys.exit(1) - - analyzer = TextAnalyzer( - { - "series_name": series_name, - "episodes_info_json": episodes_info_json, - "language": language, - "state_manager": None, - }, - ) - - exit_code = analyzer.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/detect_scenes.py b/preprocessor/cli/commands/detect_scenes.py deleted file mode 100644 index 632efb044..000000000 --- a/preprocessor/cli/commands/detect_scenes.py +++ /dev/null @@ -1,49 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.config.config import settings -from preprocessor.processors.scene_detector import SceneDetector -from preprocessor.utils.resource_scope import ResourceScope - - -@click.command(name="detect-scenes", context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, path_type=Path)) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for scene JSON files", -) -@click.option( - "--threshold", - type=float, - default=settings.scene_detection.threshold, - help="Scene detection threshold 0.0-1.0", -) -@click.option( - "--min-scene-len", - type=int, - default=settings.scene_detection.min_scene_len, - help="Minimum scene length in frames", -) 
-@click.option("--name", required=True, help="Series name") -def detect_scenes(videos: Path, output_dir: Path, threshold: float, min_scene_len: int, name: str): - """Detect scene changes in videos using TransNetV2.""" - if output_dir is None: - output_dir = settings.scene_detection.get_output_dir(name) - - with ResourceScope(): - detector = SceneDetector( - { - "videos": videos, - "output_dir": output_dir, - "threshold": threshold, - "min_scene_len": min_scene_len, - }, - ) - exit_code = detector.work() - detector.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/export_frames.py b/preprocessor/cli/commands/export_frames.py deleted file mode 100644 index ae15e3b91..000000000 --- a/preprocessor/cli/commands/export_frames.py +++ /dev/null @@ -1,72 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.helpers import create_state_manager -from preprocessor.config.config import settings -from preprocessor.processors.frame_exporter import FrameExporter -from preprocessor.utils.resolution import Resolution - - -@click.command(context_settings={"show_default": True}) -@click.argument("transcoded_videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--scene-timestamps-dir", - type=click.Path(exists=True, path_type=Path), - default=None, - help="Directory with scene timestamps", -) -@click.option( - "--output-frames", - type=click.Path(path_type=Path), - default=None, - help="Output directory for exported frames", -) -@click.option( - "--resolution", - type=click.Choice(Resolution.get_all_choices()), - default="1080p", - help="Target resolution for exported frames", -) -@click.option("--name", required=True, help="Series name") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def 
export_frames( - transcoded_videos: Path, - episodes_info_json: Path, - scene_timestamps_dir: Path, - output_frames: Path, - resolution: str, - name: str, - no_state: bool, -): - """Export keyframes at target resolution based on configured keyframe strategy.""" - if scene_timestamps_dir is None: - scene_timestamps_dir = settings.scene_detection.get_output_dir(name) - if output_frames is None: - output_frames = settings.frame_export.get_output_dir(name) - - state_manager = create_state_manager(name, no_state) - - res = Resolution.from_str(resolution) - - exporter = FrameExporter( - { - "transcoded_videos": transcoded_videos, - "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": output_frames, - "resolution": res, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - - exit_code = exporter.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/fix_unicode.py b/preprocessor/cli/commands/fix_unicode.py deleted file mode 100644 index b2d116c17..000000000 --- a/preprocessor/cli/commands/fix_unicode.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.config.config import settings -from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer -from preprocessor.utils.resource_scope import ResourceScope - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--transcription-jsons", - type=click.Path(exists=True, path_type=Path), - default=None, - help="Directory with transcription JSON files", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--name", - required=True, - help="Series name", -) -def fix_unicode( - transcription_jsons: Path, - episodes_info_json: Path, - name: str, -): - """Fix unicode escape sequences in transcription files.""" - if 
transcription_jsons is None: - transcription_jsons = settings.transcription.get_output_dir(name) - - args = { - "transcription_jsons": transcription_jsons, - "episodes_info_json": episodes_info_json, - "name": name, - } - - with ResourceScope(): - fixer = TranscriptionUnicodeFixer(args) - exit_code = fixer.work() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/generate_archives.py b/preprocessor/cli/commands/generate_archives.py deleted file mode 100644 index c5b5ac008..000000000 --- a/preprocessor/cli/commands/generate_archives.py +++ /dev/null @@ -1,86 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.options.common import ( - episodes_info_option, - name_option, -) -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.processors.archive_generator import ArchiveGenerator - - -@click.command(name="generate-archives", context_settings={"show_default": True}) -@click.option( - "--elastic-documents-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - default=None, - help="Directory with Elasticsearch documents (defaults to {series_name}/elastic_documents)", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for ZIP archives (defaults to {series_name}/archives)", -) -@click.option( - "--season", - type=int, - help="Process only specific season", -) -@click.option( - "--episode", - type=int, - help="Process only specific episode (requires --season)", -) -@click.option( - "--force-regenerate", - is_flag=True, - help="Force regenerate existing archives", -) -@click.option( - "--allow-partial", - is_flag=True, - help="Create archives even if not all 5 files are present (default: skip incomplete episodes)", -) -@name_option() -@episodes_info_option(required=False) -def generate_archives( - elastic_documents_dir: Path, - output_dir: Path, - season: int, - episode: int, - force_regenerate: bool, - 
allow_partial: bool, - name: str, - episodes_info_json: Path, -) -> None: - base_output = get_base_output_dir(name) - - if elastic_documents_dir is None: - elastic_documents_dir = base_output / settings.output_subdirs.elastic_documents - if output_dir is None: - output_dir = base_output / settings.output_subdirs.archives - - args = { - "elastic_documents_dir": elastic_documents_dir, - "output_dir": output_dir, - "series_name": name, - "episodes_info_json": episodes_info_json, - "force_regenerate": force_regenerate, - "allow_partial": allow_partial, - } - - if season: - args["season_filter"] = season - if episode: - args["episode_filter"] = episode - - generator = ArchiveGenerator(args) - exit_code = generator.work() - if exit_code != 0: - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/generate_elastic_documents.py b/preprocessor/cli/commands/generate_elastic_documents.py deleted file mode 100644 index 4d8deb648..000000000 --- a/preprocessor/cli/commands/generate_elastic_documents.py +++ /dev/null @@ -1,78 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.options.common import ( - episodes_info_option, - name_option, -) -from preprocessor.config.config import ( - get_output_path, - settings, -) -from preprocessor.processors.elastic_document_generator import ElasticDocumentGenerator - - -@click.command(name="generate-elastic-documents", context_settings={"show_default": True}) -@click.option( - "--transcription-jsons", - type=click.Path(exists=True, file_okay=False, path_type=Path), - required=True, - help="Directory with transcription JSON files", -) -@click.option( - "--embeddings-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with embedding files", -) -@click.option( - "--scene-timestamps-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with scene timestamp files", -) -@click.option( - "--character-detections-dir", - 
type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with character detection files", -) -@click.option( - "--object-detections-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory with object detection files", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory", -) -@name_option() -@episodes_info_option(required=False) -def generate_elastic_documents( - transcription_jsons: Path, - embeddings_dir: Path, - scene_timestamps_dir: Path, - character_detections_dir: Path, - object_detections_dir: Path, - output_dir: Path, - name: str, - episodes_info_json: Path, -) -> None: - if output_dir is None: - output_dir = get_output_path(settings.output_subdirs.elastic_documents, name) - args = { - "transcription_jsons": transcription_jsons, - "embeddings_dir": embeddings_dir, - "scene_timestamps_dir": scene_timestamps_dir, - "character_detections_dir": character_detections_dir, - "object_detections_dir": object_detections_dir, - "output_dir": output_dir, - "series_name": name, - "episodes_info_json": episodes_info_json, - } - - generator = ElasticDocumentGenerator(args) - exit_code = generator.work() - if exit_code != 0: - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/generate_embeddings.py b/preprocessor/cli/commands/generate_embeddings.py deleted file mode 100644 index 1bbb44ee4..000000000 --- a/preprocessor/cli/commands/generate_embeddings.py +++ /dev/null @@ -1,146 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.config.config import settings -from preprocessor.processors.embedding_generator import EmbeddingGenerator -from preprocessor.utils.resource_scope import ResourceScope - - -@click.command(name="generate-embeddings", context_settings={"show_default": True}) -@click.option( - "--transcription-jsons", - type=click.Path(exists=True, file_okay=False, path_type=Path), - required=True, - help="Directory 
with transcription JSON files", -) -@click.option( - "--frames-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - default=None, - help="Directory with exported frames", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory", -) -@click.option( - "--image-hashes-dir", - type=click.Path(path_type=Path), - default=None, - help="Directory with image hashes", -) -@click.option( - "--model", - default=settings.embedding_model.model_name, - help="Model name", -) -@click.option( - "--segments-per-embedding", - type=int, - default=settings.text_chunking.segments_per_embedding, - help="Segments to group for text embeddings", -) -@click.option( - "--generate-text/--no-text", - default=True, - help="Generate text embeddings", -) -@click.option( - "--generate-video/--no-video", - default=True, - help="Generate video embeddings", -) -@click.option( - "--generate-episode-names/--no-episode-names", - default=True, - help="Generate episode name embeddings", -) -@click.option( - "--generate-full-episode/--no-full-episode", - default=True, - help="Generate full episode embeddings", -) -@click.option( - "--generate-sound-events/--no-sound-events", - default=True, - help="Generate sound event embeddings", -) -@click.option( - "--device", - type=click.Choice(["cuda"]), - default="cuda", - help="Device: cuda (GPU only)", -) -@click.option( - "--batch-size", - type=int, - default=settings.embedding.batch_size, - help="Batch size for GPU inference. 
Reduce if OOM errors occur", -) -@click.option( - "--sentences-per-chunk", - type=int, - default=settings.text_chunking.text_sentences_per_chunk, - help="Number of sentences per chunk (only for --sentence-chunking)", -) -@click.option( - "--chunk-overlap", - type=int, - default=settings.text_chunking.text_chunk_overlap, - help="Number of overlapping sentences between chunks (only for --sentence-chunking)", -) -@click.option("--name", required=True, help="Series name") -def generate_embeddings( # pylint: disable=too-many-arguments - transcription_jsons: Path, - frames_dir: Path, - output_dir: Path, - image_hashes_dir: Path, - model: str, - segments_per_embedding: int, - generate_text: bool, - generate_video: bool, - generate_episode_names: bool, - generate_full_episode: bool, - generate_sound_events: bool, - device: str, - batch_size: int, - sentences_per_chunk: int, - chunk_overlap: int, - name: str, -): - """Generate text and video embeddings from transcriptions and exported frames.""" - if frames_dir is None: - frames_dir = settings.frame_export.get_output_dir(name) - if output_dir is None: - output_dir = settings.embedding.get_output_dir(name) - if image_hashes_dir is None: - image_hashes_dir = settings.image_hash.get_output_dir(name) - - with ResourceScope(): - generator = EmbeddingGenerator( - { - "transcription_jsons": transcription_jsons, - "frames_dir": frames_dir, - "output_dir": output_dir, - "image_hashes_dir": image_hashes_dir, - "model": model, - "segments_per_embedding": segments_per_embedding, - "generate_text": generate_text, - "generate_video": generate_video, - "generate_episode_names": generate_episode_names, - "generate_full_episode": generate_full_episode, - "generate_sound_events": generate_sound_events, - "device": device, - "batch_size": batch_size, - "text_sentences_per_chunk": sentences_per_chunk, - "text_chunk_overlap": chunk_overlap, - }, - ) - exit_code = generator.work() - generator.cleanup() - - sys.exit(exit_code) diff --git 
a/preprocessor/cli/commands/image_hashing.py b/preprocessor/cli/commands/image_hashing.py deleted file mode 100644 index f73c3a919..000000000 --- a/preprocessor/cli/commands/image_hashing.py +++ /dev/null @@ -1,68 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.helpers import create_state_manager -from preprocessor.config.config import settings -from preprocessor.processors.image_hash_processor import ImageHashProcessor - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--frames-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - default=None, - help="Directory with exported frames", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for image hashes", -) -@click.option( - "--batch-size", - type=int, - default=settings.embedding.batch_size, - help="Batch size for processing", -) -@click.option("--name", required=True, help="Series name") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def image_hashing( - frames_dir: Path, - episodes_info_json: Path, - output_dir: Path, - batch_size: int, - name: str, - no_state: bool, -): - """Generate perceptual hashes for exported frames.""" - if frames_dir is None: - frames_dir = settings.frame_export.get_output_dir(name) - if output_dir is None: - output_dir = settings.image_hash.get_output_dir(name) - - state_manager = create_state_manager(name, no_state) - - hasher = ImageHashProcessor( - { - "frames_dir": frames_dir, - "output_dir": output_dir, - "batch_size": batch_size, - "device": "cuda", - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - - exit_code = hasher.work() - hasher.cleanup() - sys.exit(exit_code) diff 
--git a/preprocessor/cli/commands/import_transcriptions.py b/preprocessor/cli/commands/import_transcriptions.py deleted file mode 100644 index 062714b87..000000000 --- a/preprocessor/cli/commands/import_transcriptions.py +++ /dev/null @@ -1,69 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.helpers import create_state_manager -from preprocessor.config.config import settings -from preprocessor.processors.transcription_importer import TranscriptionImporter -from preprocessor.utils.console import console - - -@click.command(name="import-transcriptions", context_settings={"show_default": True}) -@click.option( - "--source-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - required=True, - help="Directory with source transcriptions (11labs format)", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for converted transcriptions", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata", -) -@click.option("--name", required=True, help="Series name") -@click.option( - "--format-type", - type=click.Choice(["11labs_segmented", "11labs"]), - default="11labs_segmented", - help="Source format: 11labs_segmented or 11labs", -) -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def import_transcriptions( - source_dir: Path, - output_dir: Path, - episodes_info_json: Path, - name: str, - format_type: str, - no_state: bool, -): - """Import and convert transcriptions from external sources.""" - if output_dir is None: - output_dir = settings.transcription.get_output_dir(name) - - state_manager = create_state_manager(name, no_state) - - importer = TranscriptionImporter( - { - "source_dir": source_dir, - "output_dir": output_dir, - "episodes_info_json": episodes_info_json, - "series_name": name, - "format_type": format_type, - 
"state_manager": state_manager, - }, - ) - - exit_code = importer.work() - - if state_manager and exit_code == 0: - console.print("[green]Import completed successfully![/green]") - state_manager.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/index.py b/preprocessor/cli/commands/index.py deleted file mode 100644 index c4aeb258b..000000000 --- a/preprocessor/cli/commands/index.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.config.config import ( - get_output_path, - settings, -) -from preprocessor.processors.elasticsearch_indexer import ElasticSearchIndexer - - -@click.command() -@click.option("--name", required=True, help="Elasticsearch index name (also used as series name for path resolution)") -@click.option( - "--elastic-documents-dir", - type=click.Path(exists=True, path_type=Path), - default=None, - help="Directory with generated elastic documents", -) -@click.option("--dry-run", is_flag=True, help="Validate without sending to Elasticsearch") -@click.option("--append", is_flag=True, help="Append to existing indices instead of recreating") -def index(name: str, elastic_documents_dir: Path, dry_run: bool, append: bool): - """Index documents into Elasticsearch (creates 3 indices: segments, text_embeddings, video_frames).""" - if elastic_documents_dir is None: - elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents, name) - - indexer = ElasticSearchIndexer({ - "name": name, - "elastic_documents_dir": elastic_documents_dir, - "dry_run": dry_run, - "append": append, - }) - exit_code = indexer.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/process_character_references.py b/preprocessor/cli/commands/process_character_references.py deleted file mode 100644 index 588fc4b37..000000000 --- a/preprocessor/cli/commands/process_character_references.py +++ /dev/null @@ -1,65 +0,0 @@ -from pathlib import Path -import sys - -import click - 
-from preprocessor.characters.reference_processor import CharacterReferenceProcessor -from preprocessor.cli.helpers import create_state_manager -from preprocessor.config.config import settings - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--characters-dir", - type=click.Path(exists=True, path_type=Path), - default=None, - help="Directory with character reference images", -) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for processed references", -) -@click.option( - "--similarity-threshold", - type=float, - default=settings.character.reference_matching_threshold, - help="Threshold for face similarity when matching between reference images", -) -@click.option( - "--interactive/--no-interactive", - default=True, - help="Enable interactive mode for ambiguous cases", -) -@click.option("--name", required=True, help="Series name") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def process_character_references( - characters_dir: Path, - output_dir: Path, - similarity_threshold: float, - interactive: bool, - name: str, - no_state: bool, -): - """Process character reference images to identify and extract common faces.""" - if characters_dir is None: - characters_dir = settings.character.get_output_dir(name) - if output_dir is None: - output_dir = settings.character.get_processed_references_dir(name) - - state_manager = create_state_manager(name, no_state) - - processor = CharacterReferenceProcessor( - { - "characters_dir": characters_dir, - "output_dir": output_dir, - "similarity_threshold": similarity_threshold, - "interactive": interactive, - "series_name": name, - "state_manager": state_manager, - }, - ) - - exit_code = processor.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/run_all.py b/preprocessor/cli/commands/run_all.py deleted file mode 100644 index 8459b198d..000000000 --- 
a/preprocessor/cli/commands/run_all.py +++ /dev/null @@ -1,304 +0,0 @@ -from pathlib import Path -import sys -from typing import Tuple - -import click - -from preprocessor.cli.helpers import create_state_manager -from preprocessor.cli.pipeline.orchestrator import PipelineOrchestrator -from preprocessor.cli.pipeline.steps import ( - run_archive_generation_step, - run_character_reference_download_step, - run_character_reference_processing_step, - run_character_scrape_step, - run_elastic_documents_step, - run_embedding_step, - run_frame_export_step, - run_frame_processing_step, - run_index_step, - run_scene_step, - run_scrape_step, - run_sound_separation_step, - run_text_analysis_step, - run_transcode_step, - run_transcribe_step, - run_validation_step, -) -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.utils.console import console -from preprocessor.utils.resolution import Resolution - - -@click.command(context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--episodes-info-json", - type=click.Path(path_type=Path), - help="JSON file with episode metadata (required if not using --scrape-urls)", -) -@click.option( - "--transcoded-videos", - type=click.Path(path_type=Path), - help="Output directory for transcoded videos", -) -@click.option( - "--transcription-jsons", - type=click.Path(path_type=Path), - default=None, - help="Output directory for transcription JSONs (defaults to {series_name}/transcriptions)", -) -@click.option( - "--scene-timestamps-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for scene timestamps (defaults to {series_name}/scene_timestamps)", -) -@click.option("--series-name", required=True, help="Series name") -@click.option( - "--resolution", - type=click.Choice(Resolution.get_all_choices()), - default="720p", - help="Target resolution for transcoding", -) -@click.option( 
- "--codec", - help="Video codec", -) -@click.option( - "--model", - default=settings.transcription.model, - help="Whisper model", -) -@click.option( - "--language", - default=settings.transcription.language, - help="Language for transcription", -) -@click.option("--dry-run", is_flag=True, help="Dry run for Elasticsearch indexing") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -@click.option( - "--ramdisk-path", - type=click.Path(path_type=Path), - help="Path to ramdisk for temporary files (e.g., /mnt/ramdisk)", -) -@click.option( - "--scrape-urls", - multiple=True, - help="URLs to scrape episode metadata from (Step 0a: optional)", -) -@click.option( - "--character-urls", - multiple=True, - help="URLs to scrape character metadata from (Step 0b: optional)", -) -@click.option( - "--search-mode", - type=click.Choice(["normal", "premium"]), - default="normal", - help="Image search mode: normal (DuckDuckGo) or premium (Google Images API)", -) -@click.option( - "--transcription-mode", - type=click.Choice(["normal", "premium"]), - default="normal", - help="Transcription mode: normal (Whisper) or premium (ElevenLabs API)", -) -@click.option( - "--parser-mode", - type=click.Choice(["normal", "premium"]), - default="normal", - help="Parser mode: normal (Qwen local model) or premium (Gemini 2.5 Flash)", -) -@click.option( - "--skip-character-reference-processing", - is_flag=True, - help="Skip Step 0d: Character reference processing (use existing processed references)", -) -@click.option( - "--interactive-character-processing", - is_flag=True, - help="Enable interactive mode for character reference processing (allows manual face selection)", -) -@click.option("--skip-transcode", is_flag=True, help="Skip Step 1: Transcoding (use existing transcoded videos)") -@click.option("--skip-transcribe", is_flag=True, help="Skip Step 2: Transcription (use existing transcriptions)") -@click.option("--skip-text-analysis", is_flag=True, 
help="Skip Step 3: Text analysis (use existing text statistics)") -@click.option("--skip-scenes", is_flag=True, help="Skip Step 4: Scene detection (use existing scene timestamps)") -@click.option("--skip-frame-export", is_flag=True, help="Skip Step 5: Frame export (use existing frames)") -@click.option("--skip-embeddings", is_flag=True, help="Skip Step 6: Text embedding generation (use existing text embeddings)") -@click.option("--skip-full-episode", is_flag=True, help="Skip full episode embedding generation (only text, video, sound events)") -@click.option("--skip-image-hashing", is_flag=True, help="Skip Step 7a: Image hashing sub-step (use existing hashes)") -@click.option("--skip-video-embeddings", is_flag=True, help="Skip Step 7b: Video embeddings sub-step (use existing)") -@click.option("--skip-character-detection", is_flag=True, help="Skip Step 7c: Character detection sub-step (use existing)") -@click.option("--skip-emotion-detection", is_flag=True, help="Skip Step 7d: Emotion detection sub-step (use existing)") -@click.option("--skip-face-clustering", is_flag=True, help="Skip Step 7e: Face clustering sub-step (use existing)") -@click.option("--skip-object-detection", is_flag=True, help="Skip Step 7f: Object detection sub-step (use existing)") -@click.option("--debug-visualizations", is_flag=True, help="Enable debug visualizations for character and object detections (disabled by default)") -@click.option("--skip-elastic-documents", is_flag=True, help="Skip Step 8: Generate Elasticsearch documents (use existing documents)") -@click.option("--skip-archives", is_flag=True, help="Skip Step 9: Archive generation (use existing archives)") -@click.option("--skip-index", is_flag=True, help="Skip Step 10: Elasticsearch indexing") -@click.option("--skip-validation", is_flag=True, help="Skip Step 11: Output validation") -def run_all( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements - videos: Path, - episodes_info_json: Path, - transcoded_videos: 
Path, - transcription_jsons: Path, - scene_timestamps_dir: Path, - series_name: str, - resolution: str, - codec: str, - model: str, - language: str, - dry_run: bool, - no_state: bool, - ramdisk_path: Path, - scrape_urls: Tuple[str, ...], - character_urls: Tuple[str, ...], - search_mode: str, - transcription_mode: str, - parser_mode: str, - skip_character_reference_processing: bool, - interactive_character_processing: bool, - skip_transcode: bool, - skip_transcribe: bool, - skip_text_analysis: bool, - skip_scenes: bool, - skip_frame_export: bool, - skip_embeddings: bool, - skip_full_episode: bool, - skip_image_hashing: bool, - skip_video_embeddings: bool, - skip_character_detection: bool, - skip_emotion_detection: bool, - skip_face_clustering: bool, - skip_object_detection: bool, - debug_visualizations: bool, - skip_elastic_documents: bool, - skip_archives: bool, - skip_index: bool, - skip_validation: bool, -): - """Run complete video processing pipeline.""" - if transcoded_videos is None: # pylint: disable=duplicate-code - transcoded_videos = settings.transcode.get_output_dir(series_name) - if codec is None: - codec = settings.transcode.codec - if transcription_jsons is None: - transcription_jsons = settings.transcription.get_output_dir(series_name) - if scene_timestamps_dir is None: - scene_timestamps_dir = settings.scene_detection.get_output_dir(series_name) - - if not episodes_info_json: - default_episodes_json = get_base_output_dir(series_name) / f"{series_name}_episodes.json" - if default_episodes_json.exists(): - episodes_info_json = default_episodes_json - console.print(f"[cyan]Using existing episodes JSON: {episodes_info_json}[/cyan]") - elif scrape_urls: - episodes_info_json = default_episodes_json - console.print(f"[cyan]Will scrape episodes to: {episodes_info_json}[/cyan]") - else: - console.print("[red]Error: Either --episodes-info-json, --scrape-urls must be provided, or existing episodes JSON must exist[/red]") - console.print(f"[yellow]Expected 
location: {default_episodes_json}[/yellow]") - sys.exit(1) - - characters_json = None - default_characters_json = get_base_output_dir(series_name) / f"{series_name}_characters.json" - - if default_characters_json.exists(): - characters_json = default_characters_json - console.print(f"[cyan]Using existing characters JSON: {characters_json}[/cyan]") - elif character_urls: - characters_json = default_characters_json - console.print(f"[cyan]Will scrape characters to: {characters_json}[/cyan]") - else: - characters_json = settings.character.get_characters_list_file(series_name) - if characters_json and Path(characters_json).exists(): - console.print(f"[cyan]Using default characters JSON: {characters_json}[/cyan]") - else: - console.print("[yellow]No characters JSON found. Character processing may be skipped.[/yellow]") - - state_manager = create_state_manager(series_name, no_state) - - if ramdisk_path: - console.print(f"[cyan]Using ramdisk: {ramdisk_path}[/cyan]") - - params = { - "videos": videos, - "episodes_info_json": episodes_info_json, - "transcoded_videos": transcoded_videos, - "transcription_jsons": transcription_jsons, - "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": settings.frame_export.get_output_dir(series_name), - "name": series_name, - "resolution": resolution, - "codec": codec, - "model": model, - "language": language, - "device": "cuda", - "dry_run": dry_run, - "ramdisk_path": ramdisk_path, - "scrape_urls": scrape_urls, - "character_urls": character_urls, - "characters_json": characters_json, - "search_mode": search_mode, - "transcription_mode": transcription_mode, - "parser_mode": parser_mode, - "state_manager": state_manager, - "interactive_character_processing": interactive_character_processing, - "debug_visualizations": debug_visualizations, - "skip_image_hashing": skip_image_hashing, - "skip_video_embeddings": skip_video_embeddings, - "skip_character_detection": skip_character_detection, - "skip_character_visualization": not 
debug_visualizations, - "skip_emotion_detection": skip_emotion_detection, - "skip_face_clustering": skip_face_clustering, - "skip_object_detection": skip_object_detection, - "skip_object_visualization": not debug_visualizations, - "skip_full_episode": skip_full_episode, - } - - metadata_output_dir = get_base_output_dir(series_name) / "processing_metadata" - - orchestrator = PipelineOrchestrator( - state_manager=state_manager, - series_name=series_name, - metadata_output_dir=metadata_output_dir, - ) - skip_character_visualization = not debug_visualizations - skip_object_visualization = not debug_visualizations - skip_frame_processing = ( - skip_image_hashing and skip_video_embeddings and skip_character_detection - and skip_character_visualization and skip_emotion_detection and skip_face_clustering - and skip_object_detection and skip_object_visualization - ) - - orchestrator.add_step("Scraping episode metadata", "0a/14", run_scrape_step, skip=False) - orchestrator.add_step("Scraping character metadata", "0b/14", run_character_scrape_step, skip=False) - orchestrator.add_step("Downloading character references", "0c/14", run_character_reference_download_step, skip=False) - orchestrator.add_step("Processing character references", "0d/14", run_character_reference_processing_step, skip=skip_character_reference_processing) - orchestrator.add_step("Transcoding videos", "1/14", run_transcode_step, skip=skip_transcode) - orchestrator.add_step("Generating transcriptions", "2/14", run_transcribe_step, skip=skip_transcribe) - orchestrator.add_step("Separating sounds and dialogues", "3/14", run_sound_separation_step, skip=skip_transcribe) - orchestrator.add_step("Analyzing transcription texts", "4/14", run_text_analysis_step, skip=skip_text_analysis) - orchestrator.add_step("Detecting scenes", "5/14", run_scene_step, skip=skip_scenes) - orchestrator.add_step("Exporting frames", "6/14", run_frame_export_step, skip=skip_frame_export) - orchestrator.add_step("Generating text 
embeddings", "7/14", run_embedding_step, skip=skip_embeddings) - orchestrator.add_step( - "Processing frames (hashing + embeddings + characters + emotions + clustering + objects)", - "8/14", - run_frame_processing_step, - skip=skip_frame_processing, - ) - orchestrator.add_step("Generating Elasticsearch documents", "9/14", run_elastic_documents_step, skip=skip_elastic_documents) - orchestrator.add_step("Archiving Elasticsearch documents", "10/14", run_archive_generation_step, skip=skip_archives) - orchestrator.add_step("Indexing in Elasticsearch", "11/14", run_index_step, skip=skip_index) - orchestrator.add_step("Validating output data", "12/14", run_validation_step, skip=skip_validation) - - exit_code = orchestrator.execute(**params) - - if exit_code == 0: - console.print("\n[green]All steps completed successfully![/green]") - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/scrape_episodes.py b/preprocessor/cli/commands/scrape_episodes.py deleted file mode 100644 index 9813d9115..000000000 --- a/preprocessor/cli/commands/scrape_episodes.py +++ /dev/null @@ -1,65 +0,0 @@ -from pathlib import Path -import sys -from typing import Tuple - -import click - -from preprocessor.scraping.episode_scraper import EpisodeScraper - - -@click.command(name="scrape-episodes", context_settings={"show_default": True}) -@click.option( - "--urls", - multiple=True, - required=True, - help="URL to scrape (specify multiple times for multiple sources)", -) -@click.option( - "--output-file", - type=click.Path(path_type=Path), - required=True, - help="Output JSON file path", -) -@click.option( - "--headless/--no-headless", - default=True, - help="Run browser in headless mode", -) -@click.option( - "--merge-sources/--no-merge", - default=True, - help="Merge data from multiple sources", -) -@click.option( - "--videos-dir", - type=click.Path(exists=True, file_okay=False, path_type=Path), - help="Directory containing video files for coverage validation", -) -@click.option( - 
"--parser-mode", - type=click.Choice(["normal", "premium"], case_sensitive=False), - default="normal", - help="Parser mode: normal (Qwen local model) or premium (Gemini 2.5 Flash)", -) -def scrape_episodes( - urls: Tuple[str, ...], - output_file: Path, - headless: bool, - merge_sources: bool, - videos_dir: Path, - parser_mode: str, -): - """Scrape episode metadata from websites.""" - scraper = EpisodeScraper( - { - "urls": list(urls), - "output_file": output_file, - "headless": headless, - "merge_sources": merge_sources, - "videos_dir": videos_dir, - "parser_mode": parser_mode, - }, - ) - - exit_code = scraper.work() - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/search.py b/preprocessor/cli/commands/search.py deleted file mode 100644 index 383ae5e3c..000000000 --- a/preprocessor/cli/commands/search.py +++ /dev/null @@ -1,183 +0,0 @@ -# pylint: disable=too-many-arguments,too-many-locals,too-many-branches,too-many-statements -import asyncio -import json -from pathlib import Path -import sys - -import click -from elasticsearch import AsyncElasticsearch - -from preprocessor.search import ( - ElasticsearchQueries, - EmbeddingService, - HashService, - ResultFormatter, -) - - -@click.command(context_settings={"show_default": True}) -@click.option("--text", type=str, help="Full-text search po transkrypcjach") -@click.option("--text-semantic", type=str, help="Semantic search po text embeddings") -@click.option("--text-to-video", type=str, help="Cross-modal search: text query w video embeddings") -@click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search po video embeddings") -@click.option("--hash", "phash", type=str, help="Szukaj po perceptual hash (podaj hash string lub sciezke do obrazka)") -@click.option("--character", type=str, help="Szukaj po postaci") -@click.option("--emotion", type=str, help="Szukaj po emocji (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)") -@click.option("--object", 
"object_query", type=str, help="Szukaj po wykrytych obiektach (np. 'dog', 'person:5+', 'chair:2-4')") -@click.option("--episode-name", type=str, help="Fuzzy search po nazwach odcinkow") -@click.option("--episode-name-semantic", type=str, help="Semantic search po nazwach odcinkow") -@click.option("--list-characters", "list_chars_flag", is_flag=True, help="Lista wszystkich postaci") -@click.option("--list-objects", "list_objects_flag", is_flag=True, help="Lista wszystkich klas obiektow") -@click.option("--season", type=int, help="Filtruj po sezonie") -@click.option("--episode", type=int, help="Filtruj po odcinku") -@click.option("--limit", type=int, default=20, help="Limit wynikow") -@click.option("--stats", is_flag=True, help="Pokaz statystyki indeksow") -@click.option("--json-output", is_flag=True, help="Output w formacie JSON") -@click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") -def search( - text, text_semantic, text_to_video, image, phash, character, emotion, object_query, episode_name, - episode_name_semantic, list_chars_flag, list_objects_flag, season, episode, limit, - stats, json_output, host, -): - """Search tool - comprehensive Elasticsearch search""" - - if not any([ - text, text_semantic, text_to_video, image, phash, character, emotion, - object_query, episode_name, episode_name_semantic, list_chars_flag, list_objects_flag, stats, - ]): - click.echo("Podaj przynajmniej jedna opcje wyszukiwania. 
Uzyj --help", err=True) - sys.exit(1) - - embedding_service = EmbeddingService() - hash_service = HashService() - queries = ElasticsearchQueries(embedding_service) - - hash_value = None - if phash: - phash_path = Path(phash) - if phash_path.exists() and phash_path.is_file(): - click.echo(f"Computing perceptual hash from image: {phash}", err=True) - hash_value = hash_service.get_perceptual_hash(str(phash_path)) - if hash_value: - click.echo(f"Computed hash: {hash_value}", err=True) - else: - click.echo("Failed to compute hash from image", err=True) - sys.exit(1) - else: - hash_value = phash - - async def __run(): - es_client = AsyncElasticsearch(hosts=[host], verify_certs=False) - - try: - await es_client.ping() - except Exception: - click.echo(f"✗ Cannot connect to Elasticsearch at {host}", err=True) - click.echo("Make sure Elasticsearch is running:", err=True) - click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) - sys.exit(1) - - try: - if stats: - result = await queries.get_stats(es_client) - if json_output: - click.echo(json.dumps(result, indent=2)) - else: - click.echo("\nStatystyki:") - click.echo(f" Segments: {result['segments']:,}") - click.echo(f" Text Embeddings: {result['text_embeddings']:,}") - click.echo(f" Video Embeddings: {result['video_embeddings']:,}") - click.echo(f" Episode Names: {result['episode_names']:,}") - - elif list_chars_flag: - chars = await queries.list_characters(es_client) - if json_output: - click.echo(json.dumps(chars, indent=2)) - else: - click.echo(f"\nZnaleziono {len(chars)} postaci:") - for char, count in sorted(chars, key=lambda x: -x[1]): - click.echo(f" {char}: {count:,} wystapien") - - elif list_objects_flag: - objects = await queries.list_objects(es_client) - if json_output: - click.echo(json.dumps(objects, indent=2)) - else: - click.echo(f"\nZnaleziono {len(objects)} klas obiektow:") - for obj, count in sorted(objects, key=lambda x: -x[1]): - click.echo(f" {obj}: {count:,} wystapien") - - elif text: 
- result = await queries.search_text_query(es_client, text, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "text") - - elif text_semantic: - result = await queries.search_text_semantic(es_client, text_semantic, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "text_semantic") - - elif text_to_video: - result = await queries.search_text_to_video(es_client, text_to_video, season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - - elif image: - result = await queries.search_video_semantic(es_client, str(image), season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - - elif emotion: - result = await queries.search_by_emotion(es_client, emotion, season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - - elif character: - result = await queries.search_by_character(es_client, character, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - - elif object_query: - result = await queries.search_by_object(es_client, object_query, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - - elif hash_value: - result = await queries.search_perceptual_hash(es_client, hash_value, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - - elif episode_name: - result = await queries.search_episode_name(es_client, 
episode_name, season, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "episode_name") - - elif episode_name_semantic: - result = await queries.search_episode_name_semantic(es_client, episode_name_semantic, season, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "episode_name") - - finally: - await es_client.close() - - asyncio.run(__run()) diff --git a/preprocessor/cli/commands/separate_sounds.py b/preprocessor/cli/commands/separate_sounds.py deleted file mode 100644 index 0e91bebb4..000000000 --- a/preprocessor/cli/commands/separate_sounds.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.config.config import settings -from preprocessor.transcription.processors.sound_separator import SoundEventSeparator -from preprocessor.utils.resource_scope import ResourceScope - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--transcription-dir", - type=click.Path(exists=True, path_type=Path), - default=None, - help="Directory with transcription JSON files", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--series-name", - required=True, - help="Series name", -) -def separate_sounds( - transcription_dir: Path, - episodes_info_json: Path, - series_name: str, -): - """Separate sound events from dialogues in transcription files.""" - if transcription_dir is None: - transcription_dir = settings.transcription.get_output_dir(series_name) - - args = { - "transcription_dir": transcription_dir, - "episodes_info_json": episodes_info_json, - "series_name": series_name, - } - - with ResourceScope(): - separator = SoundEventSeparator(args) - exit_code = separator.work() - - sys.exit(exit_code) diff --git 
a/preprocessor/cli/commands/transcode.py b/preprocessor/cli/commands/transcode.py deleted file mode 100644 index ceec83c8d..000000000 --- a/preprocessor/cli/commands/transcode.py +++ /dev/null @@ -1,99 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.helpers import create_state_manager -from preprocessor.config.config import ( - TranscodeConfig, - settings, -) -from preprocessor.processors.video_transcoder import VideoTranscoder -from preprocessor.utils.resolution import Resolution -from preprocessor.utils.resource_scope import ResourceScope - - -@click.command(context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--transcoded-videos", - type=click.Path(path_type=Path), - default=None, - help="Output directory for transcoded videos", -) -@click.option( - "--resolution", - type=click.Choice(Resolution.get_all_choices()), - default="720p", - help="Target resolution for videos", -) -@click.option( - "--codec", - help="Video codec: h264_nvenc (GPU), libx264 (CPU)", -) -@click.option( - "--gop-size", - type=float, - help="Keyframe interval in seconds", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata", -) -@click.option("--name", help="Series name for state management and resume support") -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def transcode( - videos: Path, - transcoded_videos: Path, - resolution: str, - codec: str, - gop_size: float, - episodes_info_json: Path, - name: str, - no_state: bool, -): - """Transcode videos to target resolution with FFmpeg.""" - if transcoded_videos is None: # pylint: disable=duplicate-code - if name: - transcoded_videos = settings.transcode.get_output_dir(name) - else: - from preprocessor.config.config import BASE_OUTPUT_DIR # pylint: disable=import-outside-toplevel 
- transcoded_videos = BASE_OUTPUT_DIR / "transcoded_videos" - if codec is None: - codec = settings.transcode.codec - if gop_size is None: - gop_size = settings.transcode.gop_size - - state_manager = create_state_manager(name, no_state) - - video_bitrate_mbps = settings.transcode.calculate_video_bitrate_mbps() - minrate_mbps = settings.transcode.calculate_minrate_mbps() - maxrate_mbps = settings.transcode.calculate_maxrate_mbps() - bufsize_mbps = settings.transcode.calculate_bufsize_mbps() - - config = TranscodeConfig( - videos=videos, - transcoded_videos=transcoded_videos, - resolution=Resolution.from_str(resolution), - codec=codec, - gop_size=gop_size, - episodes_info_json=episodes_info_json, - video_bitrate_mbps=video_bitrate_mbps, - minrate_mbps=minrate_mbps, - maxrate_mbps=maxrate_mbps, - bufsize_mbps=bufsize_mbps, - audio_bitrate_kbps=settings.transcode.audio_bitrate_kbps, - ) - config_dict = config.to_dict() - config_dict["state_manager"] = state_manager - config_dict["series_name"] = name or "unknown" - - with ResourceScope(): - transcoder = VideoTranscoder(config_dict) - exit_code = transcoder.work() - - if state_manager and exit_code == 0: - state_manager.cleanup() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/transcribe.py b/preprocessor/cli/commands/transcribe.py deleted file mode 100644 index fd6badc4a..000000000 --- a/preprocessor/cli/commands/transcribe.py +++ /dev/null @@ -1,82 +0,0 @@ -from pathlib import Path -import sys -from typing import Tuple - -import click - -from preprocessor.config.config import ( - TranscriptionConfig, - settings, -) -from preprocessor.processors.transcription_generator import TranscriptionGenerator -from preprocessor.utils.resource_scope import ResourceScope - -# pylint: disable=duplicate-code - - - -@click.command(context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--episodes-info-json", - 
type=click.Path(exists=True, path_type=Path), - required=True, - help="JSON file with episode metadata", -) -@click.option( - "--transcription-jsons", - type=click.Path(path_type=Path), - default=None, - help="Output directory for transcription JSONs", -) -@click.option( - "--model", - default=settings.transcription.model, - help="Whisper model: tiny, base, small, medium, large, large-v3-turbo", -) -@click.option( - "--language", - default=settings.transcription.language, - help="Language for transcription", -) -@click.option( - "--extra-json-keys", - multiple=True, - help="Additional JSON keys to remove from output (can specify multiple times)", -) -@click.option( - "--name", - required=True, - help="Series name for output files", -) -def transcribe( - videos: Path, - episodes_info_json: Path, - transcription_jsons: Path, - model: str, - language: str, - extra_json_keys: Tuple[str, ...], - name: str, -): - """Generate transcriptions using Whisper.""" - if transcription_jsons is None: - transcription_jsons = settings.transcription.get_output_dir(name) - - config = TranscriptionConfig( - videos=videos, - episodes_info_json=episodes_info_json, - transcription_jsons=transcription_jsons, - model=model, - language=language, - device="cuda", - extra_json_keys_to_remove=list(extra_json_keys), - name=name, - ) - - config_dict = config.to_dict() - - with ResourceScope(): - generator = TranscriptionGenerator(config_dict) - exit_code = generator.work() - - sys.exit(exit_code) diff --git a/preprocessor/cli/commands/transcribe_elevenlabs.py b/preprocessor/cli/commands/transcribe_elevenlabs.py deleted file mode 100644 index 85149594c..000000000 --- a/preprocessor/cli/commands/transcribe_elevenlabs.py +++ /dev/null @@ -1,84 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.cli.helpers import create_state_manager -from preprocessor.config.config import settings -from preprocessor.transcription.elevenlabs import ElevenLabsTranscriber -from 
preprocessor.utils.console import console - - -@click.command(name="transcribe-elevenlabs", context_settings={"show_default": True}) -@click.argument("videos", type=click.Path(exists=True, file_okay=False, path_type=Path)) -@click.option( - "--output-dir", - type=click.Path(path_type=Path), - default=None, - help="Output directory for transcriptions", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata", -) -@click.option("--name", required=True, help="Series name") -@click.option( - "--api-key", - envvar="ELEVEN_API_KEY", - help="ElevenLabs API key (or set ELEVEN_API_KEY env var)", -) -@click.option( - "--model-id", - default="scribe_v1", - help="ElevenLabs model ID", -) -@click.option( - "--language-code", - default="pol", - help="Language code: pol, eng, etc", -) -@click.option( - "--diarize/--no-diarize", - default=True, - help="Enable speaker diarization", -) -@click.option("--no-state", is_flag=True, help="Disable state management (no resume on interrupt)") -def transcribe_elevenlabs( - videos: Path, - output_dir: Path, - episodes_info_json: Path, - name: str, - api_key: str, - model_id: str, - language_code: str, - diarize: bool, - no_state: bool, -): - """Transcribe videos using ElevenLabs API.""" - if output_dir is None: - output_dir = settings.transcription.get_output_dir(name) - - state_manager = create_state_manager(name, no_state) - - transcriber = ElevenLabsTranscriber( - { - "videos": videos, - "output_dir": output_dir, - "episodes_info_json": episodes_info_json, - "series_name": name, - "api_key": api_key, - "model_id": model_id, - "language_code": language_code, - "diarize": diarize, - "state_manager": state_manager, - }, - ) - - exit_code = transcriber.work() - - if state_manager and exit_code == 0: - console.print("[green]Transcription completed successfully![/green]") - state_manager.cleanup() - - sys.exit(exit_code) diff --git 
a/preprocessor/cli/commands/validate.py b/preprocessor/cli/commands/validate.py deleted file mode 100644 index e90007d37..000000000 --- a/preprocessor/cli/commands/validate.py +++ /dev/null @@ -1,48 +0,0 @@ -from pathlib import Path -import sys - -import click - -from preprocessor.validation.validator import Validator - - -@click.command(context_settings={"show_default": True}) -@click.option( - "--season", - type=str, - required=True, - help="Season to validate (e.g., S10)", -) -@click.option( - "--anomaly-threshold", - type=float, - default=20.0, - help="Threshold for anomaly detection (%)", -) -@click.option( - "--series-name", - type=str, - default="ranczo", - help="Series name for file naming", -) -@click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - help="JSON file with episode metadata (optional, for episode titles)", -) -def validate( - season: str, - anomaly_threshold: float, - series_name: str, - episodes_info_json: Path, -): - """Validate preprocessor output for a season.""" - validator = Validator( - season=season, - series_name=series_name, - anomaly_threshold=anomaly_threshold, - episodes_info_json=episodes_info_json, - ) - - exit_code = validator.validate() - sys.exit(exit_code) diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index 272d5ae52..fd3348264 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -1,20 +1,76 @@ +from dataclasses import dataclass +import logging from pathlib import Path from typing import Optional +from preprocessor.core.context import ExecutionContext from preprocessor.core.state_manager import StateManager -from preprocessor.utils.console import console +from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.episodes.episode_manager import EpisodeManager +from preprocessor.lib.ui.console import console +def create_cli_logger(command_name: str, loglevel: int=logging.INFO) -> ErrorHandlingLogger: + return 
ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) + def create_state_manager(name: str, no_state: bool) -> Optional[StateManager]: if no_state or not name: return None - - state_manager = StateManager(series_name=name, working_dir=Path(".")) + state_manager: StateManager = StateManager(series_name=name, working_dir=Path('.')) state_manager.register_interrupt_handler() state_manager.load_or_create_state() - - resume_info = state_manager.get_resume_info() + resume_info: Optional[str] = state_manager.get_resume_info() if resume_info: - console.print(f"[cyan]{resume_info}[/cyan]") - + console.print(f'[cyan]{resume_info}[/cyan]') return state_manager + +def create_execution_context( + name: str, + logger: ErrorHandlingLogger, + no_state: bool = False, + force_rerun: bool = False, +) -> ExecutionContext: + state_manager: Optional[StateManager] = create_state_manager(name, no_state) + return ExecutionContext( + series_name=name, + base_output_dir=Path('preprocessor/output_data'), + state_manager=state_manager, + force_rerun=force_rerun, + logger=logger, + ) + +@dataclass +class PipelineSetup: + logger: ErrorHandlingLogger + state_manager: StateManager + context: ExecutionContext + episode_manager: Optional[EpisodeManager] = None + +def setup_pipeline_context( + series: str, + logger_name: str, + force_rerun: bool = False, + with_episode_manager: bool = True, +) -> PipelineSetup: + logger: ErrorHandlingLogger = create_cli_logger(logger_name) + state_manager: StateManager = StateManager(series) + state_manager.load_or_create_state() + context: ExecutionContext = ExecutionContext( + series_name=series, + base_output_dir=Path('preprocessor/output_data'), + logger=logger, + state_manager=state_manager, + force_rerun=force_rerun, + ) + episode_manager: Optional[EpisodeManager] = None + if with_episode_manager: + episodes_json: Optional[Path] = Path(f'preprocessor/input_data/{series}/episodes.json') + if not episodes_json.exists(): + 
episodes_json = None + episode_manager = EpisodeManager(episodes_json, series, logger) + return PipelineSetup( + logger=logger, + state_manager=state_manager, + context=context, + episode_manager=episode_manager, + ) diff --git a/preprocessor/cli/options/common.py b/preprocessor/cli/options/common.py deleted file mode 100644 index 504cc07c4..000000000 --- a/preprocessor/cli/options/common.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path - -import click - - -def episodes_info_option(required=True): - return click.option( - "--episodes-info-json", - type=click.Path(exists=True, path_type=Path), - required=required, - help="JSON file with episode metadata", - ) - - -def name_option(required=True): - return click.option( - "--name", - required=required, - help="Series name for state management and resume support", - ) - - -def state_option(): - return click.option( - "--no-state", - is_flag=True, - help="Disable state management (no resume on interrupt)", - ) - - -def videos_argument(): - return click.argument( - "videos", - type=click.Path(exists=True, file_okay=False, path_type=Path), - ) diff --git a/preprocessor/cli/pipeline/orchestrator.py b/preprocessor/cli/pipeline/orchestrator.py deleted file mode 100644 index 829252d94..000000000 --- a/preprocessor/cli/pipeline/orchestrator.py +++ /dev/null @@ -1,160 +0,0 @@ -from dataclasses import dataclass -import json -from pathlib import Path -from typing import ( - Any, - Callable, - Dict, - List, - Optional, -) - -from preprocessor.config.config import ( - get_output_path, - settings, -) -from preprocessor.core.processing_metadata import ProcessingMetadata -from preprocessor.core.state_manager import StateManager -from preprocessor.utils.console import console -from preprocessor.utils.resource_scope import ResourceScope - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -@dataclass -class PipelineStep: - name: str - step_num: str - execute_func: Callable - skip: bool = False - - -class 
PipelineOrchestrator: - def __init__(self, state_manager: Optional[StateManager] = None, series_name: Optional[str] = None, metadata_output_dir: Optional[Path] = None): - self.state_manager = state_manager - self.steps: List[PipelineStep] = [] - self.series_name = series_name - self.metadata_output_dir = metadata_output_dir - self.metadata: Optional[ProcessingMetadata] = None - - def add_step(self, name: str, step_num: str, func: Callable, skip: bool = False): - self.steps.append(PipelineStep(name, step_num, func, skip)) - - def execute(self, **params) -> int: - if self.series_name: - self.metadata = ProcessingMetadata(series_name=self.series_name, params=params) - - try: - exit_code = self.__run_all_steps(params) - if self.state_manager: - self.state_manager.cleanup() - self.__finalize_metadata(exit_code) - return exit_code - except KeyboardInterrupt: - console.print("\n[yellow]Pipeline interrupted by user[/yellow]") - self.__finalize_metadata(130) - return 130 - - def __run_all_steps(self, params: Dict[str, Any]) -> int: - for step in self.steps: - step_metadata = None - if self.metadata: - step_metadata = self.metadata.add_step(name=step.name, step_num=step.step_num) - - if step.skip: - console.print(f"[yellow]Step {step.step_num}: {step.name} - SKIPPED[/yellow]") - if step_metadata: - step_metadata.skip() - continue - - console.print(f"[bold blue]Step {step.step_num}: {step.name}[/bold blue]") - - if step_metadata: - step_metadata.start() - - try: - with ResourceScope(): - exit_code = step.execute_func(**params) - except KeyboardInterrupt: - console.print(f"\n[yellow]Step {step.step_num} interrupted[/yellow]") - if step_metadata: - step_metadata.finish(130) - return 130 - - if step_metadata: - step_metadata.finish(exit_code) - - if exit_code != 0: - console.print(f"[red]Step {step.step_num} failed with exit code {exit_code}[/red]") - return exit_code - - return 0 - - def __finalize_metadata(self, exit_code: int): - if self.metadata: - additional_stats = 
self.__collect_additional_statistics() - self.metadata.finish_processing(exit_code, additional_stats) - - if self.metadata_output_dir: - metadata_file = self.metadata_output_dir / f"{self.series_name}_processing_metadata.json" - self.metadata.save_to_file(metadata_file) - console.print(f"[green]Processing metadata saved to: {metadata_file}[/green]") - - def __collect_additional_statistics(self) -> Dict[str, Any]: # pylint: disable=too-many-locals - stats: Dict[str, Any] = {} - # noinspection PyBroadException - try: # pylint: disable=too-many-try-statements - transcription_jsons_dir = Path(self.metadata.params.get("transcription_jsons", "")) - if transcription_jsons_dir.exists(): - transcription_files = list(transcription_jsons_dir.rglob("*_segmented.json")) - stats["transcription_files_count"] = len(transcription_files) - stats["transcription_files"] = [f.name for f in transcription_files[:20]] - - transcoded_videos_dir = Path(self.metadata.params.get("transcoded_videos", "")) - if transcoded_videos_dir.exists(): - video_files = list(transcoded_videos_dir.rglob("*.mp4")) - stats["transcoded_videos_count"] = len(video_files) - total_size = sum(f.stat().st_size for f in video_files if f.is_file()) - stats["transcoded_videos_total_size_mb"] = round(total_size / (1024 * 1024), 2) - - output_frames_dir = Path(settings.frame_export.get_output_dir(self.series_name)) - if output_frames_dir.exists(): - frame_metadata_files = list(output_frames_dir.rglob("*_frame_metadata.json")) - stats["processed_episodes_count"] = len(frame_metadata_files) - total_frames = 0 - for metadata_file in frame_metadata_files: - try: - with open(metadata_file, "r", encoding="utf-8") as f: - data = json.load(f) - total_frames += data.get("statistics", {}).get("total_frames", 0) - except Exception: - pass - stats["total_frames_extracted"] = total_frames - - embeddings_dir = Path(settings.embedding.get_output_dir(self.series_name)) - if embeddings_dir.exists(): - text_embedding_files = 
list(embeddings_dir.rglob("*_embeddings_text.json")) - video_embedding_files = list(embeddings_dir.rglob("*_embeddings_video.json")) - stats["text_embedding_files_count"] = len(text_embedding_files) - stats["video_embedding_files_count"] = len(video_embedding_files) - - image_hashes_dir = Path(settings.image_hash.get_output_dir(self.series_name)) - if image_hashes_dir.exists(): - hash_files = list(image_hashes_dir.rglob("*_image_hashes.json")) - stats["image_hash_files_count"] = len(hash_files) - - elastic_docs_dir = get_output_path("elastic_documents") - if elastic_docs_dir.exists(): - segment_files = list((elastic_docs_dir / ELASTIC_SUBDIRS.text_segments).rglob("*.jsonl")) - text_emb_files = list((elastic_docs_dir / ELASTIC_SUBDIRS.text_embeddings).rglob("*.jsonl")) - video_frame_files = list((elastic_docs_dir / ELASTIC_SUBDIRS.video_frames).rglob("*.jsonl")) - stats["elastic_documents"] = { - ELASTIC_SUBDIRS.text_segments: len(segment_files), - ELASTIC_SUBDIRS.text_embeddings: len(text_emb_files), - ELASTIC_SUBDIRS.video_frames: len(video_frame_files), - } - - except Exception: - pass - - return stats diff --git a/preprocessor/cli/pipeline/steps.py b/preprocessor/cli/pipeline/steps.py deleted file mode 100644 index 9c5757dd9..000000000 --- a/preprocessor/cli/pipeline/steps.py +++ /dev/null @@ -1,608 +0,0 @@ -from pathlib import Path - -from preprocessor.characters.reference_downloader import CharacterReferenceDownloader -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS -from preprocessor.scraping.episode_scraper import EpisodeScraper -from preprocessor.utils.console import console -from preprocessor.video.frame_processor import FrameProcessor -from preprocessor.video.subprocessors import ( - CharacterDetectionSubProcessor, - CharacterDetectionVisualizationSubProcessor, - ImageHashSubProcessor, - ObjectDetectionSubProcessor, - 
ObjectDetectionVisualizationSubProcessor, - VideoEmbeddingSubProcessor, -) -from preprocessor.video.subprocessors.emotion_detection_subprocessor import EmotionDetectionSubProcessor -from preprocessor.video.subprocessors.face_clustering_subprocessor import FaceClusteringSubProcessor - -# pylint: disable=duplicate-code - - -def run_scrape_step(scrape_urls, episodes_info_json, videos=None, parser_mode="normal", **_kwargs): - if not scrape_urls: - return 0 - - if episodes_info_json.exists(): - console.print( - f"\n[yellow]Scraping episode metadata... SKIPPED (file exists: {episodes_info_json})[/yellow]", - ) - return 0 - - scraper = EpisodeScraper( - { - "urls": list(scrape_urls), - "output_file": episodes_info_json, - "headless": True, - "merge_sources": True, - "videos_dir": videos, - "parser_mode": parser_mode, - }, - ) - scrape_exit_code = scraper.work() - - if scrape_exit_code != 0: - console.print("[red]Scraping failed, aborting pipeline[/red]") - return scrape_exit_code - - console.print(f"[green]Episode metadata saved to: {episodes_info_json}[/green]") - return 0 - - -def run_character_scrape_step(character_urls, characters_json, name, parser_mode="normal", **_kwargs): - from preprocessor.scraping.character_scraper import CharacterScraper # pylint: disable=import-outside-toplevel - - if not character_urls: - return 0 - - if characters_json.exists(): - console.print( - f"\n[yellow]Scraping character metadata... 
SKIPPED (file exists: {characters_json})[/yellow]", - ) - return 0 - - scraper = CharacterScraper( - { - "urls": list(character_urls), - "output_file": characters_json, - "series_name": name, - "headless": True, - "parser_mode": parser_mode, - }, - ) - scrape_exit_code = scraper.work() - - if scrape_exit_code != 0: - console.print("[red]Character scraping failed[/red]") - return scrape_exit_code - - console.print(f"[green]Character metadata saved to: {characters_json}[/green]") - return 0 - - -def run_character_reference_download_step(name, characters_json, search_mode="normal", **_kwargs): - if not characters_json.exists(): - console.print("[yellow]No characters.json found, skipping reference download[/yellow]") - return 0 - - downloader = CharacterReferenceDownloader( - { - "characters_json": characters_json, - "series_name": name, - "output_dir": settings.character.get_output_dir(name), - "images_per_character": settings.character.reference_images_per_character, - "search_mode": search_mode, - }, - ) - return downloader.work() - - -def run_character_reference_processing_step(name, state_manager, interactive_character_processing=False, debug_visualizations=False, **_kwargs): - from preprocessor.characters.reference_processor import CharacterReferenceProcessor # pylint: disable=import-outside-toplevel - - characters_dir = settings.character.get_output_dir(name) - if not characters_dir.exists() or not list(characters_dir.iterdir()): - console.print("[yellow]No character references found, skipping processing[/yellow]") - return 0 - - processor = CharacterReferenceProcessor( - { - "characters_dir": characters_dir, - "output_dir": settings.character.get_processed_references_dir(name), - "similarity_threshold": settings.character.reference_matching_threshold, - "interactive": interactive_character_processing, - "series_name": name, - "state_manager": state_manager, - }, - ) - exit_code = processor.work() - - if exit_code == 0 and debug_visualizations: - 
processor.generate_validation_grid() - - return exit_code - - -def run_character_detection_step(**kwargs): - from preprocessor.processors.character_detector import CharacterDetector # pylint: disable=import-outside-toplevel - - name = kwargs.get("name") - frames_dir = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) - characters_dir = settings.character.get_output_dir(name) - output_dir = settings.character.get_detections_dir(name) - episodes_info_json = kwargs.get("episodes_info_json") - state_manager = kwargs.get("state_manager") - - detector = CharacterDetector( - { - "frames_dir": frames_dir, - "characters_dir": characters_dir, - "output_dir": output_dir, - "episodes_info_json": episodes_info_json, - "series_name": name, - "state_manager": state_manager, - }, - ) - return detector.work() - - -def run_transcode_step(videos, episodes_info_json, name, resolution, codec, state_manager, **kwargs): - from preprocessor.config.config import TranscodeConfig # pylint: disable=import-outside-toplevel - from preprocessor.processors.video_transcoder import VideoTranscoder # pylint: disable=import-outside-toplevel - from preprocessor.utils.resolution import Resolution # pylint: disable=import-outside-toplevel - - transcoded_videos = kwargs.get("transcoded_videos") - - video_bitrate_mbps = settings.transcode.calculate_video_bitrate_mbps() - minrate_mbps = settings.transcode.calculate_minrate_mbps() - maxrate_mbps = settings.transcode.calculate_maxrate_mbps() - bufsize_mbps = settings.transcode.calculate_bufsize_mbps() - - transcode_config = TranscodeConfig( - videos=videos, - transcoded_videos=transcoded_videos, - resolution=Resolution.from_str(resolution), - codec=codec, - gop_size=settings.transcode.gop_size, - episodes_info_json=episodes_info_json, - video_bitrate_mbps=video_bitrate_mbps, - minrate_mbps=minrate_mbps, - maxrate_mbps=maxrate_mbps, - bufsize_mbps=bufsize_mbps, - audio_bitrate_kbps=settings.transcode.audio_bitrate_kbps, - ) - 
transcode_dict = transcode_config.to_dict() - transcode_dict["state_manager"] = state_manager - transcode_dict["series_name"] = name - - transcoder = VideoTranscoder(transcode_dict) - return transcoder.work() - - -def run_transcribe_step(videos, episodes_info_json, name, model, language, device, ramdisk_path, state_manager, transcription_mode="normal", **kwargs): - transcription_jsons = kwargs.get("transcription_jsons") - - if transcription_mode == "premium": - from preprocessor.transcription.elevenlabs import ElevenLabsTranscriber # pylint: disable=import-outside-toplevel - - console.print("[cyan]Using premium transcription mode (ElevenLabs API)[/cyan]") - - transcriber = ElevenLabsTranscriber( - { - "videos": videos, - "output_dir": transcription_jsons, - "episodes_info_json": episodes_info_json, - "series_name": name, - "api_key": settings.elevenlabs.api_key, - "model_id": settings.elevenlabs.model_id, - "language_code": settings.elevenlabs.language_code, - "diarize": settings.elevenlabs.diarize, - "state_manager": state_manager, - }, - ) - return transcriber.work() - - from preprocessor.config.config import TranscriptionConfig # pylint: disable=import-outside-toplevel - from preprocessor.processors.transcription_generator import TranscriptionGenerator # pylint: disable=import-outside-toplevel - - console.print("[cyan]Using normal transcription mode (Whisper)[/cyan]") - - transcription_config = TranscriptionConfig( - videos=videos, - episodes_info_json=episodes_info_json, - transcription_jsons=transcription_jsons, - model=model, - language=language, - device=device, - extra_json_keys_to_remove=[], - name=name, - ) - transcription_dict = transcription_config.to_dict() - transcription_dict["state_manager"] = state_manager - transcription_dict["series_name"] = name - transcription_dict["ramdisk_path"] = ramdisk_path - - generator = TranscriptionGenerator(transcription_dict) - return generator.work() - - -def run_sound_separation_step(name, episodes_info_json, 
transcription_jsons, state_manager, **_kwargs): - from preprocessor.transcription.processors.sound_separator import SoundEventSeparator # pylint: disable=import-outside-toplevel - - separator = SoundEventSeparator( - { - "transcription_dir": transcription_jsons, - "episodes_info_json": episodes_info_json, - "series_name": name, - "state_manager": state_manager, - }, - ) - return separator.work() - - -def run_scene_step(device, **kwargs): - from preprocessor.processors.scene_detector import SceneDetector # pylint: disable=import-outside-toplevel - - videos = kwargs.get("videos") - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - - detector = SceneDetector( - { - "videos": videos, - "output_dir": scene_timestamps_dir, - "threshold": settings.scene_detection.threshold, - "min_scene_len": settings.scene_detection.min_scene_len, - "device": device, - "series_name": name, - "episodes_info_json": episodes_info_json, - }, - ) - exit_code = detector.work() - detector.cleanup() - return exit_code - - -def run_frame_export_step(state_manager, **kwargs): - from preprocessor.processors.frame_exporter import FrameExporter # pylint: disable=import-outside-toplevel - - videos = kwargs.get("videos") - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - output_frames = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) - - exporter = FrameExporter( - { - "videos": videos, - "scene_timestamps_dir": scene_timestamps_dir, - "output_frames": output_frames, - "resolution": settings.frame_export.resolution, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - return exporter.work() - - -def run_image_hashing_step(device, state_manager, **kwargs): - from preprocessor.processors.image_hash_processor import ImageHashProcessor # 
pylint: disable=import-outside-toplevel - - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - frames_dir = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) - - hasher = ImageHashProcessor( - { - "frames_dir": frames_dir, - "output_dir": settings.image_hash.get_output_dir(name), - "batch_size": settings.embedding.batch_size, - "device": device, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - exit_code = hasher.work() - hasher.cleanup() - return exit_code - - -def run_embedding_step(device, state_manager, **kwargs): - from preprocessor.processors.embedding_generator import EmbeddingGenerator # pylint: disable=import-outside-toplevel - - transcription_jsons = kwargs.get("transcription_jsons") - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - frames_dir = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) - skip_full_episode = kwargs.get("skip_full_episode", False) - - embedding_generator = EmbeddingGenerator( - { - "transcription_jsons": transcription_jsons, - "frames_dir": frames_dir, - "output_dir": settings.embedding.get_output_dir(name), - "image_hashes_dir": settings.image_hash.get_output_dir(name), - "model": settings.embedding_model.model_name, - "segments_per_embedding": settings.text_chunking.segments_per_embedding, - "generate_text": True, - "generate_video": False, - "generate_full_episode": not skip_full_episode and settings.embedding.generate_full_episode_embedding, - "device": device, - "batch_size": settings.embedding.batch_size, - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - exit_code = embedding_generator.work() - embedding_generator.cleanup() - return exit_code - - -def run_elastic_documents_step(**kwargs): - from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel - from 
preprocessor.processors.elastic_document_generator import ElasticDocumentGenerator # pylint: disable=import-outside-toplevel - - name = kwargs.get("name") - base_output = get_base_output_dir(name) - - transcription_jsons = base_output / settings.output_subdirs.transcriptions - embeddings_dir = base_output / settings.output_subdirs.embeddings - scene_timestamps_dir = kwargs.get("scene_timestamps_dir") or (base_output / settings.output_subdirs.scenes) - character_detections_dir = base_output / settings.output_subdirs.character_detections - object_detections_dir = base_output / settings.output_subdirs.object_detections - episodes_info_json = kwargs.get("episodes_info_json") - - generator = ElasticDocumentGenerator( - { - "transcription_jsons": transcription_jsons, - "embeddings_dir": embeddings_dir, - "scene_timestamps_dir": scene_timestamps_dir, - "character_detections_dir": character_detections_dir, - "object_detections_dir": object_detections_dir, - "output_dir": get_output_path("elastic_documents"), - "series_name": name, - "episodes_info_json": episodes_info_json, - }, - ) - return generator.work() - - -def run_index_step(name, dry_run, state_manager, **kwargs): - from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel - from preprocessor.processors.elasticsearch_indexer import ElasticSearchIndexer # pylint: disable=import-outside-toplevel - - episodes_info_json = kwargs.get("episodes_info_json") - elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents) - - indexer = ElasticSearchIndexer({ - "name": name, - "elastic_documents_dir": elastic_documents_dir, - "dry_run": dry_run, - "append": False, - "state_manager": state_manager, - "series_name": name, - "episodes_info_json": episodes_info_json, - }) - return indexer.work() - - -def run_frame_processing_step( # pylint: disable=too-many-locals,too-many-arguments - device, - state_manager, - ramdisk_path, - skip_image_hashing, - 
skip_video_embeddings, - skip_character_detection, - skip_emotion_detection, - skip_character_visualization, - skip_face_clustering, - skip_object_detection, - skip_object_visualization, - debug_visualizations=False, - **kwargs, -): - name = kwargs.get("name") - episodes_info_json = kwargs.get("episodes_info_json") - output_frames = kwargs.get("output_frames", settings.frame_export.get_output_dir(name)) - - processor = FrameProcessor( - { - "frames_dir": output_frames, - "ramdisk_path": ramdisk_path or Path("/dev/shm"), - "series_name": name, - "episodes_info_json": episodes_info_json, - "state_manager": state_manager, - }, - ) - - sub_processors = [] - - if not skip_image_hashing: - hash_sub = ImageHashSubProcessor( - device=device, - batch_size=settings.embedding.batch_size, - ) - processor.add_sub_processor(hash_sub) - sub_processors.append(hash_sub) - - if not skip_video_embeddings: - embedding_sub = VideoEmbeddingSubProcessor( - device=device, - batch_size=settings.embedding.batch_size, - model_name=settings.embedding_model.model_name, - model_revision=settings.embedding_model.model_revision, - ) - processor.add_sub_processor(embedding_sub) - sub_processors.append(embedding_sub) - - if not skip_character_detection: - char_detection_sub = CharacterDetectionSubProcessor( - characters_dir=Path(settings.character.get_output_dir(name)), - use_gpu=True, - threshold=settings.character.frame_detection_threshold, - ) - processor.add_sub_processor(char_detection_sub) - sub_processors.append(char_detection_sub) - - if not skip_emotion_detection: - emotion_detection_sub = EmotionDetectionSubProcessor() - processor.add_sub_processor(emotion_detection_sub) - sub_processors.append(emotion_detection_sub) - - if not skip_character_visualization: - char_viz_sub = CharacterDetectionVisualizationSubProcessor() - processor.add_sub_processor(char_viz_sub) - sub_processors.append(char_viz_sub) - - if not skip_face_clustering: - face_clustering_sub = FaceClusteringSubProcessor( - 
min_cluster_size=settings.face_clustering.min_cluster_size, - min_samples=settings.face_clustering.min_samples, - save_noise=settings.face_clustering.save_noise, - save_full_frames=debug_visualizations, - ) - processor.add_sub_processor(face_clustering_sub) - sub_processors.append(face_clustering_sub) - - if not skip_object_detection: - object_detection_sub = ObjectDetectionSubProcessor( - model_name=settings.object_detection.model_name, - conf_threshold=settings.object_detection.conf_threshold, - ) - processor.add_sub_processor(object_detection_sub) - sub_processors.append(object_detection_sub) - - if not skip_object_visualization: - object_viz_sub = ObjectDetectionVisualizationSubProcessor() - processor.add_sub_processor(object_viz_sub) - sub_processors.append(object_viz_sub) - - try: - return processor.work() - finally: - for sub in sub_processors: - sub.cleanup() - processor.cleanup() - - -def run_validation_step(name, episodes_info_json, **kwargs): # pylint: disable=too-many-locals - from preprocessor.validation.global_validator import GlobalValidator # pylint: disable=import-outside-toplevel - from preprocessor.validation.validator import Validator # pylint: disable=import-outside-toplevel - - base_output = get_base_output_dir(name) - - console.print("[bold cyan]Running global validation...[/bold cyan]") - global_validator = GlobalValidator(series_name=name, base_output_dir=base_output) - global_result = global_validator.validate() - - validation_reports_dir = base_output / settings.output_subdirs.validation_reports - validation_reports_dir.mkdir(parents=True, exist_ok=True) - - from preprocessor.utils.file_utils import atomic_write_json # pylint: disable=import-outside-toplevel - global_report_path = validation_reports_dir / f"{name}_global.json" - atomic_write_json(global_report_path, global_result.to_dict()) - - if global_result.errors: - console.print(f"[red]Global validation errors: {len(global_result.errors)}[/red]") - for error in 
global_result.errors[:5]: - console.print(f" - {error}") - if global_result.warnings: - console.print(f"[yellow]Global validation warnings: {len(global_result.warnings)}[/yellow]") - - input_videos_path = kwargs.get("videos") - if not input_videos_path or not input_videos_path.exists(): - console.print("[yellow]No input videos directory found, skipping episode validation[/yellow]") - return 0 - - seasons = sorted([d for d in input_videos_path.iterdir() if d.is_dir() and d.name.startswith("S")]) - if not seasons: - console.print("[yellow]No seasons found in input videos directory, skipping episode validation[/yellow]") - return 0 - - seasons_with_videos = [] - for season_dir in seasons: - video_files = [] - for ext in SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(list(season_dir.glob(f"**/*{ext}"))) - if video_files: - seasons_with_videos.append(season_dir) - console.print(f"[cyan]Found {len(video_files)} video file(s) in {season_dir.name}[/cyan]") - else: - console.print(f"[yellow]Skipping {season_dir.name}: no video files found[/yellow]") - - if not seasons_with_videos: - console.print("[yellow]No seasons with video files found, skipping episode validation[/yellow]") - return 0 - - for season_dir in seasons_with_videos: - import re # pylint: disable=import-outside-toplevel - - season_name = season_dir.name - match = re.search(r'(\d+)', season_name) - if match: - season_number = int(match.group(1)) - season = f"S{season_number:02d}" - else: - season = season_name - - validator = Validator( - season=season, - series_name=name, - anomaly_threshold=20.0, - base_output_dir=base_output, - episodes_info_json=episodes_info_json, - ) - - console.print(f"[cyan]Validating season {season} (from folder: {season_name})...[/cyan]") - exit_code = validator.validate() - - if exit_code != 0: - console.print(f"[red]Validation failed for season {season}[/red]") - return exit_code - - console.print("[green]All validations completed successfully[/green]") - return 0 - - -def 
run_text_analysis_step(name, episodes_info_json, language, state_manager, **_kwargs): - from preprocessor.processors.text_analyzer import TextAnalyzer # pylint: disable=import-outside-toplevel - - analyzer = TextAnalyzer( - { - "series_name": name, - "episodes_info_json": episodes_info_json, - "language": language, - "state_manager": state_manager, - }, - ) - return analyzer.work() - - -def run_archive_generation_step(**kwargs): - from preprocessor.config.config import get_output_path # pylint: disable=import-outside-toplevel - from preprocessor.processors.archive_generator import ArchiveGenerator # pylint: disable=import-outside-toplevel - - name = kwargs.get("name") - base_output = get_base_output_dir(name) - - elastic_documents_dir = get_output_path(settings.output_subdirs.elastic_documents, name) - output_dir = base_output / settings.output_subdirs.archives - episodes_info_json = kwargs.get("episodes_info_json") - - generator = ArchiveGenerator( - { - "elastic_documents_dir": elastic_documents_dir, - "output_dir": output_dir, - "series_name": name, - "episodes_info_json": episodes_info_json, - }, - ) - return generator.work() diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 4679bd243..b93c7659c 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -14,74 +14,56 @@ from pydantic import SecretStr -from preprocessor.utils.resolution import Resolution +from preprocessor.lib.media.resolution import Resolution -# ============================================================================ -# CONSTANTS & HELPERS -# ============================================================================ +is_docker = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' +BASE_OUTPUT_DIR = Path('/app/output_data') if is_docker else Path('preprocessor/output_data') -is_docker = os.getenv("DOCKER_CONTAINER", "false").lower() == "true" -BASE_OUTPUT_DIR = Path("/app/output_data") if is_docker else 
Path("preprocessor/output_data") - - -def get_base_output_dir(series_name: Optional[str] = None) -> Path: - base = Path("/app/output_data") if is_docker else Path("preprocessor/output_data") +def get_base_output_dir(series_name: Optional[str]=None) -> Path: + base = Path('/app/output_data') if is_docker else Path('preprocessor/output_data') if series_name: return base / series_name.lower() return base - -def get_output_path(relative_path: str, series_name: Optional[str] = None) -> Path: +def get_output_path(relative_path: str, series_name: Optional[str]=None) -> Path: return get_base_output_dir(series_name) / relative_path - -# ============================================================================ -# OUTPUT DIRECTORY STRUCTURE -# ============================================================================ - @dataclass class ElasticDocumentSubdirs: - text_segments: str = "text_segments" - text_embeddings: str = "text_embeddings" - video_frames: str = "video_frames" - episode_names: str = "episode_names" - text_statistics: str = "text_statistics" - full_episode_embeddings: str = "full_episode_embeddings" - sound_events: str = "sound_events" - sound_event_embeddings: str = "sound_event_embeddings" - + text_segments: str = 'text_segments' + text_embeddings: str = 'text_embeddings' + video_frames: str = 'video_frames' + episode_names: str = 'episode_names' + text_statistics: str = 'text_statistics' + full_episode_embeddings: str = 'full_episode_embeddings' + sound_events: str = 'sound_events' + sound_event_embeddings: str = 'sound_event_embeddings' @dataclass class TranscriptionSubdirs: - raw: str = "raw" - clean: str = "clean" - sound_events: str = "sound_events" - + raw: str = 'raw' + clean: str = 'clean' + sound_events: str = 'sound_events' @dataclass class OutputSubdirs: # pylint: disable=too-many-instance-attributes - video: str = "transcoded_videos" - transcriptions: str = "transcriptions" + video: str = 'transcoded_videos' + transcriptions: str = 
'transcriptions' transcription_subdirs: TranscriptionSubdirs = field(default_factory=TranscriptionSubdirs) - scenes: str = "scene_timestamps" - frames: str = "exported_frames" - embeddings: str = "embeddings" - image_hashes: str = "image_hashes" - character_detections: str = "character_detections" - character_visualizations: str = "character_detections/visualizations" - face_clusters: str = "face_clusters" - object_detections: str = "object_detections" - object_visualizations: str = "object_detections/visualizations" - elastic_documents: str = "elastic_documents" - archives: str = "archives" - validation_reports: str = "validation_reports" + scenes: str = 'scene_timestamps' + frames: str = 'exported_frames' + embeddings: str = 'embeddings' + image_hashes: str = 'image_hashes' + character_detections: str = 'character_detections' + character_visualizations: str = 'character_detections/visualizations' + face_clusters: str = 'face_clusters' + object_detections: str = 'object_detections' + object_visualizations: str = 'object_detections/visualizations' + elastic_documents: str = 'elastic_documents' + archives: str = 'archives' + validation_reports: str = 'validation_reports' elastic_document_subdirs: ElasticDocumentSubdirs = field(default_factory=ElasticDocumentSubdirs) - -# ============================================================================ -# BASE CLASSES -# ============================================================================ - @dataclass class BaseAPISettings: _api_key: Optional[SecretStr] = None @@ -90,14 +72,9 @@ class BaseAPISettings: def api_key(self) -> Optional[str]: return self._api_key.get_secret_value() if self._api_key else None - -# ============================================================================ -# VIDEO PROCESSING -# ============================================================================ - @dataclass class TranscodeSettings: - codec: str = "h264_nvenc" + codec: str = 'h264_nvenc' target_file_size_mb: float = 50.0 
target_duration_seconds: float = 100.0 audio_bitrate_kbps: int = 128 @@ -105,24 +82,23 @@ class TranscodeSettings: @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "transcoded_videos" + return get_base_output_dir(series_name) / 'transcoded_videos' def calculate_video_bitrate_mbps(self) -> float: - total_bitrate_mbps = (self.target_file_size_mb * 8) / self.target_duration_seconds + total_bitrate_mbps = self.target_file_size_mb * 8 / self.target_duration_seconds audio_bitrate_mbps = self.audio_bitrate_kbps / 1000.0 video_bitrate_mbps = total_bitrate_mbps - audio_bitrate_mbps return round(video_bitrate_mbps, 2) - def calculate_minrate_mbps(self, percent: float = 0.5) -> float: + def calculate_minrate_mbps(self, percent: float=0.5) -> float: return round(self.calculate_video_bitrate_mbps() * percent, 2) - def calculate_maxrate_mbps(self, percent: float = 1.75) -> float: + def calculate_maxrate_mbps(self, percent: float=1.75) -> float: return round(self.calculate_video_bitrate_mbps() * percent, 2) - def calculate_bufsize_mbps(self, multiplier: float = 2.0) -> float: + def calculate_bufsize_mbps(self, multiplier: float=2.0) -> float: return round(self.calculate_video_bitrate_mbps() * multiplier, 2) - @dataclass class SceneDetectionSettings: threshold: float = 0.5 @@ -130,54 +106,42 @@ class SceneDetectionSettings: @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "scene_timestamps" - + return get_base_output_dir(series_name) / 'scene_timestamps' @dataclass class SceneChangesSettings: frames_per_scene: int = 1 - @dataclass class KeyframeExtractionSettings: - strategy: str = "scene_changes" + strategy: str = 'scene_changes' scene_changes: SceneChangesSettings = field(default_factory=SceneChangesSettings) - @dataclass class FrameExportSettings: resolution: Resolution = Resolution.R1080P @staticmethod def get_output_dir(series_name: str) -> Path: - return 
get_base_output_dir(series_name) / "exported_frames" - - -# ============================================================================ -# TRANSCRIPTION & TEXT PROCESSING -# ============================================================================ + return get_base_output_dir(series_name) / 'exported_frames' @dataclass class TranscriptionSettings: - model: str = "large-v3-turbo" - language: str = "Polish" - device: str = "cuda" + model: str = 'large-v3-turbo' + language: str = 'Polish' + device: str = 'cuda' @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "transcriptions" - + return get_base_output_dir(series_name) / 'transcriptions' @dataclass class WhisperSettings: - model: str = "large-v3-turbo" + model: str = 'large-v3-turbo' @classmethod - def _from_env(cls) -> "WhisperSettings": - return cls( - model=os.getenv("WHISPER_MODEL", "large-v3-turbo"), - ) - + def _from_env(cls) -> 'WhisperSettings': + return cls(model=os.getenv('WHISPER_MODEL', 'large-v3-turbo')) @dataclass class TextChunkingSettings: @@ -185,41 +149,34 @@ class TextChunkingSettings: text_sentences_per_chunk: int = 8 text_chunk_overlap: int = 3 - @dataclass class ElevenLabsSettings(BaseAPISettings): - model_id: str = "scribe_v1" - language_code: str = "pol" + model_id: str = 'scribe_v1' + language_code: str = 'pol' diarize: bool = True polling_interval: int = 20 max_attempts: int = 60 @classmethod - def _from_env(cls) -> "ElevenLabsSettings": + def _from_env(cls) -> 'ElevenLabsSettings': api_key = None - if os.getenv("ELEVEN_API_KEY"): - api_key = SecretStr(os.getenv("ELEVEN_API_KEY", "")) + if os.getenv('ELEVEN_API_KEY'): + api_key = SecretStr(os.getenv('ELEVEN_API_KEY', '')) return cls(_api_key=api_key) - -# ============================================================================ -# EMBEDDINGS -# ============================================================================ - @dataclass class EmbeddingModelSettings: - model_name: str = 
"Qwen/Qwen3-VL-Embedding-8B" - model_revision: str = "main" + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + model_revision: str = 'main' embedding_dim: int = 4096 gpu_memory_utilization: float = 0.85 tensor_parallel_size: int = 1 max_model_len: int = 8192 - image_placeholder: str = "<|vision_start|><|image_pad|><|vision_end|>" + image_placeholder: str = '<|vision_start|><|image_pad|><|vision_end|>' enable_chunked_prefill: bool = True max_num_batched_tokens: int = 8192 enforce_eager: bool = False - @dataclass class EmbeddingSettings: batch_size: int = 32 @@ -230,19 +187,13 @@ class EmbeddingSettings: @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "embeddings" - - -# ============================================================================ -# COMPUTER VISION -# ============================================================================ + return get_base_output_dir(series_name) / 'embeddings' @dataclass class FaceRecognitionSettings: - model_name: str = "buffalo_l" + model_name: str = 'buffalo_l' detection_size: Tuple[int, int] = (1280, 1280) - @dataclass class FaceClusteringSettings: min_cluster_size: int = 5 @@ -251,68 +202,60 @@ class FaceClusteringSettings: @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "face_clusters" - + return get_base_output_dir(series_name) / 'face_clusters' @dataclass class EmotionDetectionSettings: - model_name: str = "enet_b2_8" + model_name: str = 'enet_b2_8' @classmethod - def _from_env(cls) -> "EmotionDetectionSettings": - model_name = os.getenv("EMOTION_MODEL_NAME", "enet_b2_8") + def _from_env(cls) -> 'EmotionDetectionSettings': + model_name = os.getenv('EMOTION_MODEL_NAME', 'enet_b2_8') return cls(model_name=model_name) - @dataclass class CharacterSettings: reference_images_per_character: int = 3 normalized_face_size: Tuple[int, int] = (112, 112) face_detection_threshold: float = 0.2 - reference_matching_threshold: 
float = 0.50 + reference_matching_threshold: float = 0.5 frame_detection_threshold: float = 0.55 @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "characters" + return get_base_output_dir(series_name) / 'characters' @staticmethod def get_characters_list_file(series_name: str) -> Path: - return get_base_output_dir(series_name) / "characters.json" + return get_base_output_dir(series_name) / 'characters.json' @staticmethod def get_detections_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "character_detections" + return get_base_output_dir(series_name) / 'character_detections' @staticmethod def get_processed_references_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "character_references_processed" - + return get_base_output_dir(series_name) / 'character_references_processed' @dataclass class ObjectDetectionSettings: - model_name: str = "ustc-community/dfine-xlarge-obj2coco" - conf_threshold: float = 0.30 + model_name: str = 'ustc-community/dfine-xlarge-obj2coco' + conf_threshold: float = 0.3 @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "object_detections" + return get_base_output_dir(series_name) / 'object_detections' @staticmethod def get_visualized_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "object_detections" / "visualizations" - - -# ============================================================================ -# UTILITIES -# ============================================================================ + return get_base_output_dir(series_name) / 'object_detections' / 'visualizations' @dataclass class ImageHashSettings: + @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "image_hashes" - + return get_base_output_dir(series_name) / 'image_hashes' @dataclass class ImageScraperSettings(BaseAPISettings): @@ 
-326,57 +269,43 @@ class ImageScraperSettings(BaseAPISettings): page_navigation_timeout: int = 30000 @classmethod - def _from_env(cls) -> "ImageScraperSettings": + def _from_env(cls) -> 'ImageScraperSettings': api_key = None - if os.getenv("SERPAPI_API_KEY"): - api_key = SecretStr(os.getenv("SERPAPI_API_KEY", "")) + if os.getenv('SERPAPI_API_KEY'): + api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) return cls(_api_key=api_key) @property def serpapi_key(self) -> Optional[str]: return self.api_key - @dataclass class ScraperSettings: + @staticmethod def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / "scraped_pages" - - -# ============================================================================ -# EXTERNAL SERVICES -# ============================================================================ + return get_base_output_dir(series_name) / 'scraped_pages' @dataclass class ElasticsearchSettings: - host: str = "" - user: str = "" - password: str = "" + host: str = '' + user: str = '' + password: str = '' @classmethod - def _from_env(cls) -> "ElasticsearchSettings": - return cls( - host=os.getenv("ES_HOST", ""), - user=os.getenv("ES_USER", ""), - password=os.getenv("ES_PASS", ""), - ) - + def _from_env(cls) -> 'ElasticsearchSettings': + return cls(host=os.getenv('ES_HOST', ''), user=os.getenv('ES_USER', ''), password=os.getenv('ES_PASS', '')) @dataclass class GeminiSettings(BaseAPISettings): + @classmethod - def _from_env(cls) -> "GeminiSettings": + def _from_env(cls) -> 'GeminiSettings': api_key = None - if os.getenv("GEMINI_API_KEY"): - api_key = SecretStr(os.getenv("GEMINI_API_KEY", "")) + if os.getenv('GEMINI_API_KEY'): + api_key = SecretStr(os.getenv('GEMINI_API_KEY', '')) return cls(_api_key=api_key) - -# ============================================================================ -# MAIN SETTINGS -# ============================================================================ - @dataclass class Settings: # pylint: 
disable=too-many-instance-attributes output_subdirs: OutputSubdirs @@ -402,7 +331,7 @@ class Settings: # pylint: disable=too-many-instance-attributes transcription: TranscriptionSettings @classmethod - def _from_env(cls) -> "Settings": + def _from_env(cls) -> 'Settings': return cls( output_subdirs=OutputSubdirs(), whisper=WhisperSettings._from_env(), @@ -427,11 +356,6 @@ def _from_env(cls) -> "Settings": transcription=TranscriptionSettings(), ) - -# ============================================================================ -# PIPELINE CONFIGS -# ============================================================================ - @dataclass class TranscodeConfig: videos: Path @@ -448,20 +372,19 @@ class TranscodeConfig: def to_dict(self) -> Dict[str, Any]: return { - "videos": self.videos, - "transcoded_videos": self.transcoded_videos, - "resolution": self.resolution, - "codec": self.codec, - "video_bitrate_mbps": self.video_bitrate_mbps, - "minrate_mbps": self.minrate_mbps, - "maxrate_mbps": self.maxrate_mbps, - "bufsize_mbps": self.bufsize_mbps, - "audio_bitrate_kbps": self.audio_bitrate_kbps, - "gop_size": self.gop_size, - "episodes_info_json": self.episodes_info_json, + 'videos': self.videos, + 'transcoded_videos': self.transcoded_videos, + 'resolution': self.resolution, + 'codec': self.codec, + 'video_bitrate_mbps': self.video_bitrate_mbps, + 'minrate_mbps': self.minrate_mbps, + 'maxrate_mbps': self.maxrate_mbps, + 'bufsize_mbps': self.bufsize_mbps, + 'audio_bitrate_kbps': self.audio_bitrate_kbps, + 'gop_size': self.gop_size, + 'episodes_info_json': self.episodes_info_json, } - @dataclass class TranscriptionConfig: videos: Path @@ -475,17 +398,16 @@ class TranscriptionConfig: def to_dict(self) -> Dict[str, Any]: return { - "videos": self.videos, - "episodes_info_json": self.episodes_info_json, - "transcription_jsons": self.transcription_jsons, - "model": self.model, - "language": self.language, - "device": self.device, - "extra_json_keys_to_remove": 
self.extra_json_keys_to_remove, - "name": self.name, + 'videos': self.videos, + 'episodes_info_json': self.episodes_info_json, + 'transcription_jsons': self.transcription_jsons, + 'model': self.model, + 'language': self.language, + 'device': self.device, + 'extra_json_keys_to_remove': self.extra_json_keys_to_remove, + 'name': self.name, } - @dataclass class IndexConfig: name: str @@ -494,16 +416,5 @@ class IndexConfig: append: bool = False def to_dict(self) -> Dict[str, Any]: - return { - "name": self.name, - "transcription_jsons": str(self.transcription_jsons), - "dry_run": self.dry_run, - "append": self.append, - } - - -# ============================================================================ -# GLOBAL INSTANCE -# ============================================================================ - + return {'name': self.name, 'transcription_jsons': str(self.transcription_jsons), 'dry_run': self.dry_run, 'append': self.append} settings = Settings._from_env() diff --git a/preprocessor/config/constants.py b/preprocessor/config/constants.py new file mode 100644 index 000000000..97df95324 --- /dev/null +++ b/preprocessor/config/constants.py @@ -0,0 +1,66 @@ +SUPPORTED_VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mkv', '.mov', '.flv', '.wmv', '.webm') +DEFAULT_VIDEO_EXTENSION = '.mp4' + +FILE_SUFFIXES = { + 'segmented': '_segmented', + 'text_segments': '_text_segments', + 'simple': '_simple', + 'clean': '_clean_transcription', + 'clean_alt': '_clean', + 'scenes': '_scenes', + 'sound_events': '_sound_events', + 'text_stats': '_text_stats', + 'embeddings_text': '_embeddings_text', + 'embeddings_video': '_embeddings_video', + 'embeddings_full': 'embeddings_full_episode', + 'embeddings_sound': 'embeddings_sound_events', + 'episode_name': 'episode_name_embedding', + 'image_hashes': '_image_hashes', + 'detections': 'detections', + 'character_detections': '_character_detections', +} + +FILE_EXTENSIONS = { + 'json': '.json', + 'jsonl': '.jsonl', + 'txt': '.txt', + 'srt': '.srt', + 
'mp4': '.mp4', + 'jpg': '.jpg', +} + +OUTPUT_FILE_NAMES = { + 'detections': 'detections.json', + 'episode_embedding': 'episode_name_embedding.json', + 'embeddings_text': 'embeddings_text.json', +} + +OUTPUT_FILE_PATTERNS = { + 'frame': '*_frame_*.jpg', + 'scenes_suffix': '_scenes.json', +} + +class EpisodesDataKeys: + SEASONS = 'seasons' + SEASON_NUMBER = 'season' + EPISODES = 'episodes' + +class EpisodeMetadataKeys: + EPISODE_NUMBER = 'episode_number' + TITLE = 'title' + PREMIERE_DATE = 'premiere_date' + VIEWERSHIP = 'viewership' + +class FfprobeKeys: + STREAMS = 'streams' + FORMAT = 'format' + +class ValidationMetadataKeys: + SIZE_BYTES = 'size_bytes' + SIZE_MB = 'size_mb' + LINE_COUNT = 'line_count' + WIDTH = 'width' + HEIGHT = 'height' + FORMAT = 'format' + CODEC = 'codec' + DURATION = 'duration' diff --git a/preprocessor/config/enums.py b/preprocessor/config/enums.py new file mode 100644 index 000000000..633525a1d --- /dev/null +++ b/preprocessor/config/enums.py @@ -0,0 +1,30 @@ +from enum import Enum + + +class KeyframeStrategy(str, Enum): + SCENE_CHANGES = 'scene_changes' + +class FrameType(str, Enum): + SCENE_SINGLE = 'scene_single' + SCENE_START = 'scene_start' + SCENE_END = 'scene_end' + + @staticmethod + def scene_mid(index: int) -> str: + return f'scene_mid_{index}' + +class ScraperMethod(str, Enum): + CLIPBOARD = 'clipboard' + CRAWL4AI = 'crawl4ai' + +class ParserMode(str, Enum): + NORMAL = 'normal' + PREMIUM = 'premium' + +class TranscriptionFormat(str, Enum): + ELEVENLABS_SEGMENTED = '11labs_segmented' + ELEVENLABS = '11labs' + +class Device(str, Enum): + CUDA = 'cuda' + CPU = 'cpu' diff --git a/preprocessor/prompts/__init__.py b/preprocessor/config/prompts/__init__.py similarity index 50% rename from preprocessor/prompts/__init__.py rename to preprocessor/config/prompts/__init__.py index e39180497..1b4110e68 100644 --- a/preprocessor/prompts/__init__.py +++ b/preprocessor/config/prompts/__init__.py @@ -12,14 +12,14 @@ ) __all__ = [ - 
"extract_all_seasons_system", - "extract_all_seasons_user", - "extract_characters_system", - "extract_characters_user", - "extract_episode_metadata_system", - "extract_episode_metadata_user", - "extract_season_system", - "extract_season_user", - "merge_episode_data_system", - "merge_episode_data_user", + 'extract_all_seasons_system', + 'extract_all_seasons_user', + 'extract_characters_system', + 'extract_characters_user', + 'extract_episode_metadata_system', + 'extract_episode_metadata_user', + 'extract_season_system', + 'extract_season_user', + 'merge_episode_data_system', + 'merge_episode_data_user', ] diff --git a/preprocessor/config/prompts/common_schemas.py b/preprocessor/config/prompts/common_schemas.py new file mode 100644 index 000000000..6ec402adc --- /dev/null +++ b/preprocessor/config/prompts/common_schemas.py @@ -0,0 +1,14 @@ +"""Common JSON schemas used across prompts.""" + + +def episode_metadata_schema() -> str: + """Returns JSON schema for episode metadata.""" + return ( + '{\n' + ' "title": str,\n' + ' "description": str,\n' + ' "summary": str,\n' + ' "season": int or null,\n' + ' "episode_number": int or null\n' + '}' + ) diff --git a/preprocessor/config/prompts/extract_all_seasons_system.py b/preprocessor/config/prompts/extract_all_seasons_system.py new file mode 100644 index 000000000..14b4dacdd --- /dev/null +++ b/preprocessor/config/prompts/extract_all_seasons_system.py @@ -0,0 +1,64 @@ +def get() -> str: + return ( + 'You are extracting episode data from TV series wiki pages.\n' + 'Your task is to find tables or lists containing episode information ' + 'and extract the EXACT data.\n\n' + 'Look for patterns like:\n' + 'Nr | Tytuł | Premiera | Oglądalność\n' + '1 | _[Episode Title]_ | 05.03.2006 | 4 396 564\n\n' + 'CRITICAL RULES:\n' + '1. Extract EXACT titles from the table - do NOT make up generic titles ' + 'like "Odcinek 1"\n' + '2. Extract EXACT premiere dates as shown - do NOT invent dates\n' + '3. 
If premiere date contains multiple dates separated by "/" (e.g., ' + '"31.12.2008"), extract ONLY the FIRST date: "31.12.2008"\n' + '4. Extract EXACT viewership numbers - remove spaces: "4 396 564" -> ' + '4396564\n' + '5. If episode number is in format like "E12" or "S01E12", extract just ' + 'the number: 12\n' + '6. Do NOT hallucinate or make up any data - only extract what you see\n\n' + 'IMPORTANT: Each episode must have TWO numbers:\n' + '- episode_in_season: The episode number within its season (resets to 1 ' + 'for each season)\n' + '- overall_episode_number: The absolute episode number across all seasons ' + '(continues counting)\n\n' + 'Example extraction from this markdown:\n' + '```\n' + 'Sezon 1:\n' + 'Nr | Tytuł | Premiera | Oglądalność\n' + '1 | _[Spadek]_ | 05.03.2006 | 4 396 564\n' + '2 | _[Goście z zaświatów]_ | 12.03.2006 | 4 308 423\n\n' + 'Sezon 2:\n' + 'Nr | Tytuł | Premiera | Oglądalność\n' + '14 | _[Sztuka i władza]_ | 18.03.2007 | 6 993 951\n' + '15 | _[Gmina to ja]_ | 25.03.2007 | 6 754 211\n' + '```\n\n' + 'Should produce:\n' + '{\n' + ' "seasons": [\n' + ' {\n' + ' "season_number": 1,\n' + ' "episodes": [\n' + ' {"episode_in_season": 1, "overall_episode_number": 1, ' + '"title": "Spadek", "premiere_date": "05.03.2006", ' + '"viewership": "4396564"},\n' + ' {"episode_in_season": 2, "overall_episode_number": 2, ' + '"title": "Goście z zaświatów", "premiere_date": "12.03.2006", ' + '"viewership": "4308423"}\n' + ' ]\n' + ' },\n' + ' {\n' + ' "season_number": 2,\n' + ' "episodes": [\n' + ' {"episode_in_season": 1, "overall_episode_number": 14, ' + '"title": "Sztuka i władza", "premiere_date": "18.03.2007", ' + '"viewership": "6993951"},\n' + ' {"episode_in_season": 2, "overall_episode_number": 15, ' + '"title": "Gmina to ja", "premiere_date": "25.03.2007", ' + '"viewership": "6754211"}\n' + ' ]\n' + ' }\n' + ' ]\n' + '}\n\n' + 'Return ONLY valid JSON. Extract ONLY what you see, do NOT invent data.' 
+ ) diff --git a/preprocessor/config/prompts/extract_all_seasons_user.py b/preprocessor/config/prompts/extract_all_seasons_user.py new file mode 100644 index 000000000..489577bb4 --- /dev/null +++ b/preprocessor/config/prompts/extract_all_seasons_user.py @@ -0,0 +1,7 @@ +def get() -> str: + return ( + 'Extract ALL episodes from ALL {num_sources} sources below.\n' + 'Return a complete list of ALL seasons found.\n\n' + '{combined_content}\n\n' + 'Extract ALL seasons and episodes from above sources.' + ) diff --git a/preprocessor/config/prompts/extract_characters_system.py b/preprocessor/config/prompts/extract_characters_system.py new file mode 100644 index 000000000..ec49c8dae --- /dev/null +++ b/preprocessor/config/prompts/extract_characters_system.py @@ -0,0 +1,120 @@ +def get() -> str: + return ( + 'You are an expert at extracting character information from TV series ' + 'documentation and wikis.\n\n' + 'Your task is to analyze scraped web pages and extract a COMPLETE list ' + 'of ALL characters from a TV series.\n\n' + 'For each character, extract ONLY the name (full name if available, ' + 'otherwise commonly used name).\n\n' + '### RULES FOR EXTRACTION:\n\n' + '1. **Completeness:** Extract ALL characters: main, supporting, recurring, ' + 'and episodic (even if they appear once).\n' + '2. **Source:** Extract ONLY what you see in the content. Do NOT invent ' + 'characters.\n' + '3. **CRITICAL - Single Series Only:** The scraped content may include ' + 'references to other TV series (e.g., in footers, sidebars, "See also" ' + 'sections, or related links). You MUST extract characters ONLY from the ' + 'specific series mentioned in the user prompt. IGNORE all characters from ' + 'any other series.\n' + '4. 
**Multi-Source Deduplication:** When processing multiple sources:\n' + ' - Merge character lists from all sources\n' + ' - Remove duplicates (same character mentioned in multiple sources)\n' + ' - If a character has different name variants across sources, use the ' + 'most complete/formal version\n' + ' - Combine information to get the most accurate character list\n' + '5. **Naming:** Use the Polish name if the series is Polish. If a ' + 'character has multiple aliases, use the most formal/common one.\n\n' + '6. **Text Cleaning (CRITICAL):**\n' + ' - Remove ALL special characters that are not letters (e.g., quotes ' + '`"`, brackets `()`, hyphens `-` inside titles, etc.).\n' + ' - Remove actor names typically found in brackets.\n' + ' - The final output string must contain **ONLY letters (including ' + 'Polish diacritics: ą, ć, ę, ł, ń, ó, ś, ź, ż) and spaces**.\n' + ' - Do not leave trailing periods after expanding titles.\n\n' + '7. **ABBREVIATION EXPANSION (Mandatory):**\n' + ' You MUST expand ALL abbreviations to their full Polish forms.\n' + ' **IMPORTANT:** Process compound abbreviations (2+ words) BEFORE ' + 'single word abbreviations.\n\n' + ' **Ecclesiastical (Religious):**\n' + ' - ks. prob. / ks.prob. -> Ksiądz Proboszcz\n' + ' - ks. wik. / ks.wik. -> Ksiądz Wikariusz\n' + ' - ks. kan. -> Ksiądz Kanonik\n' + ' - ks. bp -> Ksiądz Biskup\n' + ' - ks. kard. -> Ksiądz Kardynał\n' + ' - ks. -> Ksiądz\n' + ' - o. -> Ojciec (e.g., Ojciec Mateusz)\n' + ' - s. -> Siostra\n' + ' - br. -> Brat\n' + ' - bp -> Biskup\n' + ' - abp -> Arcybiskup\n' + ' - kard. -> Kardynał\n' + ' - pap. -> Papież\n' + ' - wik. -> Wikariusz\n' + ' - prob. -> Proboszcz\n\n' + ' **Academic & Medical:**\n' + ' - dr hab. -> Doktor habilitowany\n' + ' - prof. nadzw. -> Profesor nadzwyczajny\n' + ' - prof. zw. -> Profesor zwyczajny\n' + ' - prof. -> Profesor\n' + ' - dr -> Doktor\n' + ' - mgr -> Magister\n' + ' - inż. -> Inżynier\n' + ' - lek. med. / lek. -> Lekarz\n' + ' - doc. 
-> Docent\n' + ' - piel. -> Pielęgniarka / Pielęgniarz\n\n' + ' **Military, Police & Services:**\n' + ' - nadkom. -> Nadkomisarz\n' + ' - podkom. -> Podkomisarz\n' + ' - kom. -> Komisarz\n' + ' - asp. sztab. -> Aspirant sztabowy\n' + ' - asp. -> Aspirant\n' + ' - st. post. -> Starszy posterunkowy\n' + ' - post. -> Posterunkowy\n' + ' - sierż. -> Sierżant\n' + ' - gen. -> Generał\n' + ' - płk -> Pułkownik\n' + ' - ppłk -> Podpułkownik\n' + ' - mjr -> Major\n' + ' - kpt. -> Kapitan\n' + ' - por. -> Porucznik\n' + ' - ppor. -> Podporucznik\n\n' + ' **Legal, Political & Administrative:**\n' + ' - mec. -> Mecenas\n' + ' - prok. -> Prokurator\n' + ' - sędz. -> Sędzia\n' + ' - dyr. -> Dyrektor\n' + ' - prez. -> Prezydent\n' + ' - min. -> Minister\n' + ' - sen. -> Senator\n' + ' - pos. -> Poseł\n' + ' - przew. -> Przewodniczący\n' + ' - z-ca -> Zastępca\n\n' + ' **Other:**\n' + ' - red. -> Redaktor\n\n' + ' *If you encounter an abbreviation not listed here, expand it to its ' + 'correct full Polish form based on context.*\n\n' + '### EXAMPLE EXTRACTION:\n\n' + 'Source 1:\n' + '```\n' + 'Główni bohaterowie:\n' + '- ks. prob. Krzysztof Robert (Artur Żmijewski)\n' + '- Lucy Wilska (Ilona Ostrowska)\n' + '```\n\n' + 'Source 2:\n' + '```\n' + 'Postacie:\n' + '- Ksiądz Proboszcz Krzysztof Robert\n' + '- dr Cezary Pazura\n' + '- kom. Paweł Kozioł\n' + '```\n\n' + 'Should produce (deduplicated and cleaned):\n' + '{\n' + ' "characters": [\n' + ' {"name": "Ksiądz Proboszcz Krzysztof Robert"},\n' + ' {"name": "Lucy Wilska"},\n' + ' {"name": "Doktor Cezary Pazura"},\n' + ' {"name": "Komisarz Paweł Kozioł"}\n' + ' ]\n' + '}\n\n' + 'Return ONLY valid JSON.' 
+ ) diff --git a/preprocessor/config/prompts/extract_characters_user.py b/preprocessor/config/prompts/extract_characters_user.py new file mode 100644 index 000000000..2da692cf7 --- /dev/null +++ b/preprocessor/config/prompts/extract_characters_user.py @@ -0,0 +1,15 @@ +def get() -> str: + return ( + 'Extract ALL characters from the TV series "{series_name}" from ALL ' + '{num_sources} source(s) below.\n\n' + '**CRITICAL:** Multiple sources may have overlapping or complementary ' + 'character lists.\n' + '- Merge and deduplicate characters across all sources\n' + '- Extract ONLY characters from "{series_name}" (ignore other series ' + 'mentioned in footers/sidebars)\n' + '- Return a single unified list\n\n' + 'Here is the content from all sources combined:\n\n' + '{combined_content}\n\n' + '---\n' + 'Extract ALL characters from "{series_name}" found in the content above.' + ) diff --git a/preprocessor/config/prompts/extract_episode_metadata_system.py b/preprocessor/config/prompts/extract_episode_metadata_system.py new file mode 100644 index 000000000..81f54b2c5 --- /dev/null +++ b/preprocessor/config/prompts/extract_episode_metadata_system.py @@ -0,0 +1,17 @@ +from preprocessor.config.prompts.common_schemas import episode_metadata_schema + + +def get() -> str: + return ( + 'Extract episode information from the provided web page content.\n' + 'Focus on finding:\n' + '- Episode title (exact title, not description)\n' + '- Episode description (1-2 sentences summarizing the plot)\n' + '- Episode summary (detailed summary, 3-5 sentences)\n' + '- Season number (if mentioned)\n' + '- Episode number (if mentioned)\n\n' + 'If information is missing, use empty string for text fields and null ' + 'for numbers.\n' + 'Be precise and extract only factual information from the text.\n\n' + f'Return ONLY valid JSON matching this schema:\n{episode_metadata_schema()}' + ) diff --git a/preprocessor/config/prompts/extract_episode_metadata_user.py 
b/preprocessor/config/prompts/extract_episode_metadata_user.py new file mode 100644 index 000000000..b11fbc1b4 --- /dev/null +++ b/preprocessor/config/prompts/extract_episode_metadata_user.py @@ -0,0 +1,2 @@ +def get() -> str: + return 'URL: {url}\n\nPage content:\n{page_text}\n\nExtract the episode metadata from above.' diff --git a/preprocessor/config/prompts/extract_season_system.py b/preprocessor/config/prompts/extract_season_system.py new file mode 100644 index 000000000..4b8c8a77e --- /dev/null +++ b/preprocessor/config/prompts/extract_season_system.py @@ -0,0 +1,31 @@ +def get() -> str: + return ( + 'You are extracting episode data from a TV series page.\n' + 'Extract ALL episodes you can find on the page. Look for tables, lists, ' + 'or any structured data.\n\n' + 'For each episode extract:\n' + '- episode_in_season: The episode number within its season (1, 2, 3... ' + 'resets each season)\n' + '- overall_episode_number: The absolute episode number across all seasons ' + '(continues counting)\n' + '- title: string (clean title without markdown formatting)\n' + '- premiere_date: string (date format as found on page; if multiple dates ' + 'separated by "/" like "31.12.2008", extract ONLY the FIRST date: ' + '"31.12.2008")\n' + '- viewership: string (remove spaces from numbers like "4 396 564" -> ' + '"4396564", use null if not available)\n\n' + 'The season number should be determined from the page content or URL.\n\n' + 'Return ONLY valid JSON matching this schema:\n' + '{\n' + ' "season_number": int,\n' + ' "episodes": [\n' + ' {\n' + ' "episode_in_season": int,\n' + ' "overall_episode_number": int,\n' + ' "title": str,\n' + ' "premiere_date": str,\n' + ' "viewership": str\n' + ' }\n' + ' ]\n' + '}' + ) diff --git a/preprocessor/config/prompts/extract_season_user.py b/preprocessor/config/prompts/extract_season_user.py new file mode 100644 index 000000000..b0f1c32ba --- /dev/null +++ b/preprocessor/config/prompts/extract_season_user.py @@ -0,0 +1,2 @@ +def 
get() -> str: + return 'URL: {url}\n\nPage content (markdown):\n{page_text}\n\nExtract ALL episodes from this page and return as JSON.' diff --git a/preprocessor/config/prompts/merge_episode_data_system.py b/preprocessor/config/prompts/merge_episode_data_system.py new file mode 100644 index 000000000..bb48a3fe2 --- /dev/null +++ b/preprocessor/config/prompts/merge_episode_data_system.py @@ -0,0 +1,14 @@ +from preprocessor.config.prompts.common_schemas import episode_metadata_schema + + +def get() -> str: + return ( + 'You are merging episode information from multiple sources.\n' + 'Create a single, accurate metadata entry by:\n' + '- Choosing the most complete and accurate title\n' + '- Combining descriptions into a coherent 1-2 sentence description\n' + '- Merging summaries into a comprehensive 3-5 sentence summary\n' + '- Using the most reliable season/episode numbers\n\n' + 'Prefer longer, more detailed information when merging.\n\n' + f'Return ONLY valid JSON matching this schema:\n{episode_metadata_schema()}' + ) diff --git a/preprocessor/config/prompts/merge_episode_data_user.py b/preprocessor/config/prompts/merge_episode_data_user.py new file mode 100644 index 000000000..eedb27c9f --- /dev/null +++ b/preprocessor/config/prompts/merge_episode_data_user.py @@ -0,0 +1,2 @@ +def get() -> str: + return 'Merge the following episode metadata from {num_sources} sources:\n\n{combined_text}\n\nCreate a single, unified metadata entry.' 
diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py new file mode 100644 index 000000000..3a496dd08 --- /dev/null +++ b/preprocessor/config/step_configs.py @@ -0,0 +1,135 @@ +from typing import List + +from pydantic import ( + BaseModel, + Field, + model_validator, +) +from typing_extensions import Self + +from preprocessor.config.enums import KeyframeStrategy +from preprocessor.lib.media.resolution import Resolution + + +class TranscodeConfig(BaseModel): + resolution: Resolution = Field(default=Resolution.R720P) + codec: str = Field(default='h264_nvenc') + preset: str = 'p7' + video_bitrate_mbps: float = Field(gt=0) + minrate_mbps: float = Field(gt=0) + maxrate_mbps: float = Field(gt=0) + bufsize_mbps: float = Field(gt=0) + audio_bitrate_kbps: int = 128 + gop_size: float = Field(gt=0) + + class Config: + arbitrary_types_allowed = True + + @model_validator(mode='after') + def maxrate_must_be_greater_than_bitrate(self) -> Self: + if self.maxrate_mbps < self.video_bitrate_mbps: + raise ValueError('maxrate must be >= video_bitrate') + return self + +class SceneDetectionConfig(BaseModel): + threshold: float = Field(default=0.5, ge=0, le=1) + min_scene_len: int = Field(default=15, ge=1) + +class FrameExportConfig(BaseModel): + resolution: Resolution = Field(default=Resolution.R720P) + keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES + frames_per_scene: int = Field(default=3, ge=1) + + class Config: + arbitrary_types_allowed = True + +class TranscriptionConfig(BaseModel): + model: str = 'large-v3' + language: str = 'pl' + output_formats: List[str] = ['json', 'srt', 'txt'] + +class WhisperTranscriptionConfig(BaseModel): + model: str = 'large-v3-turbo' + language: str = 'pl' + device: str = 'cuda' + beam_size: int = Field(default=5, ge=1) + temperature: float = Field(default=0.0, ge=0.0, le=1.0) + +class TextAnalysisConfig(BaseModel): + language: str = 'pl' + +class TextEmbeddingConfig(BaseModel): + model_name: str = 
'Qwen/Qwen2-VL-8B-Instruct' + batch_size: int = Field(default=8, ge=1) + device: str = 'cuda' + text_sentences_per_chunk: int = Field(default=5, ge=1) + text_chunk_overlap: int = Field(default=1, ge=0) + +class VideoEmbeddingConfig(BaseModel): + model_name: str = 'Qwen/Qwen2-VL-8B-Instruct' + batch_size: int = Field(default=8, ge=1) + device: str = 'cuda' + +class SoundSeparationConfig(BaseModel): + pass + +class DocumentGenerationConfig(BaseModel): + generate_segments: bool = True + +class ImageHashConfig(BaseModel): + batch_size: int = Field(default=32, ge=1) + +class TranscriptionImportConfig(BaseModel): + source_dir: str + format_type: str = '11labs_segmented' + +class ElasticsearchConfig(BaseModel): + index_name: str + host: str = 'localhost:9200' + dry_run: bool = False + append: bool = False + +class AudioExtractionConfig(BaseModel): + pass + +class CharacterDetectionConfig(BaseModel): + threshold: float = Field(default=0.7, ge=0.0, le=1.0) + +class EmotionDetectionConfig(BaseModel): + pass + +class FaceClusteringConfig(BaseModel): + pass + +class ObjectDetectionConfig(BaseModel): + pass + +class ArchiveConfig(BaseModel): + pass + +class ValidationConfig(BaseModel): + pass + + +class EpisodeScraperConfig(BaseModel): + urls: List[str] + output_file: str + headless: bool = True + merge_sources: bool = True + scraper_method: str = "crawl4ai" + parser_mode: str = "normal" + + +class CharacterScraperConfig(BaseModel): + urls: List[str] + output_file: str + headless: bool = True + scraper_method: str = "crawl4ai" + parser_mode: str = "normal" + + +class CharacterReferenceConfig(BaseModel): + characters_file: str + output_dir: str + search_engine: str = "duckduckgo" + images_per_character: int = Field(default=5, ge=1, le=20) diff --git a/preprocessor/config/types/__init__.py b/preprocessor/config/types/__init__.py new file mode 100644 index 000000000..512f5a419 --- /dev/null +++ b/preprocessor/config/types/__init__.py @@ -0,0 +1,81 @@ +from .clip import ClipSegment 
+from .detection import ( + CharacterDetectionInFrame, + Detection, + ObjectDetectionInFrame, +) +from .episode import ( + EpisodeInfo, + EpisodeMetadata, + SeasonInfo, + SeasonInfoDict, +) +from .frame import FrameRequest +from .keys import ( + ElasticsearchAggregationKeys, + ElasticsearchKeys, + EpisodeMetadataKeys, + WordKeys, + WordTypeValues, +) +from .scene import ( + SceneDict, + SceneTimestamp, + SceneTimestampPoint, + SceneTimestampsData, +) +from .search import ( + ElasticsearchAggregations, + ElasticsearchHit, + ElasticsearchHits, + ElasticsearchResponse, + EpisodeBucket, + SearchSegment, + SeasonBucket, +) +from .transcription import ( + BaseSegment, + ElasticsearchSegment, + SegmentWithScore, + SegmentWithTimes, + TranscriptionContext, +) +from .video import ( + HashResult, + VideoMetadata, +) + +__all__ = [ + 'EpisodeInfo', + 'EpisodeMetadata', + 'SeasonInfo', + 'SeasonInfoDict', + 'FrameRequest', + 'SceneDict', + 'SceneTimestamp', + 'SceneTimestampPoint', + 'SceneTimestampsData', + 'ClipSegment', + 'CharacterDetectionInFrame', + 'Detection', + 'ObjectDetectionInFrame', + 'HashResult', + 'VideoMetadata', + 'BaseSegment', + 'ElasticsearchSegment', + 'SegmentWithScore', + 'SegmentWithTimes', + 'TranscriptionContext', + 'ElasticsearchAggregations', + 'ElasticsearchHit', + 'ElasticsearchHits', + 'ElasticsearchResponse', + 'EpisodeBucket', + 'SearchSegment', + 'SeasonBucket', + 'WordKeys', + 'WordTypeValues', + 'ElasticsearchKeys', + 'ElasticsearchAggregationKeys', + 'EpisodeMetadataKeys', +] diff --git a/preprocessor/types/clip.py b/preprocessor/config/types/clip.py similarity index 100% rename from preprocessor/types/clip.py rename to preprocessor/config/types/clip.py diff --git a/preprocessor/types/detection.py b/preprocessor/config/types/detection.py similarity index 99% rename from preprocessor/types/detection.py rename to preprocessor/config/types/detection.py index 2360eafaf..f6282c2f6 100644 --- a/preprocessor/types/detection.py +++ 
b/preprocessor/config/types/detection.py @@ -11,14 +11,12 @@ class CharacterDetectionInFrame(TypedDict): bbox: List[int] embedding: NotRequired[List[float]] - class ObjectDetectionInFrame(TypedDict): class_name: str class_id: int confidence: float bbox: List[int] - class Detection(TypedDict): bbox: List[int] confidence: float diff --git a/preprocessor/types/episode.py b/preprocessor/config/types/episode.py similarity index 99% rename from preprocessor/types/episode.py rename to preprocessor/config/types/episode.py index 8dc88f45c..29b356095 100644 --- a/preprocessor/types/episode.py +++ b/preprocessor/config/types/episode.py @@ -11,7 +11,6 @@ class EpisodeInfo(TypedDict): premiere_date: str viewership: Union[str, int, float] - class EpisodeMetadata(TypedDict): season: int episode_number: int @@ -20,9 +19,6 @@ class EpisodeMetadata(TypedDict): viewership: Union[str, int, float] series_name: str - class SeasonInfo(TypedDict): pass - - SeasonInfoDict = Dict[str, int] diff --git a/preprocessor/types/frame.py b/preprocessor/config/types/frame.py similarity index 100% rename from preprocessor/types/frame.py rename to preprocessor/config/types/frame.py diff --git a/preprocessor/config/types/keys.py b/preprocessor/config/types/keys.py new file mode 100644 index 000000000..64c39a574 --- /dev/null +++ b/preprocessor/config/types/keys.py @@ -0,0 +1,178 @@ +class SegmentKeys: + START_TIME = 'start_time' + END_TIME = 'end_time' + TEXT = 'text' + VIDEO_PATH = 'video_path' + SEGMENT_ID = 'segment_id' + ID = 'id' + START = 'start' + END = 'end' + +class EpisodeMetadataKeys: + EPISODE_METADATA = 'episode_metadata' + EPISODE_INFO = 'episode_info' + SEASON = 'season' + EPISODE_NUMBER = 'episode_number' + SERIES_NAME = 'series_name' + TITLE = 'title' + PREMIERE_DATE = 'premiere_date' + VIEWERSHIP = 'viewership' + +class ElasticsearchKeys: + SOURCE = '_source' + SCORE = '_score' + HITS = 'hits' + TOTAL = 'total' + AGGREGATIONS = 'aggregations' + BUCKETS = 'buckets' + KEY = 'key' + 
+class ElasticsearchAggregationKeys: + UNIQUE_EPISODES = 'unique_episodes' + SEASONS = 'seasons' + VALUE = 'value' + +class TranscriptionContextKeys: + TARGET = 'target' + CONTEXT = 'context' + OVERALL_START_TIME = 'overall_start_time' + OVERALL_END_TIME = 'overall_end_time' + +class ElasticsearchQueryKeys: + QUERY = 'query' + TERM = 'term' + MATCH = 'match' + BOOL = 'bool' + MUST = 'must' + FILTER = 'filter' + RANGE = 'range' + SIZE = 'size' + SORT = 'sort' + ORDER = 'order' + ASC = 'asc' + DESC = 'desc' + FUZZINESS = 'fuzziness' + AUTO = 'AUTO' + TERMS = 'terms' + FIELD = 'field' + AGGS = 'aggs' + CARDINALITY = 'cardinality' + TOP_HITS = 'top_hits' + INCLUDES = 'includes' + LT = 'lt' + GT = 'gt' + SOURCE = '_source' + KEY = '_key' + +class EpisodesDataKeys: + SEASONS = 'seasons' + SEASON_NUMBER = 'season_number' + EPISODES = 'episodes' + +class FfprobeKeys: + STREAMS = 'streams' + FORMAT = 'format' + +class FfprobeStreamKeys: + R_FRAME_RATE = 'r_frame_rate' + BIT_RATE = 'bit_rate' + CODEC_NAME = 'codec_name' + WIDTH = 'width' + HEIGHT = 'height' + DURATION = 'duration' + +class FfprobeFormatKeys: + DURATION = 'duration' + SIZE = 'size' + +class DetectionKeys: + DETECTIONS = 'detections' + CHARACTERS = 'characters' + FRAME_NUMBER = 'frame_number' + FRAME = 'frame' + FRAME_NAME = 'frame_name' + FRAME_FILE = 'frame_file' + +class CharacterDetectionKeys: + NAME = 'name' + CONFIDENCE = 'confidence' + EMOTION = 'emotion' + BBOX = 'bbox' + +class EmotionKeys: + LABEL = 'label' + CONFIDENCE = 'confidence' + +class ObjectDetectionKeys: + CLASS_NAME = 'class_name' + CLASS_ID = 'class_id' + CONFIDENCE = 'confidence' + BBOX = 'bbox' + +class SceneKeys: + SCENES = 'scenes' + START = 'start' + END = 'end' + SCENE_NUMBER = 'scene_number' + SCENE_START_FRAME = 'scene_start_frame' + SCENE_END_FRAME = 'scene_end_frame' + SCENE_START_TIME = 'scene_start_time' + SCENE_END_TIME = 'scene_end_time' + +class SceneTimeKeys: + SECONDS = 'seconds' + FRAME = 'frame' + +class ElasticDocKeys: 
+ SCENE_INFO = 'scene_info' + CHARACTER_APPEARANCES = 'character_appearances' + DETECTED_OBJECTS = 'detected_objects' + PERCEPTUAL_HASH = 'perceptual_hash' + PERCEPTUAL_HASH_INT = 'perceptual_hash_int' + +class EmbeddingKeys: + EPISODE_ID = 'episode_id' + TITLE = 'title' + TITLE_EMBEDDING = 'title_embedding' + EPISODE_METADATA = 'episode_metadata' + FRAME_NUMBER = 'frame_number' + PERCEPTUAL_HASH = 'perceptual_hash' + FRAME_PATH = 'frame_path' + TIMESTAMP = 'timestamp' + EMBEDDING = 'embedding' + SCENE_NUMBER = 'scene_number' + +class ValidationMetadataKeys: + WIDTH = 'width' + HEIGHT = 'height' + FORMAT = 'format' + SIZE_MB = 'size_mb' + SIZE_BYTES = 'size_bytes' + LINE_COUNT = 'line_count' + CODEC = 'codec' + DURATION = 'duration' + +class WordKeys: + TYPE = 'type' + START = 'start' + END = 'end' + WORD = 'word' + WORDS = 'words' + TEXT = 'text' + +class WordTypeValues: + SPACING = 'spacing' + AUDIO_EVENT = 'audio_event' + +class GoogleSearchKeys: + ENGINE = 'engine' + Q = 'q' + HL = 'hl' + GL = 'gl' + API_KEY = 'api_key' + IMAGES_RESULTS = 'images_results' + +class ImageResultKeys: + ORIGINAL = 'original' + THUMBNAIL = 'thumbnail' + IMAGE = 'image' diff --git a/preprocessor/types/scene.py b/preprocessor/config/types/scene.py similarity index 99% rename from preprocessor/types/scene.py rename to preprocessor/config/types/scene.py index 9fd92f181..7d94c8118 100644 --- a/preprocessor/types/scene.py +++ b/preprocessor/config/types/scene.py @@ -13,18 +13,15 @@ class SceneDict(TypedDict): end_time: float fps: float - class SceneTimestampPoint(TypedDict): frame: int seconds: float - class SceneTimestamp(TypedDict): scene_number: int start: SceneTimestampPoint end: SceneTimestampPoint - class SceneTimestampsData(TypedDict): scenes: List[SceneTimestamp] total_scenes: NotRequired[int] diff --git a/preprocessor/types/search.py b/preprocessor/config/types/search.py similarity index 99% rename from preprocessor/types/search.py rename to preprocessor/config/types/search.py 
index 2d0f5719c..9d963f67d 100644 --- a/preprocessor/types/search.py +++ b/preprocessor/config/types/search.py @@ -17,37 +17,31 @@ class SearchSegment(TypedDict): start_time: float end_time: float - class ElasticsearchHit(TypedDict): _source: ElasticsearchSegment _score: float - class ElasticsearchHits(TypedDict): hits: List[ElasticsearchHit] total: Dict[str, Any] max_score: float - class ElasticsearchResponse(TypedDict): hits: ElasticsearchHits aggregations: NotRequired[Dict[str, Any]] took: int timed_out: bool - class EpisodeBucket(TypedDict): key: int doc_count: int episode_metadata: Dict[str, Any] - class SeasonBucket(TypedDict): key: int doc_count: int unique_episodes: Dict[str, int] - class ElasticsearchAggregations(TypedDict): seasons: Dict[str, Union[List[SeasonBucket], int]] unique_episodes: Dict[str, Union[List[EpisodeBucket], int]] diff --git a/preprocessor/types/transcription.py b/preprocessor/config/types/transcription.py similarity index 99% rename from preprocessor/types/transcription.py rename to preprocessor/config/types/transcription.py index 222bfb9a0..3f5b07b7a 100644 --- a/preprocessor/types/transcription.py +++ b/preprocessor/config/types/transcription.py @@ -13,7 +13,6 @@ class BaseSegment(TypedDict): start: float end: float - class SegmentWithTimes(TypedDict): segment_id: int text: str @@ -22,11 +21,9 @@ class SegmentWithTimes(TypedDict): episode_metadata: EpisodeMetadata video_path: NotRequired[str] - class SegmentWithScore(SegmentWithTimes): _score: float - class ElasticsearchSegment(TypedDict): segment_id: NotRequired[int] id: NotRequired[int] @@ -40,7 +37,6 @@ class ElasticsearchSegment(TypedDict): video_path: NotRequired[str] _score: NotRequired[float] - class TranscriptionContext(TypedDict): target: ElasticsearchSegment context: List[BaseSegment] diff --git a/preprocessor/types/video.py b/preprocessor/config/types/video.py similarity index 99% rename from preprocessor/types/video.py rename to preprocessor/config/types/video.py index 
7d4d620ab..a9120555f 100644 --- a/preprocessor/types/video.py +++ b/preprocessor/config/types/video.py @@ -10,7 +10,6 @@ class HashResult(TypedDict): hash: str file_path: NotRequired[str] - class VideoMetadata(TypedDict): width: int height: int diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py new file mode 100644 index 000000000..889c61300 --- /dev/null +++ b/preprocessor/core/artifacts.py @@ -0,0 +1,110 @@ +from dataclasses import ( + dataclass, + field, +) +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + + +@dataclass(frozen=True) +class Artifact: + pass + +@dataclass(frozen=True) +class EpisodeArtifact(Artifact): + episode_id: str + episode_info: Any + +@dataclass(frozen=True) +class SourceVideo(EpisodeArtifact): + path: Path + +@dataclass(frozen=True) +class TranscodedVideo(EpisodeArtifact): + path: Path + resolution: str + codec: str + +@dataclass(frozen=True) +class SceneCollection(EpisodeArtifact): + path: Path + video_path: Path + scenes: List[Dict[str, Any]] + threshold: float + min_scene_len: int + +@dataclass(frozen=True) +class FrameCollection(EpisodeArtifact): + directory: Path + frame_count: int + metadata_path: Path + +@dataclass(frozen=True) +class TranscriptionData(EpisodeArtifact): + path: Path + language: str + model: str + format: str + +@dataclass(frozen=True) +class EmbeddingCollection(EpisodeArtifact): + path: Path + model_name: str + embedding_count: int + embedding_type: str + +@dataclass(frozen=True) +class DetectionResults(EpisodeArtifact): + path: Path + detection_type: str + detection_count: int + +@dataclass(frozen=True) +class ElasticDocuments(EpisodeArtifact): + path: Path + document_count: int + +@dataclass(frozen=True) +class TextAnalysisResults(EpisodeArtifact): + path: Path + statistics: Dict[str, Any] + metadata: Optional[Dict[str, Any]] = field(default=None) + +@dataclass(frozen=True) +class AudioArtifact(EpisodeArtifact): + path: Path + format: str + 
+@dataclass(frozen=True) +class IndexingResult(Artifact): + index_name: str + document_count: int + success: bool + +@dataclass(frozen=True) +class ImageHashCollection(EpisodeArtifact): + path: Path + hash_count: int + +@dataclass(frozen=True) +class EmotionData(EpisodeArtifact): + path: Path + +@dataclass(frozen=True) +class ClusterData(EpisodeArtifact): + path: Path + +@dataclass(frozen=True) +class ObjectDetectionData(EpisodeArtifact): + path: Path + +@dataclass(frozen=True) +class ArchiveArtifact(EpisodeArtifact): + path: Path + +ProcessedEpisode = ElasticDocuments diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index fc700d385..f7d9927fe 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -14,17 +14,18 @@ Tuple, ) -from preprocessor.core.constants import ( +from preprocessor.config.constants import ( FILE_SUFFIXES, SUPPORTED_VIDEO_EXTENSIONS, ) from preprocessor.core.path_manager import PathManager from preprocessor.core.state_manager import StateManager -from preprocessor.utils.console import ( +from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.ui.console import ( + SimpleProgress, console, - create_progress, ) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.lib.ui.progress import ProgressTracker @dataclass @@ -33,51 +34,30 @@ class ProcessingItem: input_path: Path metadata: Dict[str, Any] - @dataclass class OutputSpec: path: Path required: bool = True - class BaseProcessor(ABC): SUPPORTED_VIDEO_EXTENSIONS = SUPPORTED_VIDEO_EXTENSIONS - REQUIRES: List[str] = [] PRODUCES: List[str] = [] PRIORITY: int = 100 - DESCRIPTION: str = "" + DESCRIPTION: str = '' - def __init__( - self, - args: Dict[str, Any], - class_name: str, - error_exit_code: int, - loglevel: int = logging.DEBUG, - ): + def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, loglevel: int=logging.DEBUG) -> None: 
self._validate_args(args) self._args = args - - self.logger = ErrorHandlingLogger( - class_name=class_name, - loglevel=loglevel, - error_exit_code=error_exit_code, - ) - - self.state_manager: Optional[StateManager] = args.get("state_manager") - self.series_name: str = args.get("series_name", "unknown") - - self.path_manager: PathManager = args.get( - "path_manager", - PathManager(self.series_name), - ) - - from preprocessor.utils.progress_tracker import ProgressTracker # pylint: disable=import-outside-toplevel - self.progress = args.get("progress_tracker", ProgressTracker()) + self.logger = ErrorHandlingLogger(class_name=class_name, loglevel=loglevel, error_exit_code=error_exit_code) + self.state_manager: Optional[StateManager] = args.get('state_manager') + self.series_name: str = args.get('series_name', 'unknown') + self.path_manager: PathManager = args.get('path_manager', PathManager(self.series_name)) + self.progress = args.get('progress_tracker', ProgressTracker()) @classmethod def get_video_glob_patterns(cls) -> List[str]: - return [f"*{ext}" for ext in cls.SUPPORTED_VIDEO_EXTENSIONS] + return [f'*{ext}' for ext in cls.SUPPORTED_VIDEO_EXTENSIONS] @abstractmethod def _validate_args(self, args: Dict[str, Any]) -> None: @@ -87,13 +67,12 @@ def work(self) -> int: try: self._execute() except KeyboardInterrupt: - console.print("\n[yellow]Process interrupted by user[/yellow]") + console.print('\n[yellow]Process interrupted by user[/yellow]') self.cleanup() self.logger.finalize() return 130 except Exception as e: - self.logger.error(f"{self.__class__.__name__} failed: {e}") - + self.logger.error(f'{self.__class__.__name__} failed: {e}') self.cleanup() return self.logger.finalize() @@ -110,47 +89,45 @@ def _get_processing_info(self) -> List[str]: def _get_episode_processing_items_from_metadata( metadata_pattern: str, base_dir: Path, - episode_manager: "EpisodeManager", + episode_manager: 'EpisodeManager', ) -> List[ProcessingItem]: all_metadata_files = 
list(base_dir.glob(metadata_pattern)) items = [] - for metadata_file in all_metadata_files: episode_info = episode_manager.parse_filename(metadata_file) if not episode_info: continue - episode_id = episode_manager.get_episode_id_for_state(episode_info) - items.append( ProcessingItem( episode_id=episode_id, input_path=metadata_file, metadata={ - "episode_info": episode_info, - "series_name": episode_manager.series_name, + 'episode_info': episode_info, + 'series_name': episode_manager.series_name, }, ), ) - return items def _get_processing_items(self) -> List[ProcessingItem]: raise NotImplementedError( - f"{self.__class__.__name__} must implement _get_processing_items() " - "or override _execute() directly (legacy mode)", + f'{self.__class__.__name__} must implement _get_processing_items() ' + 'or override _execute() directly (legacy mode)', ) def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: raise NotImplementedError( - f"{self.__class__.__name__} must implement _get_expected_outputs() " - "or override _execute() directly (legacy mode)", + f'{self.__class__.__name__} must implement _get_expected_outputs() ' + 'or override _execute() directly (legacy mode)', ) - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: + def _process_item( + self, item: ProcessingItem, missing_outputs: List[OutputSpec], + ) -> None: raise NotImplementedError( - f"{self.__class__.__name__} must implement _process_item() " - "or override _execute() directly (legacy mode)", + f'{self.__class__.__name__} must implement _process_item() ' + 'or override _execute() directly (legacy mode)', ) @abstractmethod @@ -159,62 +136,58 @@ def get_output_subdir(self) -> str: def __get_step_name(self) -> str: class_name = self.__class__.__name__ - name = class_name.replace("Processor", "").replace("Generator", "").replace("Detector", "") - name = name.replace("Transcoder", "").replace("Importer", "").replace("Indexer", "") + name = 
class_name.replace('Processor', '').replace('Generator', '').replace('Detector', '') + name = name.replace('Transcoder', '').replace('Importer', '').replace('Indexer', '') return self.__to_snake_case(name) @staticmethod def __to_snake_case(name: str) -> str: - name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() + name = re.sub('(.)([A-Z][a-z]+)', '\\1_\\2', name) + return re.sub('([a-z0-9])([A-Z])', '\\1_\\2', name).lower() - def _should_skip_item(self, item: ProcessingItem) -> Tuple[bool, List[OutputSpec], str]: + def _should_skip_item( + self, item: ProcessingItem, + ) -> Tuple[bool, List[OutputSpec], str]: expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return False, [], "" - + return (False, [], '') missing_outputs = [ output for output in expected_outputs if not output.path.exists() or output.path.stat().st_size == 0 ] - step_name = self.__get_step_name() state_completed = ( - self.state_manager and - self.state_manager.is_step_completed(step_name, item.episode_id) + self.state_manager + and self.state_manager.is_step_completed(step_name, item.episode_id) ) - if not missing_outputs and state_completed: - return True, [], f"[yellow]Skipping (completed): {item.episode_id}[/yellow]" - - if not missing_outputs and not state_completed: + return (True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]') + if not missing_outputs and (not state_completed): if self.state_manager: self.state_manager.mark_step_completed(step_name, item.episode_id) - return True, [], f"[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]" - + return ( + True, + [], + f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]', + ) if missing_outputs and state_completed: console.print( - f"[yellow]Warning: State marked complete but outputs missing for {item.episode_id}[/yellow]", + f'[yellow]Warning: State marked complete but outputs missing ' + f'for 
{item.episode_id}[/yellow]', ) - return False, missing_outputs, "" - - return False, missing_outputs, "" + return (False, missing_outputs, '') + return (False, missing_outputs, '') def _execute(self) -> None: all_items = self._get_processing_items() - if not all_items: - console.print("[yellow]No items to process[/yellow]") + console.print('[yellow]No items to process[/yellow]') return - items_to_process = [] skipped_count = 0 skip_messages = [] - for item in all_items: should_skip, missing_outputs, skip_message = self._should_skip_item(item) - if should_skip: if skip_message: skip_messages.append(skip_message) @@ -222,167 +195,140 @@ def _execute(self) -> None: else: item.metadata['missing_outputs'] = missing_outputs items_to_process.append(item) - if not items_to_process: console.print( - f"[yellow]All items already processed ({len(all_items)} total, {skipped_count} skipped)[/yellow]", + f'[yellow]All items already processed ' + f'({len(all_items)} total, {skipped_count} skipped)[/yellow]', ) return - for skip_message in skip_messages: console.print(skip_message) - console.print( - f"[blue]Processing {len(items_to_process)} items " - f"(of {len(all_items)} total, {skipped_count} skipped)[/blue]", + f'[blue]Processing {len(items_to_process)} items ' + f'(of {len(all_items)} total, {skipped_count} skipped)[/blue]', ) - self.__execute_processing(items_to_process) def __execute_processing(self, items: List[ProcessingItem]) -> None: if not items: - console.print("[yellow]No items to process, skipping resource loading[/yellow]") + console.print('[yellow]No items to process, skipping resource loading[/yellow]') return - for info_line in self._get_processing_info(): console.print(info_line) - if not self._load_resources(): return - step_name = self.__get_step_name() - try: - with create_progress() as progress: - task = progress.add_task( - self._get_progress_description(), - total=len(items), - ) - + with SimpleProgress() as progress: + task = 
progress.add_task(self._get_progress_description(), total=len(items)) for item in items: try: if self.state_manager: temp_files = self._get_temp_files(item) - self.state_manager.mark_step_started( - step_name, - item.episode_id, - temp_files, - ) - + self.state_manager.mark_step_started(step_name, item.episode_id, temp_files) missing_outputs = item.metadata.get('missing_outputs', []) self._process_item(item, missing_outputs) - if self.state_manager: self.state_manager.mark_step_completed(step_name, item.episode_id) - except Exception as e: - self.logger.error(f"Failed to process {item.episode_id}: {e}") + self.logger.error(f'Failed to process {item.episode_id}: {e}') finally: progress.advance(task) except KeyboardInterrupt: - console.print("\n[yellow]Processing interrupted[/yellow]") + console.print('\n[yellow]Processing interrupted[/yellow]') raise def _get_temp_files(self, item: ProcessingItem) -> List[str]: # pylint: disable=unused-argument return [] def _get_progress_description(self) -> str: - return f"Processing {self.__class__.__name__}" + return f'Processing {self.__class__.__name__}' def _create_video_processing_items( self, source_path: Path, extensions: List[str], - episode_manager: "EpisodeManager", + episode_manager: 'EpisodeManager', skip_unparseable: bool = True, subdirectory_filter: Optional[str] = None, ) -> List[ProcessingItem]: - from preprocessor.episodes import EpisodeManager # pylint: disable=import-outside-toplevel - series_name = self.series_name - if not source_path.is_file(): if source_path.name != series_name: source_path = source_path / series_name - if not source_path.exists(): raise FileNotFoundError( - f"Input directory does not exist: {source_path}\n" - f"Expected structure: /input_data/{series_name}/S01/, /input_data/{series_name}/S02/, etc.\n\n" - f"Migration guide:\n" - f" mkdir -p /input_data/{series_name}\n" - f" mv /input_data/S* /input_data/{series_name}/", + f'Input directory does not exist: {source_path}\n' + f'Expected 
structure: /input_data/{series_name}/S01/, ' + f'/input_data/{series_name}/S02/, etc.\n\n' + f'Migration guide:\n' + f' mkdir -p /input_data/{series_name}\n' + f' mv /input_data/S* /input_data/{series_name}/', ) - video_files = [] - if source_path.is_file(): video_files = [source_path] else: for ext in extensions: if subdirectory_filter: - pattern = f"**/{subdirectory_filter}/{ext}" + pattern = f'**/{subdirectory_filter}/{ext}' else: - pattern = f"**/{ext}" + pattern = f'**/{ext}' video_files.extend(source_path.glob(pattern)) - items = [] for video_file in sorted(video_files): episode_info = episode_manager.parse_filename(video_file) - if not episode_info: if skip_unparseable: - self.logger.error(f"Cannot parse episode info from {video_file.name}") + self.logger.error( + f'Cannot parse episode info from {video_file.name}', + ) continue episode_id = video_file.stem else: + from preprocessor.lib.episodes import EpisodeManager # pylint: disable=import-outside-toplevel episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - items.append( ProcessingItem( episode_id=episode_id, input_path=video_file, - metadata={ - "episode_info": episode_info, - }, + metadata={'episode_info': episode_info}, ), ) - return items - def _create_transcription_processing_item(self, transcription_file: Path) -> ProcessingItem: - from preprocessor.episodes import EpisodeManager # pylint: disable=import-outside-toplevel - - base_name = transcription_file.stem.replace(FILE_SUFFIXES["segmented"], "").replace(FILE_SUFFIXES["simple"], "") - - episode_info = self.episode_manager.parse_filename(transcription_file) if hasattr(self, 'episode_manager') else None + def _create_transcription_processing_item( + self, transcription_file: Path, + ) -> ProcessingItem: + base_name = ( + transcription_file.stem + .replace(FILE_SUFFIXES['segmented'], '') + .replace(FILE_SUFFIXES['simple'], '') + ) + episode_info = ( + self.episode_manager.parse_filename(transcription_file) + if hasattr(self, 
'episode_manager') + else None + ) if episode_info: + from preprocessor.lib.episodes import EpisodeManager # pylint: disable=import-outside-toplevel episode_id = EpisodeManager.get_episode_id_for_state(episode_info) else: episode_id = base_name - return ProcessingItem( episode_id=episode_id, input_path=transcription_file, - metadata={ - "base_name": base_name, - }, + metadata={'base_name': base_name}, ) def _build_output_path( - self, - episode_info, - filename: str, - subdir: Optional[str] = None, + self, episode_info, filename: str, subdir: Optional[str] = None, ) -> Path: target_subdir = subdir if subdir is not None else self.get_output_subdir() return self.path_manager.build_path(episode_info, target_subdir, filename) def _build_output_paths( - self, - episode_info, - filenames: List[str], - subdir: Optional[str] = None, + self, episode_info, filenames: List[str], subdir: Optional[str] = None, ) -> List[Path]: return [ self._build_output_path(episode_info, filename, subdir) @@ -390,38 +336,29 @@ def _build_output_paths( ] def _build_season_path( - self, - episode_info, - filename: str, - subdir: Optional[str] = None, + self, episode_info, filename: str, subdir: Optional[str] = None, ) -> Path: target_subdir = subdir if subdir is not None else self.get_output_subdir() return self.path_manager.build_season_path(episode_info, target_subdir, filename) def _build_filename( - self, - episode_info, - extension: str = "json", - suffix: Optional[str] = None, + self, episode_info, extension: str = 'json', suffix: Optional[str] = None, ) -> str: return self.path_manager.build_filename( - episode_info, - extension=extension, - suffix=suffix, + episode_info, extension=extension, suffix=suffix, ) def _build_single_output( self, item: ProcessingItem, suffix: str, - extension: str = "json", + extension: str = 'json', subdir: Optional[str] = None, required: bool = True, ) -> List[OutputSpec]: - episode_info = item.metadata.get("episode_info") + episode_info = 
item.metadata.get('episode_info') if not episode_info: return [] - filename = self._build_filename(episode_info, extension=extension, suffix=suffix) path = self._build_output_path(episode_info, filename, subdir=subdir) return [OutputSpec(path=path, required=required)] diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py new file mode 100644 index 000000000..d46993f12 --- /dev/null +++ b/preprocessor/core/base_step.py @@ -0,0 +1,39 @@ +from abc import ( + ABC, + abstractmethod, +) +from typing import ( + TYPE_CHECKING, + Generic, + TypeVar, +) + +from pydantic import BaseModel + +if TYPE_CHECKING: + from preprocessor.core.context import ExecutionContext + +InputT = TypeVar("InputT") +OutputT = TypeVar("OutputT") +ConfigT = TypeVar("ConfigT", bound=BaseModel) + + +class PipelineStep(ABC, Generic[InputT, OutputT, ConfigT]): + def __init__(self, config: ConfigT) -> None: + self._config: ConfigT = config + + @abstractmethod + def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: + pass + + @property + @abstractmethod + def name(self) -> str: + pass + + @property + def config(self) -> ConfigT: + return self._config + + def cleanup(self) -> None: + pass diff --git a/preprocessor/core/constants.py b/preprocessor/core/constants.py deleted file mode 100644 index 06063e773..000000000 --- a/preprocessor/core/constants.py +++ /dev/null @@ -1,50 +0,0 @@ -SUPPORTED_VIDEO_EXTENSIONS = ( - ".mp4", - ".avi", - ".mkv", - ".mov", - ".flv", - ".wmv", - ".webm", -) - -DEFAULT_VIDEO_EXTENSION = ".mp4" - -FILE_SUFFIXES = { - "segmented": "_segmented", - "text_segments": "_text_segments", - "simple": "_simple", - "clean": "_clean_transcription", - "clean_alt": "_clean", - "scenes": "_scenes", - "sound_events": "_sound_events", - "text_stats": "_text_stats", - "embeddings_text": "_embeddings_text", - "embeddings_video": "_embeddings_video", - "embeddings_full": "embeddings_full_episode", - "embeddings_sound": "embeddings_sound_events", - 
"episode_name": "episode_name_embedding", - "image_hashes": "_image_hashes", - "detections": "detections", - "character_detections": "_character_detections", -} - -FILE_EXTENSIONS = { - "json": ".json", - "jsonl": ".jsonl", - "txt": ".txt", - "srt": ".srt", - "mp4": ".mp4", - "jpg": ".jpg", -} - -OUTPUT_FILE_NAMES = { - "detections": "detections.json", - "episode_embedding": "episode_name_embedding.json", - "embeddings_text": "embeddings_text.json", -} - -OUTPUT_FILE_PATTERNS = { - "frame": "*_frame_*.jpg", - "scenes_suffix": "_scenes.json", -} diff --git a/preprocessor/core/context.py b/preprocessor/core/context.py new file mode 100644 index 000000000..017b7b056 --- /dev/null +++ b/preprocessor/core/context.py @@ -0,0 +1,78 @@ +from pathlib import Path +from typing import ( + TYPE_CHECKING, + List, + Optional, +) + +from preprocessor.lib.core.logging import ErrorHandlingLogger + +if TYPE_CHECKING: + from preprocessor.core.state_manager import StateManager + from preprocessor.lib.episodes.episode_manager import EpisodeInfo + +class ExecutionContext: + + def __init__( + self, + series_name: str, + base_output_dir: Path, + logger: ErrorHandlingLogger, + state_manager: Optional['StateManager'] = None, + force_rerun: bool = False, + ) -> None: + self._series_name: str = series_name + self._base_output_dir: Path = base_output_dir / series_name + self._state_manager: Optional['StateManager'] = state_manager + self._force_rerun: bool = force_rerun + self._logger: ErrorHandlingLogger = logger + + @property + def series_name(self) -> str: + return self._series_name + + @property + def force_rerun(self) -> bool: + return self._force_rerun + + @property + def logger(self) -> ErrorHandlingLogger: + return self._logger + + @property + def state_manager(self) -> Optional['StateManager']: + return self._state_manager + + def get_output_path( + self, episode_info: 'EpisodeInfo', subdir: str, filename: str, + ) -> Path: + season_code: str = episode_info.season_code() + 
episode_code: str = episode_info.episode_num() + path: Path = ( + self._base_output_dir / subdir / season_code / episode_code / filename + ) + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def get_season_output_path( + self, episode_info: 'EpisodeInfo', subdir: str, filename: str, + ) -> Path: + season_code: str = episode_info.season_code() + path: Path = self._base_output_dir / subdir / season_code / filename + path.parent.mkdir(parents=True, exist_ok=True) + return path + + def is_step_completed(self, step_name: str, episode_id: str) -> bool: + if not self._state_manager: + return False + return self._state_manager.is_step_completed(step_name, episode_id) + + def mark_step_completed(self, step_name: str, episode_id: str) -> None: + if self._state_manager: + self._state_manager.mark_step_completed(step_name, episode_id) + + def mark_step_started( + self, step_name: str, episode_id: str, temp_files: Optional[List[str]] = None, + ) -> None: + if self._state_manager: + self._state_manager.mark_step_started(step_name, episode_id, temp_files) diff --git a/preprocessor/core/enums.py b/preprocessor/core/enums.py deleted file mode 100644 index 88003ee93..000000000 --- a/preprocessor/core/enums.py +++ /dev/null @@ -1,35 +0,0 @@ -from enum import Enum - - -class KeyframeStrategy(str, Enum): - SCENE_CHANGES = "scene_changes" - - -class FrameType(str, Enum): - SCENE_SINGLE = "scene_single" - SCENE_START = "scene_start" - SCENE_END = "scene_end" - - @staticmethod - def scene_mid(index: int) -> str: - return f"scene_mid_{index}" - - -class ScraperMethod(str, Enum): - CLIPBOARD = "clipboard" - CRAWL4AI = "crawl4ai" - - -class ParserMode(str, Enum): - NORMAL = "normal" - PREMIUM = "premium" - - -class TranscriptionFormat(str, Enum): - ELEVENLABS_SEGMENTED = "11labs_segmented" - ELEVENLABS = "11labs" - - -class Device(str, Enum): - CUDA = "cuda" - CPU = "cpu" diff --git a/preprocessor/core/path_manager.py b/preprocessor/core/path_manager.py index 
510a44189..1ba450287 100644 --- a/preprocessor/core/path_manager.py +++ b/preprocessor/core/path_manager.py @@ -1,88 +1,21 @@ from pathlib import Path -from typing import Optional +from typing import TYPE_CHECKING from preprocessor.config.config import get_base_output_dir -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) +if TYPE_CHECKING: + from preprocessor.lib.episodes.episode_manager import EpisodeInfo class PathManager: - def __init__(self, series_name: str): - self._series_name = series_name.lower() - self._base_output_dir = get_base_output_dir(self._series_name) - @property - def series_name(self) -> str: - return self._series_name + def __init__(self, series_name: str) -> None: + self._series_name: str = series_name.lower() - @property - def base_output_dir(self) -> Path: - return self._base_output_dir + def build_filename(self, episode_info: 'EpisodeInfo', extension: str='json', suffix: str='') -> str: + base: str = f'{self._series_name}_{episode_info.episode_code()}' + suffix_str: str = f'_{suffix}' if suffix else '' + return f'{base}{suffix_str}.{extension}' - def build_path( - self, - episode_info, - subdir: str, - filename: str, - ) -> Path: - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - - path = self._base_output_dir / subdir / season_code / episode_code / filename - path.parent.mkdir(parents=True, exist_ok=True) - - return path - - def build_season_path( - self, - episode_info, - subdir: str, - filename: str, - ) -> Path: - season_code = episode_info.season_code() - - path = self._base_output_dir / subdir / season_code / filename - path.parent.mkdir(parents=True, exist_ok=True) - - return path - - def get_episode_dir(self, episode_info, subdir: str) -> Path: - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - episode_dir = self._base_output_dir / subdir / season_code / episode_code - episode_dir.mkdir(parents=True, exist_ok=True) - return 
episode_dir - - def build_base_filename(self, episode_info) -> str: - return f"{self._series_name}_{episode_info.episode_code()}" - - def build_filename( - self, - episode_info, - extension: str = "json", - suffix: Optional[str] = None, - ) -> str: - base = self.build_base_filename(episode_info) - suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" - ext = FILE_EXTENSIONS.get(extension, f".{extension}") - return f"{base}{suffix_str}{ext}" - - @staticmethod - def parse_base_filename(filename: str) -> str: - name = Path(filename).stem - for suffix_value in FILE_SUFFIXES.values(): - if name.endswith(suffix_value): - return name[:-len(suffix_value)] - return name - - @staticmethod - def add_suffix_to_filename(filename: str, suffix: str) -> str: - path = Path(filename) - suffix_str = FILE_SUFFIXES.get(suffix, suffix) if suffix else "" - return str(path.parent / f"{path.stem}{suffix_str}{path.suffix}") - - @staticmethod - def get_suffix(suffix_key: str) -> str: - return FILE_SUFFIXES.get(suffix_key, "") + def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: + base_output_dir: Path = get_base_output_dir(self._series_name) + return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_num() diff --git a/preprocessor/core/processing_metadata.py b/preprocessor/core/processing_metadata.py index 8eb2ab96d..761726a5e 100644 --- a/preprocessor/core/processing_metadata.py +++ b/preprocessor/core/processing_metadata.py @@ -20,38 +20,40 @@ class StepMetadata: start_time: Optional[datetime] = None end_time: Optional[datetime] = None duration_seconds: Optional[float] = None - status: str = "pending" + status: str = 'pending' exit_code: Optional[int] = None extra_info: Dict[str, Any] = field(default_factory=dict) def start(self): self.start_time = datetime.now() - self.status = "running" + self.status = 'running' def finish(self, exit_code: int): self.end_time = datetime.now() self.exit_code = exit_code if self.start_time: 
self.duration_seconds = (self.end_time - self.start_time).total_seconds() - self.status = "success" if exit_code == 0 else "failed" + self.status = 'success' if exit_code == 0 else 'failed' def skip(self): - self.status = "skipped" + self.status = 'skipped' def to_dict(self) -> Dict[str, Any]: return { - "name": self.name, - "step_num": self.step_num, - "start_time": self.start_time.isoformat() if self.start_time else None, - "end_time": self.end_time.isoformat() if self.end_time else None, - "duration_seconds": round(self.duration_seconds, 2) if self.duration_seconds else None, - "status": self.status, - "exit_code": self.exit_code, - "extra_info": self.extra_info, + 'name': self.name, + 'step_num': self.step_num, + 'start_time': self.start_time.isoformat() if self.start_time else None, + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'duration_seconds': ( + round(self.duration_seconds, 2) if self.duration_seconds else None + ), + 'status': self.status, + 'exit_code': self.exit_code, + 'extra_info': self.extra_info, } - class ProcessingMetadata: + def __init__(self, series_name: str, params: Dict[str, Any]): self.series_name = series_name self.params = self.__sanitize_params(params) @@ -59,13 +61,13 @@ def __init__(self, series_name: str, params: Dict[str, Any]): self.end_time: Optional[datetime] = None self.total_duration_seconds: Optional[float] = None self.steps: List[StepMetadata] = [] - self.final_status = "running" + self.final_status = 'running' @staticmethod def __sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: sanitized = {} for key, value in params.items(): - if key in set("state_manager"): + if key in set('state_manager'): continue if isinstance(value, Path): sanitized[key] = str(value) @@ -80,41 +82,51 @@ def add_step(self, name: str, step_num: str) -> StepMetadata: self.steps.append(step) return step - def finish_processing(self, final_exit_code: int, additional_stats: Optional[Dict[str, Any]] = None): + def 
finish_processing( + self, final_exit_code: int, additional_stats: Optional[Dict[str, Any]] = None, + ): self.end_time = datetime.now() self.total_duration_seconds = (self.end_time - self.start_time).total_seconds() - self.final_status = "success" if final_exit_code == 0 else "failed" + self.final_status = 'success' if final_exit_code == 0 else 'failed' if additional_stats: - self.params["additional_statistics"] = additional_stats + self.params['additional_statistics'] = additional_stats def __get_statistics(self) -> Dict[str, Any]: - completed_steps = [s for s in self.steps if s.status == "success"] - failed_steps = [s for s in self.steps if s.status == "failed"] - skipped_steps = [s for s in self.steps if s.status == "skipped"] - - step_durations = [s.duration_seconds for s in self.steps if s.duration_seconds is not None] - + completed_steps = [s for s in self.steps if s.status == 'success'] + failed_steps = [s for s in self.steps if s.status == 'failed'] + skipped_steps = [s for s in self.steps if s.status == 'skipped'] + step_durations = [ + s.duration_seconds for s in self.steps if s.duration_seconds is not None + ] return { - "total_steps": len(self.steps), - "completed_steps": len(completed_steps), - "failed_steps": len(failed_steps), - "skipped_steps": len(skipped_steps), - "total_duration_seconds": round(self.total_duration_seconds, 2) if self.total_duration_seconds else None, - "average_step_duration_seconds": round(sum(step_durations) / len(step_durations), 2) if step_durations else None, + 'total_steps': len(self.steps), + 'completed_steps': len(completed_steps), + 'failed_steps': len(failed_steps), + 'skipped_steps': len(skipped_steps), + 'total_duration_seconds': ( + round(self.total_duration_seconds, 2) + if self.total_duration_seconds + else None + ), + 'average_step_duration_seconds': ( + round(sum(step_durations) / len(step_durations), 2) + if step_durations + else None + ), } def to_dict(self) -> Dict[str, Any]: return { - "series_name": 
self.series_name, - "start_time": self.start_time.isoformat(), - "end_time": self.end_time.isoformat() if self.end_time else None, - "final_status": self.final_status, - "parameters": self.params, - "steps": [step.to_dict() for step in self.steps], - "statistics": self.__get_statistics(), + 'series_name': self.series_name, + 'start_time': self.start_time.isoformat(), + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'final_status': self.final_status, + 'parameters': self.params, + 'steps': [step.to_dict() for step in self.steps], + 'statistics': self.__get_statistics(), } def save_to_file(self, output_path: Path): output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, "w", encoding="utf-8") as f: + with open(output_path, 'w', encoding='utf-8') as f: json.dump(self.to_dict(), f, indent=2, ensure_ascii=False) diff --git a/preprocessor/core/processor_factory.py b/preprocessor/core/processor_factory.py deleted file mode 100644 index f33f921f9..000000000 --- a/preprocessor/core/processor_factory.py +++ /dev/null @@ -1,64 +0,0 @@ -from typing import ( - Any, - Dict, - List, - Set, - Tuple, -) - -from preprocessor.core.processor_registry import ( - get_processor_class, - get_processor_info, - list_processors, -) - - -class ProcessorFactory: - @staticmethod - def create(processor_name: str, args: Dict[str, Any]): - processor_class = get_processor_class(processor_name) - return processor_class(args) - - @staticmethod - def list_available() -> List[str]: - return list_processors() - - @staticmethod - def get_info(processor_name: str) -> Dict[str, Any]: - return get_processor_info(processor_name) - - @staticmethod - def get_all_info() -> List[Dict[str, Any]]: - return [ - ProcessorFactory.get_info(name) - for name in ProcessorFactory.list_available() - ] - - @staticmethod - def build_dependency_graph() -> Dict[str, List[str]]: - graph = {} - for name in list_processors(): - info = get_processor_info(name) - graph[name] = 
info["requires"] - return graph - - @staticmethod - def validate_dependencies( - processor_name: str, - available_data: Set[str], - ) -> Tuple[bool, List[str]]: - info = get_processor_info(processor_name) - required = set(info["requires"]) - missing = required - available_data - return len(missing) == 0, sorted(missing) - - @staticmethod - def sort_by_priority(processors: List[str]) -> List[str]: - processor_info = { - name: get_processor_info(name) - for name in processors - } - return sorted( - processors, - key=lambda name: processor_info[name]["priority"], - ) diff --git a/preprocessor/core/processor_registry.py b/preprocessor/core/processor_registry.py deleted file mode 100644 index 77a982376..000000000 --- a/preprocessor/core/processor_registry.py +++ /dev/null @@ -1,48 +0,0 @@ -from typing import ( - Any, - Dict, - List, - Type, -) - -from preprocessor.core.base_processor import BaseProcessor - -PROCESSOR_REGISTRY: Dict[str, Type[BaseProcessor]] = {} - - -def register_processor(name: str): - def decorator(cls: Type[BaseProcessor]): - if name in PROCESSOR_REGISTRY: - raise ValueError(f"Processor '{name}' already registered!") - - PROCESSOR_REGISTRY[name] = cls - cls.PROCESSOR_NAME = name - - return cls - return decorator - - -def get_processor_class(name: str) -> Type[BaseProcessor]: - if name not in PROCESSOR_REGISTRY: - available = ", ".join(sorted(PROCESSOR_REGISTRY.keys())) - raise ValueError( - f"Unknown processor: '{name}'\n" - f"Available processors: {available}", - ) - return PROCESSOR_REGISTRY[name] - - -def list_processors() -> List[str]: - return sorted(PROCESSOR_REGISTRY.keys()) - - -def get_processor_info(name: str) -> Dict[str, Any]: - processor_class = get_processor_class(name) - return { - "name": name, - "class": processor_class.__name__, - "requires": getattr(processor_class, "REQUIRES", []), - "produces": getattr(processor_class, "PRODUCES", []), - "priority": getattr(processor_class, "PRIORITY", 100), - "description": 
getattr(processor_class, "DESCRIPTION", ""), - } diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 9929748e1..756673928 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -15,7 +15,7 @@ Optional, ) -from preprocessor.utils.console import console +from preprocessor.lib.ui.console import console @dataclass @@ -24,7 +24,6 @@ class StepCheckpoint: episode: str completed_at: str - @dataclass class InProgressStep: step: str @@ -32,7 +31,6 @@ class InProgressStep: started_at: str temp_files: List[str] = field(default_factory=list) - @dataclass class ProcessingState: series_name: str @@ -43,34 +41,34 @@ class ProcessingState: def to_dict(self) -> Dict[str, Any]: return { - "series_name": self.series_name, - "started_at": self.started_at, - "last_checkpoint": self.last_checkpoint, - "completed_steps": [asdict(step) for step in self.completed_steps], - "in_progress": asdict(self.in_progress) if self.in_progress else None, + 'series_name': self.series_name, + 'started_at': self.started_at, + 'last_checkpoint': self.last_checkpoint, + 'completed_steps': [asdict(step) for step in self.completed_steps], + 'in_progress': asdict(self.in_progress) if self.in_progress else None, } @classmethod - def _from_dict(cls, data: Dict[str, Any]) -> "ProcessingState": + def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': completed_steps = [ - StepCheckpoint(**step) for step in data.get("completed_steps", []) + StepCheckpoint(**step) for step in data.get('completed_steps', []) ] - in_progress_data = data.get("in_progress") - in_progress = InProgressStep(**in_progress_data) if in_progress_data else None - + in_progress_data = data.get('in_progress') + in_progress = ( + InProgressStep(**in_progress_data) if in_progress_data else None + ) return cls( - series_name=data["series_name"], - started_at=data["started_at"], - last_checkpoint=data["last_checkpoint"], + series_name=data['series_name'], + 
started_at=data['started_at'], + last_checkpoint=data['last_checkpoint'], completed_steps=completed_steps, in_progress=in_progress, ) - class StateManager: - STATE_FILE: str = ".preprocessing_state.json" + STATE_FILE: str = '.preprocessing_state.json' - def __init__(self, series_name: str, working_dir: Path = Path(".")) -> None: + def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: self.__series_name: str = series_name self.__state_file: Path = working_dir / self.STATE_FILE self.__state: Optional[ProcessingState] = None @@ -79,15 +77,15 @@ def __init__(self, series_name: str, working_dir: Path = Path(".")) -> None: def load_or_create_state(self) -> ProcessingState: if self.__state_file.exists(): - console.print(f"[yellow]Found existing state file: {self.__state_file}[/yellow]") - with open(self.__state_file, "r", encoding="utf-8") as f: + console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') + with open(self.__state_file, 'r', encoding='utf-8') as f: data = json.load(f) self.__state = ProcessingState._from_dict(data) - console.print(f"[green]Loaded state for series: {self.__state.series_name}[/green]") - console.print(f"[green]Completed steps: {len(self.__state.completed_steps)}[/green]") + console.print(f'[green]Loaded state for series: {self.__state.series_name}[/green]') + console.print(f'[green]Completed steps: {len(self.__state.completed_steps)}[/green]') return self.__state else: - console.print("[blue]Creating new processing state...[/blue]") + console.print('[blue]Creating new processing state...[/blue]') now = datetime.now().isoformat() self.__state = ProcessingState( series_name=self.__series_name, @@ -100,15 +98,15 @@ def load_or_create_state(self) -> ProcessingState: def __save_state(self) -> None: if self.__state is None: return - self.__state.last_checkpoint = datetime.now().isoformat() - with open(self.__state_file, "w", encoding="utf-8") as f: + with open(self.__state_file, 'w', encoding='utf-8') 
as f: json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) - def mark_step_started(self, step: str, episode: str, temp_files: Optional[List[str]] = None) -> None: + def mark_step_started( + self, step: str, episode: str, temp_files: Optional[List[str]] = None, + ) -> None: if self.__state is None: - raise RuntimeError("State not initialized") - + raise RuntimeError('State not initialized') self.__state.in_progress = InProgressStep( step=step, episode=episode, @@ -116,12 +114,11 @@ def mark_step_started(self, step: str, episode: str, temp_files: Optional[List[s temp_files=temp_files or [], ) self.__save_state() - console.print(f"[cyan]Started: {step} for {episode}[/cyan]") + console.print(f'[cyan]Started: {step} for {episode}[/cyan]') def mark_step_completed(self, step: str, episode: str) -> None: if self.__state is None: - raise RuntimeError("State not initialized") - + raise RuntimeError('State not initialized') checkpoint = StepCheckpoint( step=step, episode=episode, @@ -130,38 +127,37 @@ def mark_step_completed(self, step: str, episode: str) -> None: self.__state.completed_steps.append(checkpoint) self.__state.in_progress = None self.__save_state() - console.print(f"[green]✓ Completed: {step} for {episode}[/green]") + console.print(f'[green]✓ Completed: {step} for {episode}[/green]') def is_step_completed(self, step: str, episode: str) -> bool: if self.__state is None: return False - return any( - s.step == step and s.episode == episode + (s.step == step and s.episode == episode) for s in self.__state.completed_steps ) def __rollback_in_progress(self) -> None: if self.__state is None or self.__state.in_progress is None: return - - console.print(f"[yellow]Rolling back in-progress step: {self.__state.in_progress.step}[/yellow]") - + console.print( + f'[yellow]Rolling back in-progress step: ' + f'{self.__state.in_progress.step}[/yellow]', + ) for temp_file in self.__state.in_progress.temp_files: temp_path = Path(temp_file) if temp_path.exists(): try: 
temp_path.unlink() - console.print(f"[yellow]Removed temp file: {temp_file}[/yellow]") + console.print(f'[yellow]Removed temp file: {temp_file}[/yellow]') except OSError as e: - console.print(f"[red]Failed to remove {temp_file}: {e}[/red]") - + console.print(f'[red]Failed to remove {temp_file}: {e}[/red]') self.__state.in_progress = None self.__save_state() def cleanup(self) -> None: if self.__state_file.exists(): - console.print(f"[blue]Cleaning up state file: {self.__state_file}[/blue]") + console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') self.__state_file.unlink() def register_interrupt_handler(self) -> None: @@ -170,25 +166,25 @@ def register_interrupt_handler(self) -> None: def _signal_handler(_sig: int, _frame: Any) -> None: if self.__interrupted: - console.print("\n[red]Force quit! Not cleaning up.[/red]") + console.print('\n[red]Force quit! Not cleaning up.[/red]') sys.exit(1) - self.__interrupted = True - console.print("\n[yellow]Interrupt received (Ctrl+C)...[/yellow]") - console.print("[yellow]Rolling back incomplete work...[/yellow]") + console.print('\n[yellow]Interrupt received (Ctrl+C)...[/yellow]') + console.print('[yellow]Rolling back incomplete work...[/yellow]') self.__rollback_in_progress() - console.print("[green]Cleanup complete. You can resume later.[/green]") - console.print("[blue]To resume: run the same command again[/blue]") + console.print('[green]Cleanup complete. 
You can resume later.[/green]') + console.print('[blue]To resume: run the same command again[/blue]') sys.exit(0) - signal.signal(signal.SIGINT, _signal_handler) signal.signal(signal.SIGTERM, _signal_handler) self.__cleanup_registered = True - console.print("[blue]Interrupt handler registered (Ctrl+C to safely stop)[/blue]") + console.print('[blue]Interrupt handler registered (Ctrl+C to safely stop)[/blue]') def get_resume_info(self) -> Optional[str]: if self.__state is None or not self.__state.completed_steps: return None - last_step = self.__state.completed_steps[-1] - return f"Resuming from: {last_step.step} ({last_step.episode}) at {last_step.completed_at}" + return ( + f'Resuming from: {last_step.step} ({last_step.episode}) ' + f'at {last_step.completed_at}' + ) diff --git a/preprocessor/core/video_processor.py b/preprocessor/core/video_processor.py deleted file mode 100644 index 5d66334e3..000000000 --- a/preprocessor/core/video_processor.py +++ /dev/null @@ -1,47 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.core.base_processor import ( - BaseProcessor, - ProcessingItem, -) -from preprocessor.episodes import EpisodeManager - - -class VideoProcessor(BaseProcessor): - def __init__( - self, - args: Dict[str, Any], - class_name: str, - error_exit_code: int, - loglevel: int, - ): - super().__init__( - args=args, - class_name=class_name, - error_exit_code=error_exit_code, - loglevel=loglevel, - ) - - self.input_videos: Path = Path(self._args["videos"]) - self.subdirectory_filter: Optional[str] = None - episodes_json_path = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_json_path, self.series_name) - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._create_video_processing_items( - source_path=self.input_videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=True, - 
subdirectory_filter=self.subdirectory_filter, - ) - - def _validate_videos_required(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") diff --git a/preprocessor/embeddings/episode_name_embedder.py b/preprocessor/embeddings/episode_name_embedder.py deleted file mode 100644 index 6e76e718f..000000000 --- a/preprocessor/embeddings/episode_name_embedder.py +++ /dev/null @@ -1,166 +0,0 @@ -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - Optional, -) - -import numpy as np - -from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager -from preprocessor.episodes import ( - EpisodeInfo, - EpisodeManager, -) -from preprocessor.utils.console import console -from preprocessor.utils.constants import EmbeddingKeys -from preprocessor.utils.file_utils import atomic_write_json - - -class EpisodeNameEmbedder: - def __init__( - self, - model, - episode_manager: EpisodeManager, - series_name: str, - output_dir: Optional[Path] = None, - logger: Optional[logging.Logger] = None, - ): - self.model = model - self.episode_manager = episode_manager - self.series_name = series_name - self.output_dir = output_dir or settings.embedding.get_output_dir(series_name) - self.logger = logger or logging.getLogger(__name__) - - def __generate_episode_name_embeddings( - self, - transcription_data: Dict[str, Any], - ) -> Optional[Dict[str, Any]]: - episode_info_dict = transcription_data.get("episode_info", {}) - season = episode_info_dict.get("season") - episode_number = episode_info_dict.get("episode_number") - - if season is None or episode_number is None: - self.logger.warning( - f"Missing season or episode_number in transcription data: episode_info={episode_info_dict}", - ) - return None - - episode_info = self.episode_manager.get_episode_by_season_and_relative( - season, - episode_number, - ) - if not episode_info: - self.logger.warning(f"Cannot find 
episode info for S{season:02d}E{episode_number:02d}") - return None - - metadata = self.episode_manager.get_metadata(episode_info) - title = metadata.get("title") - - if not title: - self.logger.warning(f"No title found for {episode_info.episode_code()}") - return None - - embedding = self.__generate_title_embedding(title) - if embedding is None: - return None - - episode_id = episode_info.episode_code() - - result = { - EmbeddingKeys.EPISODE_ID: episode_id, - EmbeddingKeys.TITLE: title, - EmbeddingKeys.TITLE_EMBEDDING: embedding.tolist(), - EmbeddingKeys.EPISODE_METADATA: { - "season": season, - "episode_number": episode_number, - "title": title, - "premiere_date": metadata.get("premiere_date"), - "series_name": self.series_name, - "viewership": metadata.get("viewership"), - }, - } - - return result - - def __generate_title_embedding(self, title: str) -> Optional[np.ndarray]: - try: - embeddings_tensor = self.model.get_text_embeddings(texts=[title]) - embedding = embeddings_tensor[0].cpu().numpy() - del embeddings_tensor - return embedding - except Exception as e: - self.logger.error(f"Failed to generate embedding for title '{title}': {e}") - return None - - @staticmethod - def __save_episode_name_embedding( - season: int, - episode: int, - embedding_data: Dict[str, Any], - series_name: str, - ) -> Path: - path_manager = PathManager(series_name) - episode_info = EpisodeInfo.create_minimal(season, episode, series_name) - - output_file = path_manager.build_path( - episode_info, - settings.output_subdirs.embeddings, - "episode_name_embedding.json", - ) - - atomic_write_json(output_file, embedding_data, indent=2, ensure_ascii=False) - - return output_file - - def generate_and_save_for_transcription( - self, - transcription_data: Dict[str, Any], - ) -> Optional[Path]: - embedding_data = self.__generate_episode_name_embeddings(transcription_data) - if not embedding_data: - return None - - season = embedding_data[EmbeddingKeys.EPISODE_METADATA]["season"] - episode = 
embedding_data[EmbeddingKeys.EPISODE_METADATA]["episode_number"] - - output_file = self.__save_episode_name_embedding(season, episode, embedding_data, self.series_name) - console.print( - f"[green]Generated episode name embedding for {embedding_data[EmbeddingKeys.EPISODE_ID]}: {embedding_data[EmbeddingKeys.TITLE]}[/green]", - ) - - return output_file - - @staticmethod - def load_episode_name_embedding( - season: int, - episode: int, - series_name: str, - output_dir: Optional[Path] = None, - ) -> Optional[Dict[str, Any]]: - if output_dir is None: - output_dir = settings.embedding.get_output_dir(series_name) - - path_manager = PathManager(series_name) - episode_info = EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", - series_name=series_name, - ) - - embedding_file = path_manager.build_path( - episode_info, - settings.output_subdirs.embeddings, - "episode_name_embedding.json", - ) - - if not embedding_file.exists(): - return None - - with open(embedding_file, "r", encoding="utf-8") as f: - return json.load(f) diff --git a/preprocessor/embeddings/gpu_batch_processor.py b/preprocessor/embeddings/gpu_batch_processor.py deleted file mode 100644 index 55715b239..000000000 --- a/preprocessor/embeddings/gpu_batch_processor.py +++ /dev/null @@ -1,137 +0,0 @@ -import time -from typing import ( - Any, - Dict, - List, -) - -from PIL import Image -import torch - -from preprocessor.utils.batch_processor import BatchProcessor -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class GPUBatchProcessor: - def __init__( - self, - model, - batch_size: int, - logger: ErrorHandlingLogger, - device: str, - progress_sub_batch_size: int = 100, - ): - self.model = model - self.batch_size = batch_size - self.progress_sub_batch_size = progress_sub_batch_size - self.logger = logger - self.device = device - self.max_vram_used = 0.0 - self.vram_samples = [] - self.batch_processor 
= BatchProcessor(min(self.batch_size, self.progress_sub_batch_size)) - - def __log_vram_usage(self) -> None: - if torch.cuda.is_available(): - vram_reserved = torch.cuda.memory_reserved(self.device) / 1024**3 - self.max_vram_used = max(self.max_vram_used, vram_reserved) - self.vram_samples.append(vram_reserved) - - def get_vram_stats(self) -> Dict[str, Any]: - if not self.vram_samples: - return {} - return { - "max_vram_gb": round(self.max_vram_used, 2), - "avg_vram_gb": round(sum(self.vram_samples) / len(self.vram_samples), 2), - "samples": len(self.vram_samples), - } - - def suggest_optimal_batch_size(self, target_vram_gb: float = 21.0) -> int: - if not self.vram_samples: - return self.batch_size - - avg_vram = sum(self.vram_samples) / len(self.vram_samples) - if avg_vram <= 0: - return self.batch_size - - vram_ratio = target_vram_gb / avg_vram - suggested = int(self.batch_size * vram_ratio * 0.9) - - suggested = max(50, min(suggested, 1000)) - - return suggested - - @staticmethod - def __compute_embeddings(model: Any, batch_pil: List[Image.Image]) -> List[List[float]]: - inputs = [{"image": img} for img in batch_pil] - embeddings_tensor = model.process(inputs, normalize=True) - batch_np = embeddings_tensor.cpu().numpy() - del embeddings_tensor - results = [emb.tolist() for emb in batch_np] - del batch_np - torch.cuda.empty_cache() - return results - - @staticmethod - def __report_batch_progress( - processed_count: int, - total_images: int, - elapsed: float, - current_batch_size: int, - batch_start_time: float, - ) -> None: - rate = current_batch_size / elapsed if elapsed > 0 else 0 - console.print( - f" [dim cyan]→ {processed_count}/{total_images} " - f"({processed_count / total_images * 100:.0f}%) - {elapsed:.1f}s ({rate:.3f} img/s)[/dim cyan]", - ) - - elapsed_total = time.time() - batch_start_time - remaining_images = total_images - processed_count - if processed_count > 0: - eta = remaining_images / (processed_count / elapsed_total) - console.print(f" 
[dim]Batch ETA: {eta:.0f}s[/dim]") - - def process_images_batch( - self, - pil_images: List[Image.Image], - chunk_idx: int, - ) -> List[List[float]]: - total_images = len(pil_images) - batch_start_time = time.time() - processed_count = 0 - - def _process_sub_batch(batch_pil: List[Image.Image]) -> List[List[float]]: - nonlocal processed_count - current_batch_size = len(batch_pil) - sub_batch_start = time.time() - - try: - results = self.__compute_embeddings(self.model, batch_pil) - self.__log_vram_usage() - - processed_count += current_batch_size - if total_images > self.progress_sub_batch_size: - elapsed = time.time() - sub_batch_start - self.__report_batch_progress( - processed_count, - total_images, - elapsed, - current_batch_size, - batch_start_time, - ) - - return results - except RuntimeError as e: - if "out of memory" in str(e).lower(): - torch.cuda.empty_cache() - self.logger.error( - f"OOM in chunk {chunk_idx} with batch_size={current_batch_size}. " - f"Try reducing progress_sub_batch_size in config.", - ) - raise e - except Exception as e: - self.logger.error(f"Unexpected error in chunk {chunk_idx}: {e}") - raise e - - return self.batch_processor.process(pil_images, _process_sub_batch) diff --git a/preprocessor/embeddings/qwen3_vl_embedding.py b/preprocessor/embeddings/qwen3_vl_embedding.py deleted file mode 100644 index e6edd5309..000000000 --- a/preprocessor/embeddings/qwen3_vl_embedding.py +++ /dev/null @@ -1,112 +0,0 @@ -import logging -from typing import ( - Any, - Dict, - List, - Optional, -) - -from PIL import Image -import torch -import torch.nn.functional as F -from vllm import LLM - -from preprocessor.config.config import settings - -logger = logging.getLogger(__name__) - - -class Qwen3VLEmbedder: - def __init__( - self, - model_name_or_path: str, - max_length: Optional[int] = None, - tensor_parallel_size: Optional[int] = None, - gpu_memory_utilization: Optional[float] = None, - **kwargs, - ): - if not torch.cuda.is_available(): - raise 
RuntimeError("CUDA is required but not available. This pipeline requires GPU.") - - self.max_length = max_length or settings.embedding_model.max_model_len - self.model_name_or_path = model_name_or_path - self.image_placeholder = settings.embedding_model.image_placeholder - - dtype = kwargs.pop("torch_dtype", torch.bfloat16) - dtype_str = "bfloat16" if dtype == torch.bfloat16 else "float16" - - self.model = LLM( - model=model_name_or_path, - runner="pooling", - dtype=dtype_str, - trust_remote_code=True, - max_model_len=self.max_length, - gpu_memory_utilization=gpu_memory_utilization or settings.embedding_model.gpu_memory_utilization, - tensor_parallel_size=tensor_parallel_size or settings.embedding_model.tensor_parallel_size, - enable_chunked_prefill=settings.embedding_model.enable_chunked_prefill, - max_num_batched_tokens=settings.embedding_model.max_num_batched_tokens, - enforce_eager=settings.embedding_model.enforce_eager, - disable_log_stats=True, - ) - - logger.info(f"vLLM Qwen3-VL-Embedding loaded: {model_name_or_path}") - - def process(self, inputs: List[Dict[str, Any]], normalize: bool = True) -> torch.Tensor: - vllm_inputs = [] - - for item in inputs: - text = item.get("text") - image = item.get("image") - video = item.get("video") - - if image: - if isinstance(image, str): - img = Image.open(image).convert("RGB") - elif isinstance(image, Image.Image): - img = image - else: - raise TypeError(f"Unsupported image type: {type(image)}") - - vllm_inputs.append({ - "prompt": self.image_placeholder, - "multi_modal_data": {"image": img}, - }) - elif text: - vllm_inputs.append({ - "prompt": text, - }) - elif video: - if isinstance(video, list): - frames = [] - for frame in video: - if isinstance(frame, str): - frames.append(Image.open(frame).convert("RGB")) - elif isinstance(frame, Image.Image): - frames.append(frame) - else: - raise TypeError(f"Unsupported frame type: {type(frame)}") - - vllm_inputs.append({ - "prompt": self.image_placeholder, - "multi_modal_data": 
{"image": frames[0] if frames else None}, - }) - else: - raise TypeError(f"Unsupported video type: {type(video)}") - else: - vllm_inputs.append({"prompt": "NULL"}) - - outputs = self.model.embed(vllm_inputs) - - embeddings = torch.stack([ - torch.tensor(output.outputs.embedding, dtype=torch.float32) - for output in outputs - ]) - - if normalize: - embeddings = F.normalize(embeddings, p=2, dim=-1) - - return embeddings - - def get_text_embeddings(self, texts: List[str], normalize: bool = True) -> torch.Tensor: - inputs = [{"text": text} for text in texts] - return self.process(inputs, normalize=normalize) diff --git a/preprocessor/embeddings/strategies/__init__.py b/preprocessor/embeddings/strategies/__init__.py deleted file mode 100644 index 7cfb75d6b..000000000 --- a/preprocessor/embeddings/strategies/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from preprocessor.embeddings.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.embeddings.strategies.scene_changes_strategy import SceneChangesStrategy - -__all__ = [ - "BaseKeyframeStrategy", - "SceneChangesStrategy", -] diff --git a/preprocessor/embeddings/strategies/strategy_factory.py b/preprocessor/embeddings/strategies/strategy_factory.py deleted file mode 100644 index 400e18624..000000000 --- a/preprocessor/embeddings/strategies/strategy_factory.py +++ /dev/null @@ -1,14 +0,0 @@ -from preprocessor.core.enums import KeyframeStrategy -from preprocessor.embeddings.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.embeddings.strategies.scene_changes_strategy import SceneChangesStrategy - - -class KeyframeStrategyFactory: - @staticmethod - def create( - strategy_type: KeyframeStrategy, - frames_per_scene: int = 1, - ) -> BaseKeyframeStrategy: - if strategy_type == KeyframeStrategy.SCENE_CHANGES: - return SceneChangesStrategy(frames_per_scene=frames_per_scene) - raise ValueError(f"Unknown keyframe strategy: {strategy_type}") diff --git a/preprocessor/episodes/__init__.py 
b/preprocessor/episodes/__init__.py deleted file mode 100644 index 1254a6ef9..000000000 --- a/preprocessor/episodes/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from preprocessor.episodes.episode_file_finder import EpisodeFileFinder -from preprocessor.episodes.episode_manager import ( - EpisodeInfo, - EpisodeManager, -) -from preprocessor.episodes.episode_parser import EpisodeInfoParser - -__all__ = ["EpisodeInfo", "EpisodeManager", "EpisodeInfoParser", "EpisodeFileFinder"] diff --git a/preprocessor/episodes/episode_file_finder.py b/preprocessor/episodes/episode_file_finder.py deleted file mode 100644 index 92c7f9aea..000000000 --- a/preprocessor/episodes/episode_file_finder.py +++ /dev/null @@ -1,107 +0,0 @@ -import json -import logging -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.core.constants import SUPPORTED_VIDEO_EXTENSIONS -from preprocessor.core.path_manager import PathManager - -logger = logging.getLogger(__name__) - - -class EpisodeFileFinder: - def __init__(self, series_name: str): - self.path_manager = PathManager(series_name) - - @staticmethod - def find_video_file(episode_info, search_dir: Path) -> Optional[Path]: - if not search_dir.exists(): - return None - - if search_dir.is_file(): - return search_dir - - episode_code = episode_info.episode_code() - season_dir_name = episode_info.season_code() - search_dirs = [search_dir / season_dir_name, search_dir] - - for dir_path in search_dirs: - if not dir_path.exists(): - continue - - for ext in SUPPORTED_VIDEO_EXTENSIONS: - for video_file in dir_path.glob(f"*{ext}"): - if re.search(episode_code, video_file.name, re.IGNORECASE): - return video_file - - return None - - def find_transcription_file( - self, - episode_info, - search_dir: Path, - prefer_segmented: bool = True, - ) -> Optional[Path]: - if not search_dir.exists(): - return None - - season_dir_name = episode_info.season_code() - season_dir = search_dir / season_dir_name - if 
not season_dir.exists(): - return None - - if prefer_segmented: - segmented = season_dir / self.path_manager.build_filename( - episode_info, - extension="json", - suffix="segmented", - ) - if segmented.exists(): - return segmented - - regular = season_dir / self.path_manager.build_filename(episode_info, extension="json") - if regular.exists(): - return regular - - return None - - @staticmethod - def find_scene_timestamps_file(episode_info, search_dir: Path) -> Optional[Path]: - if not search_dir.exists(): - return None - - episode_code = episode_info.episode_code() - pattern = f"**/*{episode_code}*_scenes.json" - - for scene_file in search_dir.glob(pattern): - return scene_file - - return None - - @staticmethod - def load_scene_timestamps( - episode_info, - search_dir: Optional[Path], - _logger=None, - ) -> Optional[List[Dict[str, Any]]]: - if not search_dir: - return None - - finder = EpisodeFileFinder("") - scene_file = finder.find_scene_timestamps_file(episode_info, search_dir) - if not scene_file: - return None - - try: - with open(scene_file, "r", encoding="utf-8") as f: - return json.load(f) - except (OSError, json.JSONDecodeError) as e: - if _logger: - _logger.error(f"Failed to load scene timestamps: {e}") - return None diff --git a/preprocessor/episodes/episode_manager.py b/preprocessor/episodes/episode_manager.py deleted file mode 100644 index 21f0baf19..000000000 --- a/preprocessor/episodes/episode_manager.py +++ /dev/null @@ -1,170 +0,0 @@ -from dataclasses import dataclass -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.core.path_manager import PathManager -from preprocessor.episodes.episode_file_finder import EpisodeFileFinder -from preprocessor.episodes.episode_parser import EpisodeInfoParser -from preprocessor.utils.constants import ( - EpisodeMetadataKeys, - EpisodesDataKeys, -) - -logger = logging.getLogger(__name__) - - -@dataclass -class EpisodeInfo: - 
absolute_episode: int - season: int - relative_episode: int - title: str - series_name: Optional[str] = None - premiere_date: Optional[str] = None - viewership: Optional[str] = None - - def episode_code(self) -> str: - return f"S{self.season:02d}E{self.relative_episode:02d}" - - def season_dir_name(self) -> str: - return f"S{self.season:02d}" - - def season_code(self) -> str: - return f"S{self.season:02d}" - - def episode_num(self) -> str: - return f"E{self.relative_episode:02d}" - - def is_special(self) -> bool: - return self.season == 0 - - @staticmethod - def create_minimal(season: int, episode: int, series_name: str) -> "EpisodeInfo": - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=episode, - title="", - series_name=series_name, - ) - - -class EpisodeManager: - def __init__(self, episodes_info_json: Optional[Path], series_name: str): - self.series_name = series_name.lower() - self.episodes_data: Optional[Dict[str, Any]] = None - self.path_manager = PathManager(self.series_name) - self.file_finder = EpisodeFileFinder(self.series_name) - self.parser = EpisodeInfoParser() - - if episodes_info_json and episodes_info_json.exists(): - with open(episodes_info_json, "r", encoding="utf-8") as f: - self.episodes_data = json.load(f) - - def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: - return self.parser.parse_filename(file_path, self) - - def get_episode_by_season_and_relative(self, season: int, relative_episode: int) -> Optional[EpisodeInfo]: - if not self.episodes_data: - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=f"S{season:02d}E{relative_episode:02d}", - series_name=self.series_name, - ) - - for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): - if season_data.get(EpisodesDataKeys.SEASON_NUMBER) == season: - episodes = sorted( - season_data.get(EpisodesDataKeys.EPISODES, []), - key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 
0), - ) - - if 0 < relative_episode <= len(episodes): - ep_data = episodes[relative_episode - 1] - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=ep_data.get(EpisodeMetadataKeys.TITLE, f"S{season:02d}E{relative_episode:02d}"), - series_name=self.series_name, - premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), - viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), - ) - - logger.warning( - f"Season {season} not found in episodes_info_json! " - f"Processing S{season:02d}E{relative_episode:02d} with filename-only metadata. " - f"Scrape episode info for season {season} to get title, premiere date, etc.", - ) - - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=f"S{season:02d}E{relative_episode:02d}", - series_name=self.series_name, - ) - - - def find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool = True) -> Optional[Path]: - return self.file_finder.find_transcription_file(episode_info, search_dir, prefer_segmented) - - @staticmethod - def find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: - finder = EpisodeFileFinder("") - return finder.find_scene_timestamps_file(episode_info, search_dir) - - @staticmethod - def load_scene_timestamps(episode_info: EpisodeInfo, search_dir: Optional[Path], _logger=None) -> Optional[List[Dict[str, Any]]]: - return EpisodeFileFinder.load_scene_timestamps(episode_info, search_dir, _logger) - - @staticmethod - def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]: - return { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - "title": episode_info.title, - "premiere_date": episode_info.premiere_date, - "viewership": episode_info.viewership, - } - - @staticmethod - def get_episode_id_for_state(episode_info: EpisodeInfo) -> str: - return EpisodeInfoParser.get_episode_id(episode_info) - - def 
list_all_episodes(self) -> List[EpisodeInfo]: - episodes = [] - - if not self.episodes_data: - return episodes - - for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): - season_num = season_data.get(EpisodesDataKeys.SEASON_NUMBER, 1) - season_episodes = sorted( - season_data.get(EpisodesDataKeys.EPISODES, []), - key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0), - ) - - for idx, ep_data in enumerate(season_episodes): - episodes.append( - EpisodeInfo( - absolute_episode=0, - season=season_num, - relative_episode=idx + 1, - title=ep_data.get(EpisodeMetadataKeys.TITLE, f"S{season_num:02d}E{idx + 1:02d}"), - series_name=self.series_name, - premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), - viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), - ), - ) - - return episodes diff --git a/preprocessor/episodes/episode_parser.py b/preprocessor/episodes/episode_parser.py deleted file mode 100644 index b0fbdd710..000000000 --- a/preprocessor/episodes/episode_parser.py +++ /dev/null @@ -1,29 +0,0 @@ -import logging -from pathlib import Path -import re -from typing import Optional - -logger = logging.getLogger(__name__) - - -class EpisodeInfoParser: - @staticmethod - def parse_filename(file_path: Path, episode_manager) -> Optional: - full_path_str = str(file_path) - - match_season_episode = re.search(r'S(\d+)[/\\]?E(\d+)', full_path_str, re.IGNORECASE) - if match_season_episode: - season = int(match_season_episode.group(1)) - episode = int(match_season_episode.group(2)) - return episode_manager.get_episode_by_season_and_relative(season, episode) - - logger.error( - f"Cannot parse episode from filename: {file_path.name}. " - f"Expected format: S##E## (e.g., S01E05, S10E13). 
" - f"Absolute episode numbers (E## without season) are not supported.", - ) - return None - - @staticmethod - def get_episode_id(episode_info) -> str: - return episode_info.episode_code() diff --git a/preprocessor/characters/__init__.py b/preprocessor/lib/__init__.py similarity index 100% rename from preprocessor/characters/__init__.py rename to preprocessor/lib/__init__.py diff --git a/preprocessor/lib/ai/__init__.py b/preprocessor/lib/ai/__init__.py new file mode 100644 index 000000000..c79473d8f --- /dev/null +++ b/preprocessor/lib/ai/__init__.py @@ -0,0 +1,8 @@ +from preprocessor.lib.ai.llm_provider import ( + CharacterInfo, + EpisodeInfo, + LLMProvider, + SeasonMetadata, +) + +__all__ = ['LLMProvider', 'EpisodeInfo', 'SeasonMetadata', 'CharacterInfo'] diff --git a/preprocessor/config/llm_provider.py b/preprocessor/lib/ai/llm_provider.py similarity index 67% rename from preprocessor/config/llm_provider.py rename to preprocessor/lib/ai/llm_provider.py index 7e29dfd0a..d9eb9ffdf 100644 --- a/preprocessor/config/llm_provider.py +++ b/preprocessor/lib/ai/llm_provider.py @@ -19,8 +19,8 @@ ) from preprocessor.config.config import settings -from preprocessor.core.enums import ParserMode -from preprocessor.prompts import ( +from preprocessor.config.enums import ParserMode +from preprocessor.config.prompts import ( extract_all_seasons_system, extract_all_seasons_user, extract_characters_system, @@ -32,7 +32,7 @@ merge_episode_data_system, merge_episode_data_user, ) -from preprocessor.utils.console import console +from preprocessor.lib.ui.console import console class EpisodeInfo(BaseModel): @@ -44,6 +44,7 @@ class EpisodeInfo(BaseModel): @field_validator('viewership', mode='before') @classmethod + @staticmethod def convert_viewership_to_str(cls, v): if v is None: return None @@ -51,27 +52,25 @@ def convert_viewership_to_str(cls, v): return str(v) return v - class SeasonMetadata(BaseModel): season_number: int episodes: List[EpisodeInfo] @model_validator(mode='before') 
@classmethod + @staticmethod def convert_old_format(cls, data): if isinstance(data, dict) and 'episodes' in data: for idx, episode in enumerate(data['episodes'], start=1): - if isinstance(episode, dict) and 'episode_number' in episode and 'episode_in_season' not in episode: + if isinstance(episode, dict) and 'episode_number' in episode and ('episode_in_season' not in episode): episode['episode_in_season'] = idx episode['overall_episode_number'] = episode['episode_number'] del episode['episode_number'] return data - class AllSeasonsMetadata(BaseModel): seasons: List[SeasonMetadata] - class EpisodeMetadata(BaseModel): title: str description: str @@ -79,31 +78,26 @@ class EpisodeMetadata(BaseModel): season: Optional[int] = None episode_number: Optional[int] = None - class CharacterInfo(BaseModel): name: str - class CharactersList(BaseModel): characters: List[CharacterInfo] - class LLMProvider: - __DEFAULT_MODEL_NAME = "Qwen/Qwen2.5-Coder-7B-Instruct" - __GEMINI_MODEL_NAME = "gemini-2.5-flash" - + __DEFAULT_MODEL_NAME = 'Qwen/Qwen2.5-Coder-7B-Instruct' + __GEMINI_MODEL_NAME = 'gemini-2.5-flash' __instance = None __model = None __openai_client = None - def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None): + def __new__(cls, model_name: Optional[str]=None, parser_mode: Optional[ParserMode]=None): if cls.__instance is None: cls.__instance = super().__new__(cls) return cls.__instance - def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None): + def __init__(self, model_name: Optional[str]=None, parser_mode: Optional[ParserMode]=None): self.parser_mode = parser_mode or ParserMode.NORMAL - if self.parser_mode == ParserMode.PREMIUM: if self.__openai_client is None: self.__init_gemini_client() @@ -116,7 +110,7 @@ def extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMe system_prompt=extract_season_system.get(), user_prompt=extract_season_user.get().format(url=url, 
page_text=page_text), response_model=SeasonMetadata, - error_context=f"extraction failed for {url}", + error_context=f'extraction failed for {url}', ) def extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: @@ -124,21 +118,23 @@ def extract_episode_metadata(self, page_text: str, url: str) -> Optional[Episode system_prompt=extract_episode_metadata_system.get(), user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), response_model=EpisodeMetadata, - error_context=f"extraction failed for {url}", + error_context=f'extraction failed for {url}', ) def merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: if not metadata_list: - raise ValueError("No metadata to merge") - + raise ValueError('No metadata to merge') if len(metadata_list) == 1: return metadata_list[0] - - combined_text = "\n\n---\n\n".join([ - f"Source {i + 1}:\nTitle: {m.title}\nDescription: {m.description}\nSummary: {m.summary}\nSeason: {m.season}\nEpisode: {m.episode_number}" + combined_text = '\n\n---\n\n'.join([ + f'Source {i + 1}:\n' + f'Title: {m.title}\n' + f'Description: {m.description}\n' + f'Summary: {m.summary}\n' + f'Season: {m.season}\n' + f'Episode: {m.episode_number}' for i, m in enumerate(metadata_list) ]) - result = self.__process_llm_request( system_prompt=merge_episode_data_system.get(), user_prompt=merge_episode_data_user.get().format( @@ -146,18 +142,16 @@ def merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMet combined_text=combined_text, ), response_model=EpisodeMetadata, - error_context="merge failed", + error_context='merge failed', ) - return result if result else metadata_list[0] def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[List[SeasonMetadata]]: - combined_content = "" + combined_content = '' for i, page in enumerate(scraped_pages, 1): - url = page["url"] - markdown = page["markdown"] - combined_content += f"\n\n=== SOURCE {i}: 
{url} ===\n\n{markdown}\n" - + url = page['url'] + markdown = page['markdown'] + combined_content += f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n' result = self.__process_llm_request( system_prompt=extract_all_seasons_system.get(), user_prompt=extract_all_seasons_user.get().format( @@ -165,18 +159,20 @@ def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[L combined_content=combined_content, ), response_model=AllSeasonsMetadata, - error_context="extraction failed", + error_context='extraction failed', ) - return result.seasons if result else None - def extract_characters(self, scraped_pages: List[Dict[str, Any]], series_name: str) -> Optional[List[CharacterInfo]]: - combined_content = "" + def extract_characters( + self, + scraped_pages: List[Dict[str, Any]], + series_name: str, + ) -> Optional[List[CharacterInfo]]: + combined_content = '' for i, page in enumerate(scraped_pages, 1): - url = page["url"] - markdown = page["markdown"] - combined_content += f"\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n" - + url = page['url'] + markdown = page['markdown'] + combined_content += f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n' result = self.__process_llm_request( system_prompt=extract_characters_system.get(), user_prompt=extract_characters_user.get().format( @@ -185,54 +181,46 @@ def extract_characters(self, scraped_pages: List[Dict[str, Any]], series_name: s combined_content=combined_content, ), response_model=CharactersList, - error_context="character extraction failed", + error_context='character extraction failed', ) - return result.characters if result else None def __process_llm_request( - self, - system_prompt: str, - user_prompt: str, - response_model: Type[BaseModel], - error_context: str, + self, + system_prompt: str, + user_prompt: str, + response_model: Type[BaseModel], + error_context: str, ) -> Optional[BaseModel]: try: - messages = [ - {"role": "system", "content": system_prompt}, - {"role": "user", "content": user_prompt}, - 
] - + messages = [{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': user_prompt}] if self.parser_mode == ParserMode.PREMIUM: content = self.__generate_with_gemini(messages) else: content = self.__generate(messages) - data = self.__extract_json(content) return response_model(**data) - except Exception as e: - console.print(f"[red]LLM {error_context}: {e}[/red]") + console.print(f'[red]LLM {error_context}: {e}[/red]') return None def __init_gemini_client(self) -> None: - console.print("[cyan]Initializing Gemini 2.5 Flash via OpenAI SDK...[/cyan]") + console.print('[cyan]Initializing Gemini 2.5 Flash via OpenAI SDK...[/cyan]') try: api_key = settings.gemini.api_key if not api_key: - raise ValueError("GEMINI_API_KEY not set in environment") - + raise ValueError('GEMINI_API_KEY not set in environment') self.__openai_client = OpenAI( - base_url="https://generativelanguage.googleapis.com/v1beta/openai/", + base_url='https://generativelanguage.googleapis.com/v1beta/openai/', api_key=api_key, ) - console.print("[green]✓ Gemini 2.5 Flash initialized[/green]") + console.print('[green]✓ Gemini 2.5 Flash initialized[/green]') except Exception as e: - console.print(f"[red]Failed to initialize Gemini client: {e}[/red]") + console.print(f'[red]Failed to initialize Gemini client: {e}[/red]') raise e def __load_model(self) -> None: - console.print(f"[cyan]Loading LLM: {self.model_name} (vLLM, 128K context)[/cyan]") + console.print(f'[cyan]Loading LLM: {self.model_name} (vLLM, 128K context)[/cyan]') try: self.__model = LLM( model=self.model_name, @@ -240,18 +228,18 @@ def __load_model(self) -> None: max_model_len=131072, gpu_memory_utilization=0.95, tensor_parallel_size=1, - dtype="bfloat16", + dtype='bfloat16', enable_chunked_prefill=True, max_num_batched_tokens=16384, enforce_eager=True, disable_log_stats=True, ) - console.print("[green]✓ LLM loaded successfully (vLLM)[/green]") + console.print('[green]✓ LLM loaded successfully (vLLM)[/green]') except 
Exception as e: - console.print(f"[red]Failed to load model: {e}[/red]") + console.print(f'[red]Failed to load model: {e}[/red]') raise e - def __generate(self, messages: List[Dict], max_tokens: int = 32768) -> str: + def __generate(self, messages: List[Dict], max_tokens: int=32768) -> str: sampling_params = SamplingParams( temperature=0.7, top_p=0.8, @@ -259,37 +247,28 @@ def __generate(self, messages: List[Dict], max_tokens: int = 32768) -> str: max_tokens=max_tokens, repetition_penalty=1.05, ) - - outputs = self.__model.chat( - messages=[messages], - sampling_params=sampling_params, - ) - + outputs = self.__model.chat(messages=[messages], sampling_params=sampling_params) return outputs[0].outputs[0].text.strip() def __generate_with_gemini(self, messages: List[Dict]) -> str: - response = self.__openai_client.chat.completions.create( - model=self.__GEMINI_MODEL_NAME, - messages=messages, - ) + response = self.__openai_client.chat.completions.create(model=self.__GEMINI_MODEL_NAME, messages=messages) return response.choices[0].message.content.strip() @staticmethod def __extract_json(content: str) -> Dict[str, Any]: try: - if "```json" in content: - start = content.find("```json") + 7 - end = content.find("```", start) + if '```json' in content: + start = content.find('```json') + 7 + end = content.find('```', start) json_str = content[start:end].strip() - elif "```" in content: - start = content.find("```") + 3 - end = content.find("```", start) + elif '```' in content: + start = content.find('```') + 3 + end = content.find('```', start) json_str = content[start:end].strip() else: json_str = content.strip() - return json.loads(json_str) except json.JSONDecodeError as e: - console.print(f"[red]JSON parse error: {e}[/red]") - console.print(f"[yellow]Raw content:\n{content}[/yellow]") + console.print(f'[red]JSON parse error: {e}[/red]') + console.print(f'[yellow]Raw content:\n{content}[/yellow]') raise diff --git a/preprocessor/lib/characters/__init__.py 
b/preprocessor/lib/characters/__init__.py new file mode 100644 index 000000000..d1878f342 --- /dev/null +++ b/preprocessor/lib/characters/__init__.py @@ -0,0 +1,9 @@ +from preprocessor.lib.characters.face_detection import FaceDetector +from preprocessor.lib.characters.image_search import ( + BaseImageSearch, + DuckDuckGoImageSearch, + GoogleImageSearch, +) +from preprocessor.lib.characters.reference_downloader import CharacterReferenceDownloader + +__all__ = ['BaseImageSearch', 'CharacterReferenceDownloader', 'DuckDuckGoImageSearch', 'FaceDetector', 'GoogleImageSearch'] diff --git a/preprocessor/lib/characters/face_detection.py b/preprocessor/lib/characters/face_detection.py new file mode 100644 index 000000000..d2eec0ef1 --- /dev/null +++ b/preprocessor/lib/characters/face_detection.py @@ -0,0 +1,143 @@ +import os +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) +import warnings + +import cv2 +from insightface.app import FaceAnalysis +import numpy as np +from numpy.linalg import norm +import onnxruntime as ort + +from preprocessor.config.config import settings +from preprocessor.lib.ui.console import console + +warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') + +class FaceDetector: + + @staticmethod + def init() -> FaceAnalysis: + model_root = os.getenv('INSIGHTFACE_HOME', os.path.expanduser('~/.insightface')) + available_providers = ort.get_available_providers() + console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") + if 'CUDAExecutionProvider' not in available_providers: + console.print('[red]✗ CUDAExecutionProvider not available in onnxruntime[/red]') + console.print('[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]') + raise RuntimeError('CUDA provider not available in onnxruntime') + providers = [( + 'CUDAExecutionProvider', + { + 'device_id': 0, + 'arena_extend_strategy': 
'kNextPowerOfTwo', + 'gpu_mem_limit': 8 * 1024 * 1024 * 1024, + 'cudnn_conv_algo_search': 'EXHAUSTIVE', + 'do_copy_in_default_stream': True, + }, + )] + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=UserWarning, module='onnxruntime') + warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') + console.print(f'[cyan]Loading {settings.face_recognition.model_name} face detection model (GPU-only)...[/cyan]') + try: + face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) + face_app.prepare(ctx_id=0, det_size=settings.face_recognition.detection_size, det_thresh=settings.character.face_detection_threshold) + except Exception as e: + console.print('[red]✗ Failed to initialize face detection on GPU[/red]') + console.print(f'[red] Error: {e}[/red]') + console.print('[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]') + raise RuntimeError('GPU required but face detection initialization failed') from e + actual_providers = face_app.models['detection'].session.get_providers() + if 'CUDAExecutionProvider' not in actual_providers: + console.print('[red]✗ CUDA provider not active after initialization[/red]') + console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") + raise RuntimeError('CUDA required but not available for face detection') + console.print(f'[green]✓ Face detection initialized ({settings.face_recognition.model_name})[/green]') + console.print('[dim] Device: GPU (CUDA)[/dim]') + console.print(f'[dim] Detection size: {settings.face_recognition.detection_size}[/dim]') + console.print(f'[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]') + console.print(f'[dim] Model cache: {model_root}[/dim]') + return face_app + + @staticmethod + def load_character_references(characters_dir: Path, face_app: FaceAnalysis) -> Dict[str, np.ndarray]: + console.print('[blue]Loading character references...[/blue]') 
+ character_vectors = {} + for char_dir in characters_dir.iterdir(): + if not char_dir.is_dir(): + continue + char_name = char_dir.name.replace('_', ' ').title() + vector_file = char_dir / 'face_vector.npy' + if vector_file.exists(): + character_vectors[char_name] = np.load(vector_file) + console.print(f'[dim] ✓ {char_name}: loaded from face_vector.npy[/dim]') + continue + images = list(char_dir.glob('*.jpg')) + if not images: + continue + embeddings = [] + for img_path in images: + emb = FaceDetector.__get_face_embedding(str(img_path), face_app) + if emb is not None: + embeddings.append(emb) + if embeddings: + mean_emb = np.mean(embeddings, axis=0) + centroid = mean_emb / norm(mean_emb) + character_vectors[char_name] = centroid + console.print(f'[green] ✓ {char_name}: {len(embeddings)} reference images[/green]') + console.print(f'[green]✓ Loaded {len(character_vectors)} characters[/green]') + return character_vectors + + @staticmethod + def __get_face_embedding(img_path: str, face_app: FaceAnalysis) -> Optional[np.ndarray]: + img = cv2.imread(img_path) + if img is None: + return None + faces = face_app.get(img) + if not faces: + return None + faces.sort(key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]), reverse=True) + return faces[0].normed_embedding + + @staticmethod + def detect_characters_in_frame( + frame_path: Path, + face_app: FaceAnalysis, + character_vectors: Dict[str, np.ndarray], + threshold: float, + ) -> List[Dict[str, Any]]: + img = cv2.imread(str(frame_path)) + if img is None: + return [] + faces = face_app.get(img) + if not faces: + return [] + detected = [] + for face in faces: + face_embedding = face.normed_embedding + bbox = face.bbox.astype(int) + best_match = None + best_similarity = threshold + for char_name, char_vector in character_vectors.items(): + similarity = np.dot(face_embedding, char_vector) + if similarity > best_similarity: + best_similarity = similarity + best_match = char_name + if best_match is not None: + 
detected.append({ + 'name': best_match, + 'confidence': float(best_similarity), + 'bbox': { + 'x1': int(bbox[0]), + 'y1': int(bbox[1]), + 'x2': int(bbox[2]), + 'y2': int(bbox[3]), + }, + }) + detected.sort(key=lambda x: x['confidence'], reverse=True) + return detected diff --git a/preprocessor/lib/characters/image_search/__init__.py b/preprocessor/lib/characters/image_search/__init__.py new file mode 100644 index 000000000..f1bf79335 --- /dev/null +++ b/preprocessor/lib/characters/image_search/__init__.py @@ -0,0 +1,5 @@ +from preprocessor.lib.characters.image_search.duckduckgo_image_search import DuckDuckGoImageSearch +from preprocessor.lib.characters.image_search.google_image_search import GoogleImageSearch +from preprocessor.lib.characters.image_search.image_search import BaseImageSearch + +__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch'] diff --git a/preprocessor/characters/duckduckgo_image_search.py b/preprocessor/lib/characters/image_search/duckduckgo_image_search.py similarity index 76% rename from preprocessor/characters/duckduckgo_image_search.py rename to preprocessor/lib/characters/image_search/duckduckgo_image_search.py index 224434dc0..90a4cad89 100644 --- a/preprocessor/characters/duckduckgo_image_search.py +++ b/preprocessor/lib/characters/image_search/duckduckgo_image_search.py @@ -5,13 +5,14 @@ from ddgs import DDGS -from preprocessor.characters.image_search import BaseImageSearch +from preprocessor.lib.characters.image_search.image_search import BaseImageSearch class DuckDuckGoImageSearch(BaseImageSearch): + @property def name(self) -> str: - return "DuckDuckGo" + return 'DuckDuckGo' def search(self, query: str) -> List[Dict[str, str]]: with DDGS() as ddgs: diff --git a/preprocessor/lib/characters/image_search/google_image_search.py b/preprocessor/lib/characters/image_search/google_image_search.py new file mode 100644 index 000000000..5b3021ece --- /dev/null +++ 
b/preprocessor/lib/characters/image_search/google_image_search.py @@ -0,0 +1,30 @@ +from typing import ( + Dict, + List, +) + +from serpapi import GoogleSearch + +from preprocessor.lib.characters.image_search.image_search import BaseImageSearch + + +class GoogleImageSearch(BaseImageSearch): + + def __init__(self, api_key: str, max_results: int=50): + super().__init__(max_results) + if not api_key: + raise ValueError('SerpAPI key is required for Google Image Search') + self.api_key = api_key + + @property + def name(self) -> str: + return 'Google Images API' + + def search(self, query: str) -> List[Dict[str, str]]: + params = {'engine': 'google_images', 'q': query, 'hl': 'pl', 'gl': 'pl', 'api_key': self.api_key} + search = GoogleSearch(params) + results = search.get_dict() + images = [] + for img_result in results.get('images_results', [])[:self.max_results]: + images.append({'image': img_result.get('original'), 'thumbnail': img_result.get('thumbnail')}) + return images diff --git a/preprocessor/characters/image_search.py b/preprocessor/lib/characters/image_search/image_search.py similarity index 87% rename from preprocessor/characters/image_search.py rename to preprocessor/lib/characters/image_search/image_search.py index 9bdd4642e..1437dc780 100644 --- a/preprocessor/characters/image_search.py +++ b/preprocessor/lib/characters/image_search/image_search.py @@ -9,7 +9,8 @@ class BaseImageSearch(ABC): - def __init__(self, max_results: int = 50): + + def __init__(self, max_results: int=50): self.max_results = max_results @abstractmethod diff --git a/preprocessor/lib/characters/models.py b/preprocessor/lib/characters/models.py new file mode 100644 index 000000000..43bff537d --- /dev/null +++ b/preprocessor/lib/characters/models.py @@ -0,0 +1,18 @@ +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + + +@dataclass +class FaceData: + bbox: np.ndarray + face_vector: np.ndarray + source_image_path: Path + source_image_idx: int + face_img: 
class CharacterReferenceDownloader(BaseProcessor):
    """Download reference face images for the characters of one series.

    For every character listed in ``characters_json`` the configured image
    search engine is queried, candidate images are fetched through a
    headless browser, and only images that are large enough and contain
    exactly one detected face are stored as ``NN.jpg`` in a per-character
    folder below ``output_dir``.
    """

    def __init__(self, args: Dict[str, Any]):
        super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=8, loglevel=logging.DEBUG)
        self.characters_json: Path = self._args['characters_json']
        self.series_name: str = self._args['series_name']
        self.output_dir: Path = self._args.get('output_dir', settings.character.get_output_dir(self.series_name))
        self.images_per_character: int = self._args.get('images_per_character', settings.character.reference_images_per_character)
        self.max_results: int = settings.image_scraper.max_results_to_scrape
        self.min_width: int = settings.image_scraper.min_image_width
        self.min_height: int = settings.image_scraper.min_image_height
        self.use_gpu: bool = True
        self.search_mode: str = self._args.get('search_mode', 'normal')
        self.search_engine: BaseImageSearch = self.__create_search_engine()
        # Both are created lazily in _execute(), hence Optional.
        self.face_app: Optional[FaceAnalysis] = None
        self.browser_context: Optional[BrowserContext] = None

    def __create_search_engine(self) -> BaseImageSearch:
        """Return the Google engine for ``search_mode == 'premium'``, DuckDuckGo otherwise."""
        if self.search_mode == 'premium':
            serpapi_key = settings.image_scraper.serpapi_key
            return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results)
        return DuckDuckGoImageSearch(max_results=self.max_results)

    def _validate_args(self, args: Dict[str, Any]) -> None:
        """Check the arguments that ``__init__`` reads unconditionally.

        Raises:
            ValueError: if ``characters_json`` or ``series_name`` is missing.
        """
        if 'characters_json' not in args:
            raise ValueError('characters_json is required')
        # __init__ indexes args['series_name'] directly; fail with the same
        # error type as the other required argument instead of a KeyError.
        if 'series_name' not in args:
            raise ValueError('series_name is required')

    def get_output_subdir(self, item: Optional['ProcessingItem'] = None) -> str:  # pylint: disable=unused-argument
        """Return the fixed sub-directory name used for reference images."""
        return 'character_references'

    def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool:
        """Return True when every character already has enough ``*.jpg`` files."""
        for char in characters:
            char_name = char['name']
            output_folder = self.output_dir / char_name.replace(' ', '_').lower()
            existing_images = list(output_folder.glob('*.jpg'))
            if len(existing_images) < self.images_per_character:
                return False
        return True

    def _execute(self) -> None:
        """Load the character list and download missing reference images."""
        if not self.characters_json.exists():
            console.print(f'[red]Characters JSON not found: {self.characters_json}[/red]')
            return
        with open(self.characters_json, encoding='utf-8') as f:
            data = json.load(f)
        characters = data.get('characters', [])
        if not characters:
            console.print('[yellow]No characters found in JSON[/yellow]')
            return
        if self.__all_references_exist(characters):
            console.print(f'[green]✓ All reference images already exist for {len(characters)} characters (skipping)[/green]')
            return
        self.face_app = FaceDetector.init()
        console.print(f'[blue]Downloading reference images for {len(characters)} characters...[/blue]')
        with sync_playwright() as p:
            self.browser_context = p.chromium.launch_persistent_context(
                user_data_dir='/tmp/patchright_profile',
                headless=True,
                args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'],
                ignore_default_args=['--enable-automation'],
            )
            try:
                with create_progress() as progress:
                    task = progress.add_task('Downloading references', total=len(characters))
                    for i, char in enumerate(characters):
                        char_name = char['name']
                        downloaded = False
                        try:
                            downloaded = self.__download_character_references(char_name, progress)
                        except Exception as e:
                            self.logger.error(f'Failed to download references for {char_name}: {e}')
                        finally:
                            progress.advance(task)
                        # Pause only between characters that actually hit the
                        # search engine, and never after the last one.
                        if downloaded and i < len(characters) - 1:
                            delay = random.uniform(settings.image_scraper.request_delay_min, settings.image_scraper.request_delay_max)
                            time.sleep(delay)
            finally:
                # Close the context even when the loop is interrupted
                # (e.g. KeyboardInterrupt re-raised by the download step).
                self.browser_context.close()
        console.print('[green]✓ Reference download completed[/green]')

    def __count_faces(self, img) -> int:
        """Return the number of faces the face-analysis app finds in ``img``."""
        faces = self.face_app.get(img)
        return len(faces)

    @staticmethod
    def _validate_and_decode_image(
        img_bytes: bytes, img_url: str, logger,
    ) -> Optional[np.ndarray]:
        """Decode raw bytes into a 3-channel image array.

        Returns None for empty input, undecodable data, or an array that is
        not three-dimensional with exactly three channels.
        """
        if not img_bytes:
            return None
        img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8)
        img = cv2.imdecode(img_array, cv2.IMREAD_COLOR)
        if img is None or img.size == 0:
            logger.debug(f'Failed to decode image from {img_url}')
            return None
        if len(img.shape) != 3 or img.shape[2] != 3:
            logger.debug(f'Image has unexpected shape {img.shape} from {img_url}')
            return None
        return img

    def __download_image_with_browser(
        self, img_url: str, page: Page,
    ) -> Optional[np.ndarray]:
        """Navigate ``page`` to ``img_url`` and return the decoded image.

        Any failure (non-200 status, non-image content type, timeout or other
        exception) is logged at debug level and reported as None.
        """
        try:
            response = page.goto(
                img_url,
                timeout=settings.image_scraper.page_navigation_timeout,
                wait_until='domcontentloaded',
            )
            if not response or response.status != 200:
                return None
            content_type = response.headers.get('content-type', '')
            if 'image' not in content_type:
                return None
            img_bytes = response.body()
            return self._validate_and_decode_image(img_bytes, img_url, self.logger)
        except TimeoutError:
            self.logger.debug(f'Timeout downloading image {img_url}')
            return None
        except Exception as e:
            if 'net::ERR_CONNECTION_CLOSED' in str(e) or 'Navigation' in str(e):
                self.logger.debug(
                    f'Connection/navigation error for {img_url}: {e}',
                )
            else:
                self.logger.debug(f'Failed to download image {img_url}: {e}')
            return None

    def __prepare_output_folder(self, char_name: str) -> Path:
        """Create (if needed) and return the folder for ``char_name``."""
        output_folder = self.output_dir / char_name.replace(' ', '_').lower()
        output_folder.mkdir(parents=True, exist_ok=True)
        return output_folder

    def __check_existing_images(
        self, output_folder: Path, char_name: str, progress,
    ) -> Optional[int]:
        """Return the count of existing images, or None when nothing is missing."""
        existing_images = list(output_folder.glob('*.jpg'))
        if len(existing_images) >= self.images_per_character:
            progress.console.print(
                f'[green]✓ {char_name}: {len(existing_images)} images '
                f'already exist (skipping)[/green]',
            )
            return None
        return len(existing_images)

    def __validate_and_save_image(
        self, img: np.ndarray, img_url: str, output_folder: Path, saved_count: int,
    ) -> bool:
        """Save ``img`` as ``<saved_count>.jpg`` when it passes all checks.

        The image must be a non-empty array, at least ``min_width`` by
        ``min_height`` pixels, and contain exactly one detected face.

        Returns:
            True only if the file was actually written.
        """
        if not isinstance(img, np.ndarray) or img.size == 0:
            self.logger.debug(f'Invalid image array from {img_url}')
            return False
        h, w = img.shape[:2]
        if w < self.min_width or h < self.min_height:
            return False
        try:
            face_count = self.__count_faces(img)
        except Exception as face_err:
            self.logger.debug(f'Face detection failed for {img_url}: {face_err}')
            return False
        if face_count != 1:
            return False
        filename = f'{saved_count:02d}.jpg'
        path = output_folder / filename
        # imwrite reports failure through its return value; do not count an
        # image that never reached the disk.
        if not cv2.imwrite(str(path), img):
            self.logger.debug(f'Failed to write image {path}')
            return False
        return True

    def __process_search_results(
        self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int,
    ) -> int:
        """Try search results in order of preference and return the new count.

        Results are ordered JPEG first, then PNG, then everything else, and
        processing stops once ``images_per_character`` images are stored.
        """
        sorted_results = sorted(
            results,
            key=lambda x: (
                0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1,
                1 if x.get('image', '').lower().endswith('.png') else 2,
            ),
        )
        page = self.browser_context.new_page()
        try:
            for res in sorted_results:
                if saved_count >= self.images_per_character:
                    break
                # The sort key already tolerates results without an 'image'
                # entry; skip them instead of aborting the whole list.
                img_url = res.get('image')
                if not img_url:
                    continue
                try:
                    img = self.__download_image_with_browser(img_url, page)
                    if img is None:
                        continue
                    if self.__validate_and_save_image(
                        img, img_url, output_folder, saved_count,
                    ):
                        saved_count += 1
                except Exception as e:
                    self.logger.debug(f'Error processing image: {e}')
                    continue
        finally:
            page.close()
        return saved_count

    def __print_results(
        self, char_name: str, saved_count: int, progress,
    ) -> None:
        """Print a complete / incomplete / nothing-found summary line."""
        if saved_count >= self.images_per_character:
            progress.console.print(
                f'[green]✓[/green] {char_name}: '
                f'{saved_count}/{self.images_per_character} images',
            )
        elif saved_count > 0:
            progress.console.print(
                f'[yellow]⚠[/yellow] {char_name}: '
                f'{saved_count}/{self.images_per_character} images (incomplete)',
            )
        else:
            progress.console.print(
                f'[red]✗[/red] {char_name}: No suitable images found',
            )

    def __download_character_references(self, char_name: str, progress) -> bool:
        """Search and download images for one character.

        Returns:
            False when the character already had enough images (no search
            was performed), True otherwise.

        Raises:
            KeyboardInterrupt: re-raised after printing a notice.
        """
        output_folder = self.__prepare_output_folder(char_name)
        saved_count = self.__check_existing_images(output_folder, char_name, progress)
        if saved_count is None:
            return False
        search_query = f'Serial {self.series_name} {char_name} postać'
        progress.console.print(
            f'[cyan]Searching [{self.search_engine.name}]: {search_query}[/cyan]',
        )
        for attempt in range(settings.image_scraper.retry_attempts):
            try:
                results = self.search_engine.search(search_query)
                saved_count = self.__process_search_results(
                    results, output_folder, saved_count,
                )
                break
            except KeyboardInterrupt:
                progress.console.print('\n[yellow]Download interrupted[/yellow]')
                raise
            except Exception as e:
                if attempt < settings.image_scraper.retry_attempts - 1:
                    # Exponential back-off: retry_delay, 2x, 4x, ...
                    delay = settings.image_scraper.retry_delay * 2 ** attempt
                    self.logger.warning(
                        f'Attempt {attempt + 1} failed for {char_name}, '
                        f'retrying in {delay}s: {e}',
                    )
                    time.sleep(delay)
                else:
                    self.logger.error(
                        f'All retry attempts failed for {char_name}: {e}',
                    )
        self.__print_results(char_name, saved_count, progress)
        return True
b/preprocessor/lib/core/__init__.py new file mode 100644 index 000000000..a9a53c65e --- /dev/null +++ b/preprocessor/lib/core/__init__.py @@ -0,0 +1,7 @@ +from preprocessor.lib.core.logging import ( + ErrorHandlingLogger, + LoggerNotFinalizedException, +) +from preprocessor.lib.core.time import TimeFormatter + +__all__ = ['ErrorHandlingLogger', 'LoggerNotFinalizedException', 'TimeFormatter'] diff --git a/preprocessor/utils/error_handling_logger.py b/preprocessor/lib/core/logging.py similarity index 75% rename from preprocessor/utils/error_handling_logger.py rename to preprocessor/lib/core/logging.py index 3f2cf1628..4bfed151e 100644 --- a/preprocessor/utils/error_handling_logger.py +++ b/preprocessor/lib/core/logging.py @@ -4,38 +4,36 @@ from rich.logging import RichHandler from rich.panel import Panel -from preprocessor.utils.console import console +from preprocessor.lib.ui.console import console class LoggerNotFinalizedException(Exception): - def __init__(self): - super().__init__("Logger destroyed without finalize() being called.") + def __init__(self): + super().__init__('Logger destroyed without finalize() being called.') class ErrorHandlingLogger: + def __init__(self, class_name: str, loglevel: int, error_exit_code: int) -> None: self.__class_name: str = class_name self.__error_exit_code: int = error_exit_code self.__errors: List[str] = [] self.__is_finalized: bool = False - self.__setup_logger(loglevel) def __del__(self) -> None: if not self.__is_finalized: - self.__logger.error( - f"ErrorHandlingLogger for '{self.__class_name}' destroyed without finalize().", - ) + self.__logger.error(f"ErrorHandlingLogger for '{self.__class_name}' destroyed without finalize().") if self.__errors: - self.__logger.error("Logged errors:") + self.__logger.error('Logged errors:') for error in self.__errors: - self.__logger.error(f"- {error}") + self.__logger.error(f'- {error}') raise LoggerNotFinalizedException def __setup_logger(self, level: int) -> None: logging.basicConfig( 
class TimeFormatter:
    """Render durations given in seconds as short text."""

    @staticmethod
    def format_hms(seconds: float) -> str:
        """Return ``H:MM:SS`` for ``seconds`` (fractions are truncated).

        Negative durations are formatted from their absolute value with a
        leading ``-``; applying floor division directly to a negative number
        would otherwise yield values such as ``-1:59:59`` for -1 second.
        """
        sign = '-' if seconds < 0 else ''
        total = abs(seconds)
        hours = int(total // 3600)
        minutes = int(total % 3600 // 60)
        secs = int(total % 60)
        return f'{sign}{hours}:{minutes:02d}:{secs:02d}'

    @staticmethod
    def format_human(seconds: float) -> str:
        """Return a compact form: ``4.2s``, ``3m 5s`` or ``1h 2m 5s``.

        Durations below one minute keep one decimal place; longer ones are
        truncated to whole seconds. Negative values get a leading ``-``.
        """
        sign = '-' if seconds < 0 else ''
        total = abs(seconds)
        if total < 60:
            return f'{sign}{total:.1f}s'
        minutes = int(total // 60)
        secs = int(total % 60)
        if minutes < 60:
            return f'{sign}{minutes}m {secs}s'
        hours, minutes = divmod(minutes, 60)
        return f'{sign}{hours}h {minutes}m {secs}s'
@dataclass
class EpisodeInfo:
    """Identity and descriptive metadata of a single episode."""

    absolute_episode: int
    season: int
    relative_episode: int
    title: str
    series_name: Optional[str] = None
    premiere_date: Optional[str] = None
    viewership: Optional[str] = None

    def episode_code(self) -> str:
        """Return the ``SxxEyy`` code, e.g. ``S01E05``."""
        return f'S{self.season:02d}E{self.relative_episode:02d}'

    def season_code(self) -> str:
        """Return the ``Sxx`` part only."""
        return f'S{self.season:02d}'

    def episode_num(self) -> str:
        """Return the ``Eyy`` part only."""
        return f'E{self.relative_episode:02d}'

    def is_special(self) -> bool:
        """Return True for season 0."""
        return self.season == 0


class EpisodeManager:
    """Resolve episodes from file names and locate their artefacts on disk.

    Episode metadata comes from an optional ``episodes_info_json`` file; when
    it is absent or lacks an entry, an ``EpisodeInfo`` carrying only the
    season and episode numbers is produced.
    """

    # Matches S01E05 as well as a season directory followed by the episode,
    # e.g. S01/E05 or S01\E05.
    _SEASON_EPISODE_RE = re.compile(r'S(\d+)[/\\]?E(\d+)', re.IGNORECASE)

    def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: Optional['ErrorHandlingLogger']=None):
        self.series_name = series_name.lower()
        self.episodes_data: Optional[Dict[str, Any]] = None
        self.path_manager = PathManager(self.series_name)
        self._logger: Optional['ErrorHandlingLogger'] = logger
        if episodes_info_json and episodes_info_json.exists():
            with open(episodes_info_json, 'r', encoding='utf-8') as f:
                self.episodes_data = json.load(f)

    def _create_episode_info(
        self,
        season: int,
        relative_episode: int,
        title: Optional[str]=None,
        premiere_date: Optional[str]=None,
        viewership: Optional[str]=None,
    ) -> EpisodeInfo:
        """Build an ``EpisodeInfo``; the title defaults to the ``SxxEyy`` code.

        ``absolute_episode`` is always set to 0 here.
        """
        return EpisodeInfo(
            absolute_episode=0,
            season=season,
            relative_episode=relative_episode,
            title=title or f'S{season:02d}E{relative_episode:02d}',
            series_name=self.series_name,
            premiere_date=premiere_date,
            viewership=viewership,
        )

    def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]:
        """Extract season and episode from the full path of ``file_path``.

        Returns:
            The matching ``EpisodeInfo``, or None (after logging an error when
            a logger is configured) if no ``S##E##`` pattern is present.
        """
        match_season_episode = self._SEASON_EPISODE_RE.search(str(file_path))
        if match_season_episode:
            season = int(match_season_episode.group(1))
            episode = int(match_season_episode.group(2))
            return self.get_episode_by_season_and_relative(season, episode)
        if self._logger:
            self._logger.error(
                f'Cannot parse episode from filename: {file_path.name}. '
                'Expected format: S##E## (e.g., S01E05, S10E13). '
                'Absolute episode numbers (E## without season) are not supported.',
            )
        return None

    def get_episode_by_season_and_relative(self, season: int, relative_episode: int) -> EpisodeInfo:
        """Look up an episode by season and 1-based position within it.

        Episodes of the season are ordered by their episode number and the
        ``relative_episode``-th one is used. When the season or the position
        is unknown, a warning is logged and a numbers-only ``EpisodeInfo`` is
        returned.
        """
        if not self.episodes_data:
            return self._create_episode_info(season, relative_episode)
        season_found = False
        for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []):
            if season_data.get(EpisodesDataKeys.SEASON_NUMBER) != season:
                continue
            season_found = True
            episodes = sorted(season_data.get(EpisodesDataKeys.EPISODES, []), key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0))
            if 0 < relative_episode <= len(episodes):
                ep_data = episodes[relative_episode - 1]
                return self._create_episode_info(
                    season=season,
                    relative_episode=relative_episode,
                    title=ep_data.get(EpisodeMetadataKeys.TITLE),
                    premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE),
                    viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP),
                )
        if self._logger:
            if season_found:
                # The season exists, so "season not found" would point the
                # user at the wrong problem.
                self._logger.warning(
                    f'Episode {relative_episode} is out of range for season {season} in episodes_info_json! '
                    f'Processing S{season:02d}E{relative_episode:02d} with filename-only metadata.',
                )
            else:
                self._logger.warning(
                    f'Season {season} not found in episodes_info_json! '
                    f'Processing S{season:02d}E{relative_episode:02d} with filename-only metadata. '
                    f'Scrape episode info for season {season} to get title, premiere date, etc.',
                )
        return self._create_episode_info(season, relative_episode)

    @staticmethod
    def find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]:
        """Find the video of an episode below ``search_dir``.

        ``search_dir`` itself is returned when it is a file. Otherwise the
        season sub-directory is searched before ``search_dir``, trying each
        supported extension in turn; candidates are visited in sorted order
        so the result does not depend on directory listing order.
        """
        if not search_dir.exists():
            return None
        if search_dir.is_file():
            return search_dir
        code_pattern = re.compile(episode_info.episode_code(), re.IGNORECASE)
        season_dir_name = episode_info.season_code()
        for dir_path in (search_dir / season_dir_name, search_dir):
            if not dir_path.exists():
                continue
            for ext in SUPPORTED_VIDEO_EXTENSIONS:
                for video_file in sorted(dir_path.glob(f'*{ext}')):
                    if code_pattern.search(video_file.name):
                        return video_file
        return None

    def find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]:
        """Return the transcription JSON inside the season directory, if any.

        With ``prefer_segmented`` the ``segmented`` variant wins over the
        plain file when both exist.
        """
        if not search_dir.exists():
            return None
        season_dir = search_dir / episode_info.season_code()
        if not season_dir.exists():
            return None
        if prefer_segmented:
            segmented = season_dir / self.path_manager.build_filename(episode_info, extension='json', suffix='segmented')
            if segmented.exists():
                return segmented
        regular = season_dir / self.path_manager.build_filename(episode_info, extension='json')
        if regular.exists():
            return regular
        return None

    @staticmethod
    def find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]:
        """Return the first ``*<code>*_scenes.json`` found recursively.

        Matches are sorted so that the same file is chosen on every run when
        several candidates exist.
        """
        if not search_dir.exists():
            return None
        episode_code = episode_info.episode_code()
        pattern = f'**/*{episode_code}*_scenes.json'
        matches = sorted(search_dir.glob(pattern))
        return matches[0] if matches else None

    @staticmethod
    def load_scene_timestamps(
        episode_info: EpisodeInfo,
        search_dir: Optional[Path],
        _logger: Optional['ErrorHandlingLogger']=None,
    ) -> Optional[List[Dict[str, Any]]]:
        """Load the parsed scene-timestamp JSON for an episode.

        Returns None when ``search_dir`` is unset, no file is found, or the
        file cannot be read or parsed (the latter is logged as an error).
        """
        if not search_dir:
            return None
        scene_file = EpisodeManager.find_scene_timestamps_file(episode_info, search_dir)
        if not scene_file:
            return None
        try:
            with open(scene_file, 'r', encoding='utf-8') as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError) as e:
            if _logger:
                _logger.error(f'Failed to load scene timestamps: {e}')
            return None

    @staticmethod
    def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]:
        """Return the episode fields as a plain dictionary."""
        return {
            'season': episode_info.season,
            'episode_number': episode_info.relative_episode,
            'title': episode_info.title,
            'premiere_date': episode_info.premiere_date,
            'viewership': episode_info.viewership,
        }

    @staticmethod
    def get_episode_id_for_state(episode_info: EpisodeInfo) -> str:
        """Return the identifier used for state tracking (the ``SxxEyy`` code)."""
        return episode_info.episode_code()

    def list_all_episodes(self) -> List[EpisodeInfo]:
        """Return every episode described by the loaded metadata.

        Within each season the episodes are ordered by episode number and
        renumbered 1..n by position. A season entry without a season number
        is treated as season 1.
        """
        episodes: List[EpisodeInfo] = []
        if not self.episodes_data:
            return episodes
        for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []):
            season_num = season_data.get(EpisodesDataKeys.SEASON_NUMBER, 1)
            season_episodes = sorted(season_data.get(EpisodesDataKeys.EPISODES, []), key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0))
            for position, ep_data in enumerate(season_episodes, start=1):
                episodes.append(
                    self._create_episode_info(
                        season=season_num,
                        relative_episode=position,
                        title=ep_data.get(EpisodeMetadataKeys.TITLE),
                        premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE),
                        viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP),
                    ),
                )
        return episodes
# --- preprocessor/lib/io/detection_io.py ---

def process_frames_for_detection(
    frame_files: List[Path],
    face_app: 'FaceAnalysis',
    character_vectors: Dict[str, np.ndarray],
    threshold: float,
) -> List[Dict[str, Any]]:
    """Run character detection on each frame and collect the hits.

    Returns:
        One ``{'frame': <file name>, 'faces': <detections>}`` entry per
        frame for which the detector returned a non-empty result, in the
        order of ``frame_files``.
    """
    results: List[Dict[str, Any]] = []
    for frame_path in frame_files:
        detections: List[Dict[str, Any]] = FaceDetector.detect_characters_in_frame(
            frame_path,
            face_app,
            character_vectors,
            threshold,
        )
        if detections:
            results.append({'frame': frame_path.name, 'faces': detections})
    return results


# --- preprocessor/lib/io/files.py ---

class FileOperations:
    """JSON/text file helpers that write through a temporary file."""

    @staticmethod
    def _atomic_write(path: Path, write_func: Callable[[Path], None]) -> None:
        """Write via ``<path>.tmp`` and move it over ``path`` on success.

        ``write_func`` receives the temporary path. If it (or the final
        rename) raises, the temporary file is removed and the exception is
        re-raised, leaving any existing ``path`` untouched.
        """
        temp_path = path.with_suffix(path.suffix + '.tmp')
        try:
            write_func(temp_path)
            temp_path.replace(path)
        except Exception:
            # missing_ok avoids a second error if the temp file was never
            # created or has already been moved/removed.
            temp_path.unlink(missing_ok=True)
            raise

    @staticmethod
    def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None:
        """Serialise ``data`` as UTF-8 JSON (non-ASCII kept as-is) to ``path``."""

        def _write(temp: Path) -> None:
            with open(temp, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=indent)

        FileOperations._atomic_write(path, _write)

    @staticmethod
    def load_json(path: Path) -> Dict[str, Any]:
        """Read and parse a UTF-8 JSON file."""
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def atomic_write_text(path: Path, content: str) -> None:
        """Write ``content`` as UTF-8 text to ``path``."""

        def _write(temp: Path) -> None:
            with open(temp, 'w', encoding='utf-8') as f:
                f.write(content)

        FileOperations._atomic_write(path, _write)


def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None:
    """Module-level shortcut for ``FileOperations.atomic_write_json``."""
    FileOperations.atomic_write_json(path, data, indent)


def load_json(path: Path) -> Dict[str, Any]:
    """Module-level shortcut for ``FileOperations.load_json``."""
    return FileOperations.load_json(path)


# --- preprocessor/lib/io/hashing.py ---

class HashStorage:
    """Persist per-episode image hash results."""

    @staticmethod
    def save_image_hashes_to_json(
        episode_info: 'EpisodeInfo',
        hash_results: List[Dict[str, Any]],
        series_name: str,
        device: str,
        batch_size: int,
    ) -> Path:
        """Write the hash results of one episode to its JSON file.

        The file is placed in the episode directory of the image-hash output
        sub-directory and contains processing parameters, total/unique hash
        counts and the raw results under ``image_hashes``.

        Returns:
            Path of the written file.
        """
        path_manager = PathManager(series_name)
        episode_dir = path_manager.get_episode_dir(
            episode_info,
            settings.output_subdirs.image_hashes,
        )
        episode_dir.mkdir(parents=True, exist_ok=True)
        # Entries without a 'perceptual_hash' key are ignored for uniqueness.
        unique_hashes = len({
            h.get('perceptual_hash')
            for h in hash_results
            if 'perceptual_hash' in h
        })
        hash_data = MetadataBuilder.create_processing_metadata(
            episode_info=episode_info,
            processing_params={'device': device, 'batch_size': batch_size, 'hash_size': 8},
            statistics={'total_hashes': len(hash_results), 'unique_hashes': unique_hashes},
            results_key='image_hashes',
            results_data=hash_results,
        )
        hash_filename = path_manager.build_filename(episode_info, extension='json', suffix='image_hashes')
        output_path = episode_dir / hash_filename
        FileOperations.atomic_write_json(output_path, hash_data)
        return output_path
class MetadataBuilder:
    """Factory helpers for the metadata envelopes stored next to results."""

    @staticmethod
    def create_minimal_episode_info(episode_info) -> Dict[str, Any]:
        """Return just the season and episode number of ``episode_info``."""
        minimal: Dict[str, Any] = {}
        minimal['season'] = episode_info.season
        minimal['episode_number'] = episode_info.relative_episode
        return minimal

    @staticmethod
    def create_processing_metadata(
        episode_info,
        processing_params: Dict[str, Any],
        statistics: Dict[str, Any],
        results_key: str,
        results_data: List[Any],
    ) -> Dict[str, Any]:
        """Wrap processing results in a timestamped metadata dictionary.

        The result holds the local ISO timestamp, the minimal episode info,
        the given parameters and statistics, and ``results_data`` stored
        under ``results_key``.
        """
        timestamp = datetime.now().isoformat()
        episode_summary = MetadataBuilder.create_minimal_episode_info(episode_info)
        metadata: Dict[str, Any] = dict(
            generated_at=timestamp,
            episode_info=episode_summary,
            processing_parameters=processing_params,
            statistics=statistics,
        )
        metadata[results_key] = results_data
        return metadata

    @staticmethod
    def create_embedding_collection(
        episode_id: str,
        episode_info: Any,
        path: Path,
        model_name: str,
        embedding_count: int,
        embedding_type: str,
    ) -> 'EmbeddingCollection':
        """Helper to create EmbeddingCollection with standard parameters."""
        fields = {
            'episode_id': episode_id,
            'episode_info': episode_info,
            'path': path,
            'model_name': model_name,
            'embedding_count': embedding_count,
            'embedding_type': embedding_type,
        }
        return EmbeddingCollection(**fields)
# --- preprocessor/lib/media/ffmpeg.py ---

class FFmpegWrapper:
    """Thin helpers around the ``ffmpeg`` and ``ffprobe`` command-line tools."""

    # Fixed encoder settings shared by every transcode.
    _PROFILE = 'main'
    _LEVEL = '4.1'
    _PIX_FMT = 'yuv420p'
    _BF = '2'
    _B_ADAPT = '1'
    _TWO_PASS = '1'
    _RC_LOOKAHEAD = '32'
    _AQ_STRENGTH = '15'
    _AUDIO_CHANNELS = '2'

    @staticmethod
    def _build_video_filter(width: int, height: int) -> str:
        """Return the ``-vf`` chain that fits the picture into ``width`` x ``height``.

        The chain scales by the sample aspect ratio, shrinks to fit while
        keeping the aspect ratio, pads with black to the exact size and
        resets the SAR to 1.
        """
        return (
            f"scale='iw*sar:ih',scale={width}:{height}:"
            f"force_original_aspect_ratio=decrease,pad={width}:{height}:"
            f"(ow-iw)/2:(oh-ih)/2:black,setsar=1"
        )

    @staticmethod
    def _build_base_command(
        input_path: Path, codec: str, preset: str, target_fps: Optional[float],
    ) -> list[str]:
        """Return the leading ffmpeg arguments (input, codec, profile, fps)."""
        command = [
            'ffmpeg', '-v', 'error', '-stats', '-hide_banner', '-y',
            '-i', str(input_path),
            '-c:v', codec,
            '-preset', preset,
            '-profile:v', FFmpegWrapper._PROFILE,
            '-level', FFmpegWrapper._LEVEL,
            '-pix_fmt', FFmpegWrapper._PIX_FMT,
        ]
        if target_fps:
            command.extend(['-r', str(target_fps)])
        return command

    @staticmethod
    def _build_encoding_params(
        video_bitrate: str, minrate: str, maxrate: str, bufsize: str, gop_size: int,
    ) -> list[str]:
        """Return the rate-control and GOP arguments.

        NOTE(review): flags such as ``-rc vbr_hq`` and ``-spatial-aq`` look
        encoder-specific (presumably NVENC) — confirm against the codecs
        callers pass in.
        """
        return [
            '-rc', 'vbr_hq',
            '-b:v', video_bitrate,
            '-minrate', minrate,
            '-maxrate', maxrate,
            '-bufsize', bufsize,
            '-bf', FFmpegWrapper._BF,
            '-b_adapt', FFmpegWrapper._B_ADAPT,
            '-2pass', FFmpegWrapper._TWO_PASS,
            '-rc-lookahead', FFmpegWrapper._RC_LOOKAHEAD,
            '-aq-strength', FFmpegWrapper._AQ_STRENGTH,
            '-g', str(gop_size),
            '-spatial-aq', '1',
            '-temporal-aq', '1',
            '-multipass', 'fullres',
        ]

    @staticmethod
    def _build_audio_and_output_params(
        audio_bitrate: str, vf_filter: str, output_path: Path,
    ) -> list[str]:
        """Return the audio, filter and MP4 output arguments."""
        return [
            '-c:a', 'aac',
            '-b:a', audio_bitrate,
            '-ac', FFmpegWrapper._AUDIO_CHANNELS,
            '-vf', vf_filter,
            '-movflags', '+faststart',
            '-f', 'mp4',
            str(output_path),
        ]

    @staticmethod
    def transcode(  # pylint: disable=too-many-arguments
        input_path: Path,
        output_path: Path,
        codec: str,
        preset: str,
        resolution: str,
        video_bitrate: str,
        minrate: str,
        maxrate: str,
        bufsize: str,
        audio_bitrate: str,
        gop_size: int,
        target_fps: Optional[float] = None,
    ) -> None:
        """Transcode ``input_path`` to an MP4 at ``output_path``.

        Args:
            resolution: target size as ``'<width>:<height>'``.
            target_fps: output frame rate; the source rate is kept when unset.

        Raises:
            ValueError: if ``resolution`` is not two integers joined by ``:``.
            subprocess.CalledProcessError: if ffmpeg exits with an error.
        """
        width, height = (int(part) for part in resolution.split(':'))
        vf_filter = FFmpegWrapper._build_video_filter(width, height)
        command = FFmpegWrapper._build_base_command(input_path, codec, preset, target_fps)
        command.extend(
            FFmpegWrapper._build_encoding_params(
                video_bitrate, minrate, maxrate, bufsize, gop_size,
            ),
        )
        command.extend(
            FFmpegWrapper._build_audio_and_output_params(
                audio_bitrate, vf_filter, output_path,
            ),
        )
        subprocess.run(command, check=True, capture_output=False)

    @staticmethod
    def probe_video(video_path: Path) -> Dict[str, Any]:
        """Return ffprobe's stream and format information as parsed JSON.

        Raises:
            subprocess.CalledProcessError: if ffprobe exits with an error.
        """
        cmd = ['ffprobe', '-v', 'error', '-show_streams', '-show_format', '-of', 'json', str(video_path)]
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        return json.loads(result.stdout)

    @staticmethod
    def _get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]:
        """Return the first stream whose ``codec_type`` matches, or None."""
        return next(
            (s for s in probe_data.get('streams', []) if s.get('codec_type') == codec_type),
            None,
        )

    @staticmethod
    def get_framerate(probe_data: Dict[str, Any]) -> float:
        """Return the frame rate of the first video stream.

        ``r_frame_rate`` is accepted as ``'num/den'`` or as a bare integer.

        Raises:
            ValueError: if there is no video stream, the field is missing,
                it is malformed, or its denominator is zero.
        """
        stream = FFmpegWrapper._get_stream_by_type(probe_data, 'video')
        if not stream:
            raise ValueError('No video streams found')
        r_frame_rate = stream.get('r_frame_rate')
        if not r_frame_rate:
            raise ValueError('Frame rate not found')
        numerator, _, denominator = str(r_frame_rate).partition('/')
        try:
            return int(numerator) / int(denominator or 1)
        except (ValueError, ZeroDivisionError) as err:
            # Report '0/0' and malformed values the same way as a missing
            # rate instead of leaking a ZeroDivisionError to callers.
            raise ValueError(f'Invalid frame rate: {r_frame_rate!r}') from err

    @staticmethod
    def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]:
        """Return the video bit rate in Mbit/s (two decimals), or None if unknown."""
        stream = FFmpegWrapper._get_stream_by_type(probe_data, 'video')
        if not stream:
            return None
        bit_rate = stream.get('bit_rate')
        if not bit_rate:
            return None
        return round(int(bit_rate) / 1000000, 2)

    @staticmethod
    def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]:
        """Return the audio bit rate in whole kbit/s, or None if unknown."""
        stream = FFmpegWrapper._get_stream_by_type(probe_data, 'audio')
        if not stream:
            return None
        bit_rate = stream.get('bit_rate')
        if not bit_rate:
            return None
        return int(int(bit_rate) / 1000)


# --- preprocessor/lib/media/resolution.py ---

T = TypeVar('T', bound='Resolution')


class Resolution(Enum):
    """Named video resolutions; each value is ``(width, height)`` in pixels."""

    R4320P = (7680, 4320)
    R2160P = (3840, 2160)
    R1440P = (2560, 1440)
    R1080P = (1920, 1080)
    R720P = (1280, 720)
    R480P = (854, 480)
    R360P = (640, 360)
    R240P = (426, 240)
    R144P = (256, 144)

    def __init__(self, width: int, height: int):
        self.width = width
        self.height = height

    def __str__(self):
        return f'{self.height}p'

    @classmethod
    def from_str(cls: Type[T], init: str) -> T:
        """Parse names such as ``'1080p'``, ``'r720p'`` or ``' R480P '``.

        Raises:
            KeyError: if the text does not name a member (including empty
                or whitespace-only input).
        """
        init = init.strip()
        # An empty string has no first character to inspect; route it
        # through the same KeyError path as any other unknown name.
        if not init or not init[0].isalpha():
            init = 'R' + init.upper()
        else:
            init = init.upper()
        return cls[init]
class TransNetWrapper:
    """Scene-boundary detection built on a TransNetV2 model."""

    def __init__(self):
        self.model: Optional[TransNetV2] = None

    def load_model(self) -> None:
        """Instantiate the model on the GPU.

        Raises:
            RuntimeError: if CUDA is not available.
        """
        if not torch.cuda.is_available():
            raise RuntimeError('CUDA not available')
        self.model = TransNetV2().cuda()

    def detect_scenes(
        self,
        video_path: Path,
        threshold: float=0.5,
        min_scene_len: int=15,
    ) -> List[Dict[str, Any]]:
        """Detect scenes in ``video_path``.

        Args:
            threshold: per-frame prediction value above which a frame counts
                as a scene change.
            min_scene_len: minimum scene length in frames.

        Returns:
            Scene dictionaries as produced by ``_create_scene_dict``.

        Raises:
            RuntimeError: if the model is not loaded, the video cannot be
                inspected, its frame rate is not positive, or prediction
                fails.
        """
        if self.model is None:
            raise RuntimeError('Model not loaded. Call load_model() first.')
        video_info = self.get_video_info(video_path)
        if not video_info:
            raise RuntimeError(f'Failed to get video info for {video_path}')
        # Every timestamp divides by fps; reject a non-positive rate before
        # running the expensive prediction instead of failing afterwards
        # with a ZeroDivisionError that the handler below does not cover.
        if video_info['fps'] <= 0:
            raise RuntimeError(f'Invalid frame rate {video_info["fps"]} for {video_path}')
        try:
            _, single_frame_predictions, _ = self.model.predict_video(str(video_path))
            scene_changes = np.where(single_frame_predictions > threshold)[0]
            return self._build_scenes_from_predictions(
                scene_changes,
                video_info,
                min_scene_len,
            )
        except (RuntimeError, ValueError, OSError) as e:
            raise RuntimeError(f'TransNetV2 detection failed: {e}') from e

    def _build_scenes_from_predictions(
        self,
        scene_changes: np.ndarray,
        video_info: Dict[str, Any],
        min_scene_len: int,
    ) -> List[Dict[str, Any]]:
        """Build scene list from frame predictions.

        A change frame closes the current scene only when the scene would be
        at least ``min_scene_len`` frames long; closer changes are ignored.
        The remainder after the last accepted change becomes a final scene
        when it is longer than ``min_scene_len``.
        """
        scenes = []
        fps = video_info['fps']
        prev_frame = 0
        for frame_num in scene_changes:
            if frame_num - prev_frame < min_scene_len:
                continue
            scene = self._create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps)
            scenes.append(scene)
            prev_frame = frame_num
        total_frames = video_info['total_frames']
        # NOTE(review): this uses '>' while the loop keeps scenes of exactly
        # min_scene_len frames — confirm the asymmetry is intended.
        if total_frames - prev_frame > min_scene_len:
            scene = self._create_scene_dict(len(scenes) + 1, prev_frame, total_frames, fps)
            scenes.append(scene)
        return scenes

    @staticmethod
    def get_video_info(video_file: Path) -> Optional[Dict[str, Any]]:
        """Return ``fps``, ``duration`` and ``total_frames`` of a video.

        Returns None when the reader raises RuntimeError, ValueError or
        OSError. ``duration`` is 0 when the reported fps is not positive.
        """
        try:
            vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0))
            fps = vr.get_avg_fps()
            total_frames = len(vr)
            duration = total_frames / fps if fps > 0 else 0
            return {'fps': fps, 'duration': duration, 'total_frames': total_frames}
        except (RuntimeError, ValueError, OSError):
            return None

    def cleanup(self) -> None:
        """Drop the model and release cached GPU memory."""
        if self.model is not None:
            del self.model
            self.model = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    def _create_scene_dict(
        self,
        scene_number: int,
        start_frame: int,
        end_frame: int,
        fps: float,
    ) -> Dict[str, Any]:
        """Describe one scene by frame, seconds and timecode at both ends."""
        return {
            'scene_number': scene_number,
            'start': {
                'frame': int(start_frame),
                'seconds': float(start_frame / fps),
                'timecode': self._frame_to_timecode(start_frame, fps),
            },
            'end': {
                'frame': int(end_frame),
                'seconds': float(end_frame / fps),
                'timecode': self._frame_to_timecode(end_frame, fps),
            },
            'duration': float((end_frame - start_frame) / fps),
            'frame_count': int(end_frame - start_frame),
        }

    @staticmethod
    def _frame_to_timecode(frame: int, fps: float) -> str:
        """Convert a frame index to ``HH:MM:SS:FF``.

        The frame field is the remainder of ``frame`` divided by ``fps``
        rather than ``(frame / fps) % 1 * fps``: the latter loses precision
        as the seconds value grows and can truncate to the previous frame.
        """
        whole_seconds, leftover_frames = divmod(frame, fps)
        hours, remainder = divmod(int(whole_seconds), 3600)
        minutes, secs = divmod(remainder, 60)
        frames = int(leftover_frames)
        return f'{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}'
args=ScraperClipboard._BROWSER_ARGS) + context = browser.new_context() + page = context.new_page() + page.goto(url, wait_until='networkidle', timeout=30000) + page.keyboard.press('Control+A') + page.keyboard.press('Control+C') + clipboard_text = page.evaluate('navigator.clipboard.readText()') + browser.close() + return clipboard_text + except Exception as e: + if logger: + logger.error(f'Clipboard scraping failed: {e}') + return None diff --git a/preprocessor/lib/scraping/crawl4ai.py b/preprocessor/lib/scraping/crawl4ai.py new file mode 100644 index 000000000..1d38097c7 --- /dev/null +++ b/preprocessor/lib/scraping/crawl4ai.py @@ -0,0 +1,54 @@ +import asyncio +from pathlib import Path +from typing import Optional + +from crawl4ai import AsyncWebCrawler +from crawl4ai.async_configs import ( + BrowserConfig, + CrawlerRunConfig, +) +from pathvalidate import sanitize_filename +import ua_generator + +from preprocessor.lib.core.logging import ErrorHandlingLogger + + +class ScraperCrawl4AI: + + @staticmethod + def scrape(url: str, save_markdown: bool=False, output_dir: Optional[Path]=None, logger: Optional[ErrorHandlingLogger]=None) -> Optional[str]: + return asyncio.run(ScraperCrawl4AI.__scrape_async(url, save_markdown, output_dir, logger)) + + @staticmethod + def __sanitize_url_to_filename(url: str) -> str: + return sanitize_filename(url.replace('://', '_').replace('/', '_')) + + @staticmethod + def __save_markdown(content: str, url: str, output_dir: Path, logger: Optional[ErrorHandlingLogger]=None) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + filename = ScraperCrawl4AI.__sanitize_url_to_filename(url) + md_file = output_dir / f'{filename}.md' + with open(md_file, 'w', encoding='utf-8') as f: + f.write(content) + if logger: + logger.info(f'Saved markdown to: {md_file}') + + @staticmethod + async def __scrape_async(url: str, save_markdown: bool=False, output_dir: Optional[Path]=None, logger: Optional[ErrorHandlingLogger]=None) -> Optional[str]: + try: + ua = 
ua_generator.generate() + browser_config = BrowserConfig(headless=True, enable_stealth=True, viewport_width=1920, viewport_height=1080, user_agent=str(ua)) + run_config = CrawlerRunConfig(wait_until='networkidle', page_timeout=60000, delay_before_return_html=2.0) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url=url, config=run_config) + if result.success: + if save_markdown and output_dir: + ScraperCrawl4AI.__save_markdown(result.markdown, url, output_dir, logger) + return result.markdown + if logger: + logger.error(f'Crawl4AI failed: {result.error_message}') + return None + except Exception as e: + if logger: + logger.error(f'Crawl4AI error: {e}') + return None diff --git a/preprocessor/lib/search/__init__.py b/preprocessor/lib/search/__init__.py new file mode 100644 index 000000000..865a9becb --- /dev/null +++ b/preprocessor/lib/search/__init__.py @@ -0,0 +1,8 @@ +from preprocessor.lib.search.elasticsearch import ElasticsearchWrapper +from preprocessor.lib.search.embedding_model import EmbeddingModelWrapper +from preprocessor.modules.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.modules.search.clients.embedding_service import EmbeddingService +from preprocessor.modules.search.clients.hash_service import HashService +from preprocessor.modules.search.clients.result_formatters import ResultFormatter + +__all__ = ['ElasticsearchWrapper', 'EmbeddingModelWrapper', 'ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] diff --git a/preprocessor/lib/search/elasticsearch.py b/preprocessor/lib/search/elasticsearch.py new file mode 100644 index 000000000..949a0ef85 --- /dev/null +++ b/preprocessor/lib/search/elasticsearch.py @@ -0,0 +1,61 @@ +from typing import ( + Any, + Dict, + List, + Optional, +) + +from elasticsearch import AsyncElasticsearch +import urllib3 + +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +class ElasticsearchWrapper: + 
+ def __init__(self, index_name: str, host: str='localhost:9200', dry_run: bool=False) -> None: + self.index_name: str = index_name + self.host: str = host + self.dry_run: bool = dry_run + self._client: Optional[AsyncElasticsearch] = None + + async def _get_client(self) -> AsyncElasticsearch: + if self._client is None: + self._client = AsyncElasticsearch([self.host], verify_certs=False, ssl_show_warn=False) + return self._client + + async def index_exists(self) -> bool: + if self.dry_run: + return False + client = await self._get_client() + return await client.indices.exists(index=self.index_name) + + async def create_index(self, mapping: Dict[str, Any]) -> None: + if self.dry_run: + return + client = await self._get_client() + await client.indices.create(index=self.index_name, body=mapping) + + async def delete_index(self) -> None: + if self.dry_run: + return + client = await self._get_client() + await client.indices.delete(index=self.index_name, ignore=[404]) + + async def bulk_index(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]: + if self.dry_run: + return {'indexed': len(documents), 'errors': []} + client = await self._get_client() + actions = [] + for doc in documents: + actions.append({'index': {'_index': self.index_name}}) + actions.append(doc) + try: + response = await client.bulk(operations=actions) + return response + except Exception as e: + return {'errors': str(e)} + + async def close(self) -> None: + if self._client is not None: + await self._client.close() + self._client = None diff --git a/preprocessor/lib/search/embedding_model.py b/preprocessor/lib/search/embedding_model.py new file mode 100644 index 000000000..671c162f8 --- /dev/null +++ b/preprocessor/lib/search/embedding_model.py @@ -0,0 +1,23 @@ +from typing import ( + List, + Union, +) + +from preprocessor.modules.search.clients.embedding_service import EmbeddingService + + +class EmbeddingModelWrapper: + + def __init__(self, model_name: str, device: str='cuda', batch_size: int=8) 
-> None: + self.model_name: str = model_name + self.device: str = device + self.batch_size: int = batch_size + self._service = EmbeddingService() + + def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]: + if isinstance(text, str): + return self._service.get_text_embedding(text) + return [self._service.get_text_embedding(t) for t in text] + + def encode_image(self, image_path: str) -> List[float]: + return self._service.get_image_embedding(image_path) diff --git a/preprocessor/lib/text/__init__.py b/preprocessor/lib/text/__init__.py new file mode 100644 index 000000000..ba18b5a06 --- /dev/null +++ b/preprocessor/lib/text/__init__.py @@ -0,0 +1,8 @@ +from preprocessor.lib.text.language_config import ( + ENGLISH_CONFIG, + POLISH_CONFIG, + LanguageConfig, +) +from preprocessor.lib.text.text_statistics import TextStatistics + +__all__ = ['TextStatistics', 'LanguageConfig', 'POLISH_CONFIG', 'ENGLISH_CONFIG'] diff --git a/preprocessor/lib/text/language_config.py b/preprocessor/lib/text/language_config.py new file mode 100644 index 000000000..4dabcb0f4 --- /dev/null +++ b/preprocessor/lib/text/language_config.py @@ -0,0 +1,28 @@ +from dataclasses import dataclass +from typing import Set + + +@dataclass +class LanguageConfig: + vowels: Set[str] + consonants: Set[str] + punctuation: Set[str] + special_chars: Set[str] +POLISH_VOWELS = set('aąeęioóuyAĄEĘIOÓUY') +POLISH_CONSONANTS = set('bcćdfghjklłmnńprsśtwzźżBCĆDFGHJKLŁMNŃPRSŚTWZŹŻ') +ENGLISH_VOWELS = set('aeiouAEIOU') +ENGLISH_CONSONANTS = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ') +PUNCTUATION = set('.,;:!?…-—–()[]{}"\'«»„\'\'') +SPECIAL_CHARS = set('@#$%^&*+=<>|\\/_~`') +POLISH_CONFIG = LanguageConfig( + vowels=POLISH_VOWELS | ENGLISH_VOWELS, + consonants=POLISH_CONSONANTS | ENGLISH_CONSONANTS, + punctuation=PUNCTUATION, + special_chars=SPECIAL_CHARS, +) +ENGLISH_CONFIG = LanguageConfig( + vowels=ENGLISH_VOWELS, + consonants=ENGLISH_CONSONANTS, + punctuation=PUNCTUATION, + 
special_chars=SPECIAL_CHARS, +) diff --git a/preprocessor/lib/text/text_statistics.py b/preprocessor/lib/text/text_statistics.py new file mode 100644 index 000000000..5c8702eb1 --- /dev/null +++ b/preprocessor/lib/text/text_statistics.py @@ -0,0 +1,159 @@ +from collections import Counter +from dataclasses import ( + dataclass, + field, +) +from pathlib import Path +import re +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.lib.text.language_config import ( + ENGLISH_CONFIG, + POLISH_CONFIG, + LanguageConfig, +) + + +@dataclass +class TextStatistics: # pylint: disable=too-many-instance-attributes + text: str + language: str = 'pl' + sentences: int = 0 + lines: int = 0 + paragraphs: int = 0 + empty_lines: int = 0 + words: int = 0 + letters: int = 0 + digits: int = 0 + symbols: int = 0 + punctuation_marks: int = 0 + special_characters: int = 0 + chars_without_spaces: int = 0 + spaces: int = 0 + total_chars: int = 0 + vowels: int = 0 + consonants: int = 0 + unique_words: int = 0 + avg_word_length: float = 0.0 + avg_sentence_length: float = 0.0 + type_token_ratio: float = 0.0 + letter_frequency: Dict[str, int] = field(default_factory=dict) + word_frequency: List[Dict[str, Any]] = field(default_factory=list) + bigrams: List[Dict[str, Any]] = field(default_factory=list) + trigrams: List[Dict[str, Any]] = field(default_factory=list) + + @classmethod + def from_file(cls, file_path: Path, language: str='pl') -> 'TextStatistics': + with open(file_path, 'r', encoding='utf-8') as f: + text = f.read() + stats = cls(text=text, language=language) + stats.calculate() + return stats + + @classmethod + def from_text(cls, text: str, language: str='pl') -> 'TextStatistics': + stats = cls(text=text, language=language) + stats.calculate() + return stats + + def calculate(self) -> None: + self.__calculate_basic_stats() + self.__calculate_character_stats() + self.__calculate_word_stats() + self.__calculate_advanced_stats() + + def __get_config(self) -> LanguageConfig: + 
return POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG + + def __get_words(self) -> List[str]: + return re.findall('\\b\\w+\\b', self.text.lower()) + + def __calculate_basic_stats(self) -> None: + lines = self.text.split('\n') + self.lines = len(lines) + self.empty_lines = sum((1 for line in lines if not line.strip())) + paragraphs = self.text.split('\n\n') + self.paragraphs = len([p for p in paragraphs if p.strip()]) + sentence_pattern = '[.!?…]+(?:\\s|$)' + self.sentences = len(re.findall(sentence_pattern, self.text)) + self.total_chars = len(self.text) + self.spaces = self.text.count(' ') + self.text.count('\t') + self.text.count('\n') + self.chars_without_spaces = self.total_chars - self.spaces + + def __calculate_character_stats(self) -> None: + config = self.__get_config() + letter_counter = Counter() + for char in self.text: + if char.isalpha(): + self.letters += 1 + letter_counter[char.lower()] += 1 + if char in config.vowels: + self.vowels += 1 + elif char in config.consonants: + self.consonants += 1 + elif char.isdigit(): + self.digits += 1 + elif char in config.punctuation: + self.punctuation_marks += 1 + elif char in config.special_chars: + self.special_characters += 1 + elif not char.isspace(): + self.symbols += 1 + self.letter_frequency = dict(sorted(letter_counter.items(), key=lambda x: x[1], reverse=True)) + + def __calculate_word_stats(self) -> None: + words = self.__get_words() + self.words = len(words) + if self.words > 0: + word_counter = Counter(words) + self.unique_words = len(word_counter) + self.type_token_ratio = round(self.unique_words / self.words, 4) if self.words > 0 else 0.0 + word_lengths = [len(w) for w in words] + self.avg_word_length = round(sum(word_lengths) / len(word_lengths), 2) if word_lengths else 0.0 + self.word_frequency = [{'word': word, 'count': count} for word, count in word_counter.most_common(50)] + + def __calculate_advanced_stats(self) -> None: + if self.sentences > 0: + self.avg_sentence_length = 
round(self.words / self.sentences, 2) + words = self.__get_words() + if len(words) >= 2: + bigram_counter = Counter(zip(words[:-1], words[1:])) + self.bigrams = [{'bigram': f'{w1} {w2}', 'count': count} for (w1, w2), count in bigram_counter.most_common(25)] + if len(words) >= 3: + trigram_counter = Counter(zip(words[:-2], words[1:-1], words[2:])) + self.trigrams = [{'trigram': f'{w1} {w2} {w3}', 'count': count} for (w1, w2, w3), count in trigram_counter.most_common(25)] + + def to_dict(self) -> Dict[str, Any]: + return { + 'basic_statistics': { + 'sentences': self.sentences, + 'lines': self.lines, + 'paragraphs': self.paragraphs, + 'empty_lines': self.empty_lines, + 'words': self.words, + 'letters': self.letters, + 'digits': self.digits, + 'symbols': self.symbols, + 'punctuation_marks': self.punctuation_marks, + 'special_characters': self.special_characters, + 'chars_without_spaces': self.chars_without_spaces, + 'spaces': self.spaces, + 'total_chars': self.total_chars, + 'vowels': self.vowels, + 'consonants': self.consonants, + }, + 'advanced_statistics': { + 'unique_words': self.unique_words, + 'avg_word_length': self.avg_word_length, + 'avg_sentence_length': self.avg_sentence_length, + 'type_token_ratio': self.type_token_ratio, + }, + 'letter_frequency': self.letter_frequency, + 'word_frequency': self.word_frequency, + 'bigrams': self.bigrams, + 'trigrams': self.trigrams, + } diff --git a/preprocessor/lib/transcription/__init__.py b/preprocessor/lib/transcription/__init__.py new file mode 100644 index 000000000..f82f7cdd7 --- /dev/null +++ b/preprocessor/lib/transcription/__init__.py @@ -0,0 +1,25 @@ +from preprocessor.lib.transcription.generators.json_generator import JsonGenerator +from preprocessor.lib.transcription.processors.audio_normalizer import AudioNormalizer +from preprocessor.lib.transcription.processors.episode_info_processor import EpisodeInfoProcessor +from preprocessor.lib.transcription.processors.normalized_audio_processor import 
NormalizedAudioProcessor +from preprocessor.lib.transcription.sound_classification import ( + classify_segment, + is_sound_event, +) +from preprocessor.lib.transcription.utils import ( + TranscriptionUtils, + WhisperUtils, +) +from preprocessor.lib.transcription.whisper import Whisper + +__all__ = [ + 'JsonGenerator', + 'AudioNormalizer', + 'EpisodeInfoProcessor', + 'NormalizedAudioProcessor', + 'classify_segment', + 'is_sound_event', + 'TranscriptionUtils', + 'WhisperUtils', + 'Whisper', +] diff --git a/preprocessor/transcription/elevenlabs.py b/preprocessor/lib/transcription/elevenlabs.py similarity index 52% rename from preprocessor/transcription/elevenlabs.py rename to preprocessor/lib/transcription/elevenlabs.py index 2dee53b12..55de2bf8b 100644 --- a/preprocessor/transcription/elevenlabs.py +++ b/preprocessor/lib/transcription/elevenlabs.py @@ -12,25 +12,25 @@ from preprocessor.config.config import settings from preprocessor.core.base_processor import BaseProcessor -from preprocessor.episodes import EpisodeManager -from preprocessor.transcription.engines.elevenlabs_engine import ElevenLabsEngine -from preprocessor.transcription.generators.multi_format_generator import MultiFormatGenerator -from preprocessor.utils.console import ( +from preprocessor.lib.episodes import EpisodeManager +from preprocessor.lib.transcription.engines.elevenlabs_engine import ElevenLabsEngine +from preprocessor.lib.transcription.generators.multi_format_generator import MultiFormatGenerator +from preprocessor.lib.ui.console import ( + SimpleProgress, console, - create_progress, ) class ElevenLabsTranscriber(BaseProcessor): - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos is required") - if "output_dir" not in args: - raise ValueError("output_dir is required") - if "series_name" not in args: - raise ValueError("series_name is required") - videos_path = Path(args["videos"]) + def _validate_args(self, args: Dict[str, Any]) -> 
None: + if 'videos' not in args: + raise ValueError('videos is required') + if 'output_dir' not in args: + raise ValueError('output_dir is required') + if 'series_name' not in args: + raise ValueError('series_name is required') + videos_path = Path(args['videos']) if not videos_path.is_dir(): raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") @@ -38,84 +38,57 @@ def get_output_subdir(self) -> str: return settings.output_subdirs.transcriptions def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=5, - loglevel=logging.DEBUG, - ) - - self.input_videos: Path = Path(self._args["videos"]) - self.output_dir: Path = Path(self._args["output_dir"]) + super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=5, loglevel=logging.DEBUG) + self.input_videos: Path = Path(self._args['videos']) + self.output_dir: Path = Path(self._args['output_dir']) self.output_dir.mkdir(parents=True, exist_ok=True) - - self.episodes_info_json: Optional[Path] = self._args.get("episodes_info_json") - - self.model_id: str = self._args.get("model_id", "scribe_v1") - self.language_code: str = self._args.get("language_code", "pol") - self.diarize: bool = self._args.get("diarize", True) - - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name) - - self.engine = ElevenLabsEngine( - model_id=self.model_id, - language_code=self.language_code, - diarize=self.diarize, - ) + self.episodes_info_json: Optional[Path] = self._args.get('episodes_info_json') + self.model_id: str = self._args.get('model_id', 'scribe_v1') + self.language_code: str = self._args.get('language_code', 'pol') + self.diarize: bool = self._args.get('diarize', True) + self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name, self.logger) + self.engine = ElevenLabsEngine(logger=self.logger, model_id=self.model_id, language_code=self.language_code, diarize=self.diarize) 
def _execute(self) -> None: video_files: List[Path] = [] for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) + video_files.extend(self.input_videos.rglob(f'*{ext}')) video_files = sorted(video_files) - if not video_files: - self.logger.warning("No video files found") + self.logger.warning('No video files found') return - - console.print(f"[blue]Found {len(video_files)} videos to transcribe with 11labs[/blue]") - + console.print(f'[blue]Found {len(video_files)} videos to transcribe with 11labs[/blue]') try: - with create_progress() as progress: - task = progress.add_task("Transcribing with 11labs...", total=len(video_files)) - + with SimpleProgress() as progress: + task = progress.add_task('Transcribing with 11labs...', total=len(video_files)) for video_file in video_files: episode_id = video_file.stem - - if self.state_manager and self.state_manager.is_step_completed("transcribe_11labs", episode_id): - console.print(f"[yellow]Skipping (already done): {episode_id}[/yellow]") + if self.state_manager and self.state_manager.is_step_completed('transcribe_11labs', episode_id): + console.print(f'[yellow]Skipping (already done): {episode_id}[/yellow]') progress.advance(task) continue - audio_path = None try: if self.state_manager: audio_path = self.__extract_audio(video_file) - self.state_manager.mark_step_started("transcribe_11labs", episode_id, [str(audio_path)]) - + self.state_manager.mark_step_started('transcribe_11labs', episode_id, [str(audio_path)]) audio_path = audio_path or self.__extract_audio(video_file) transcription_data = self.engine.transcribe(audio_path) - self.__save_transcription(transcription_data, video_file) - if self.state_manager: - self.state_manager.mark_step_completed("transcribe_11labs", episode_id) - + self.state_manager.mark_step_completed('transcribe_11labs', episode_id) except Exception as e: - self.logger.error(f"Failed to transcribe {video_file.name}: {e}") - + self.logger.error(f'Failed to 
transcribe {video_file.name}: {e}') finally: if audio_path and audio_path.exists(): audio_path.unlink() - progress.advance(task) except KeyboardInterrupt: - console.print("\n[yellow]Transcription interrupted[/yellow]") + console.print('\n[yellow]Transcription interrupted[/yellow]') raise - - console.print("[blue]Generating multi-format outputs (SRT, TXT, etc.)...[/blue]") + console.print('[blue]Generating multi-format outputs (SRT, TXT, etc.)...[/blue]') if self.episodes_info_json: - jsons_source_dir = self.output_dir / "json" + jsons_source_dir = self.output_dir / 'json' multi_format_gen = MultiFormatGenerator( jsons_dir=jsons_source_dir, episodes_info_json=self.episodes_info_json, @@ -129,96 +102,71 @@ def _execute(self) -> None: def __create_segments_from_words(words: List[Dict]) -> List[Dict]: if not words: return [] - segments = [] current_segment_words = [] current_speaker = None - for word in words: - speaker_id = word.get("speaker_id", "speaker_unknown") - + speaker_id = word.get('speaker_id', 'speaker_unknown') if current_speaker is None: current_speaker = speaker_id current_segment_words = [word] elif speaker_id == current_speaker: current_segment_words.append(word) else: - segment_text = " ".join(w.get("text", "") for w in current_segment_words).strip() - segments.append({ - "text": segment_text, - "words": current_segment_words, - }) + segment_text = ' '.join((w.get('text', '') for w in current_segment_words)).strip() + segments.append({'text': segment_text, 'words': current_segment_words}) current_speaker = speaker_id current_segment_words = [word] - if current_segment_words: - segment_text = " ".join(w.get("text", "") for w in current_segment_words).strip() - segments.append({ - "text": segment_text, - "words": current_segment_words, - }) - + segment_text = ' '.join((w.get('text', '') for w in current_segment_words)).strip() + segments.append({'text': segment_text, 'words': current_segment_words}) return segments @staticmethod def 
__extract_audio(video_file: Path) -> Path: temp_dir = Path(tempfile.gettempdir()) - audio_path = temp_dir / f"{video_file.stem}_audio.mp3" - + audio_path = temp_dir / f'{video_file.stem}_audio.mp3' command = [ - "ffmpeg", - "-v", "error", - "-hide_banner", - "-y", - "-i", str(video_file), - "-vn", - "-acodec", "libmp3lame", - "-ar", "16000", - "-ac", "1", - "-b:a", "64k", + 'ffmpeg', '-v', 'error', '-hide_banner', '-y', + '-i', str(video_file), + '-vn', '-acodec', 'libmp3lame', + '-ar', '16000', '-ac', '1', '-b:a', '64k', str(audio_path), ] - subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) return audio_path def __save_transcription(self, data: Dict[str, Any], video_file: Path) -> None: episode_info = self.episode_manager.parse_filename(video_file) if not episode_info: - self.logger.error(f"Cannot parse episode info from {video_file.name}") + self.logger.error(f'Cannot parse episode info from {video_file.name}') return - - api_segments = data.get("segments", []) - api_words = data.get("words", []) - + api_segments = data.get('segments', []) + api_words = data.get('words', []) if api_segments: segments = api_segments words = [] for segment in segments: - segment_words = segment.get("words", []) + segment_words = segment.get('words', []) for word in segment_words: - if "speaker_id" not in word and "speaker" in segment: - word["speaker_id"] = segment["speaker"] + if 'speaker_id' not in word and 'speaker' in segment: + word['speaker_id'] = segment['speaker'] words.extend(segment_words) else: words = api_words segments = self.__create_segments_from_words(words) - output_data = { - "text": data.get("text", ""), - "language_code": data.get("language_code", "pol"), - "segments": segments, - "words": words, - "episode_info": EpisodeManager.get_metadata(episode_info), + 'text': data.get('text', ''), + 'language_code': data.get('language_code', 'pol'), + 'segments': segments, + 'words': words, + 'episode_info': 
EpisodeManager.get_metadata(episode_info), } - - json_dir = self.output_dir / "json" - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") + json_dir = self.output_dir / 'json' + filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json') season_dir = json_dir / episode_info.season_code() output_file = season_dir / filename output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: + with open(output_file, 'w', encoding='utf-8') as f: json.dump(output_data, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Saved transcription: {output_file.name}") + self.logger.info(f'Saved transcription: {output_file.name}') diff --git a/preprocessor/cli/pipeline/__init__.py b/preprocessor/lib/transcription/engines/__init__.py similarity index 100% rename from preprocessor/cli/pipeline/__init__.py rename to preprocessor/lib/transcription/engines/__init__.py diff --git a/preprocessor/transcription/engines/base_engine.py b/preprocessor/lib/transcription/engines/base_engine.py similarity index 91% rename from preprocessor/transcription/engines/base_engine.py rename to preprocessor/lib/transcription/engines/base_engine.py index 7ef4474fc..fb39ba8ae 100644 --- a/preprocessor/transcription/engines/base_engine.py +++ b/preprocessor/lib/transcription/engines/base_engine.py @@ -10,10 +10,11 @@ class TranscriptionEngine(ABC): + @abstractmethod def transcribe(self, audio_path: Path) -> Dict[str, Any]: - pass + ... @abstractmethod def get_name(self) -> str: - pass + ... 
diff --git a/preprocessor/transcription/engines/elevenlabs_engine.py b/preprocessor/lib/transcription/engines/elevenlabs_engine.py similarity index 51% rename from preprocessor/transcription/engines/elevenlabs_engine.py rename to preprocessor/lib/transcription/engines/elevenlabs_engine.py index 495632a2e..30386d203 100644 --- a/preprocessor/transcription/engines/elevenlabs_engine.py +++ b/preprocessor/lib/transcription/engines/elevenlabs_engine.py @@ -1,5 +1,4 @@ import json -import logging from pathlib import Path import time from typing import ( @@ -12,144 +11,112 @@ from elevenlabs.core import ApiError from preprocessor.config.config import settings -from preprocessor.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.utils.console import console +from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.transcription.engines.base_engine import TranscriptionEngine +from preprocessor.lib.ui.console import console class ElevenLabsEngine(TranscriptionEngine): + def __init__( self, - model_id: Optional[str] = None, - language_code: Optional[str] = None, - diarize: Optional[bool] = None, - polling_interval: Optional[int] = None, + logger: ErrorHandlingLogger, + model_id: Optional[str]=None, + language_code: Optional[str]=None, + diarize: Optional[bool]=None, + polling_interval: Optional[int]=None, ): if not settings.elevenlabs.api_key: - raise ValueError( - "ElevenLabs API key not provided. Set ELEVEN_API_KEY environment variable.", - ) - + raise ValueError('ElevenLabs API key not provided. 
Set ELEVEN_API_KEY environment variable.') self.client = ElevenLabs(api_key=settings.elevenlabs.api_key) self.model_id = model_id or settings.elevenlabs.model_id self.language_code = language_code or settings.elevenlabs.language_code self.diarize = diarize if diarize is not None else settings.elevenlabs.diarize self.polling_interval = polling_interval or settings.elevenlabs.polling_interval - self.additional_formats = [ - {"format": "srt"}, + {'format': 'srt'}, { - "format": "segmented_json", - "include_speakers": True, - "include_timestamps": True, - "segment_on_silence_longer_than_s": 0.5, - "max_segment_duration_s": 10.0, - "max_segment_chars": 200, + 'format': 'segmented_json', + 'include_speakers': True, + 'include_timestamps': True, + 'segment_on_silence_longer_than_s': 0.5, + 'max_segment_duration_s': 10.0, + 'max_segment_chars': 200, }, ] - - self.logger = logging.getLogger(self.__class__.__name__) + self._logger: ErrorHandlingLogger = logger def transcribe(self, audio_path: Path) -> Dict[str, Any]: - console.print(f"[cyan]Transcribing with 11labs: {audio_path.name}[/cyan]") - + console.print(f'[cyan]Transcribing with 11labs: {audio_path.name}[/cyan]') if not audio_path.exists(): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - + raise FileNotFoundError(f'Audio file not found: {audio_path}') transcription_id = self.__submit_job(audio_path) result = self.__poll_for_results(transcription_id) - - console.print(f"[green]Transcription completed: {audio_path.name}[/green]") - + console.print(f'[green]Transcription completed: {audio_path.name}[/green]') return self.__convert_to_unified_format(result) def __submit_job(self, audio_path: Path) -> str: try: - with open(audio_path, "rb") as audio_file: + with open(audio_path, 'rb') as audio_file: audio_data = audio_file.read() - submit_response = self.client.speech_to_text.convert( file=audio_data, model_id=self.model_id, language_code=self.language_code, tag_audio_events=True, - 
timestamps_granularity="character", + timestamps_granularity='character', diarize=self.diarize, use_multi_channel=False, additional_formats=self.additional_formats, webhook=True, ) - - self.logger.info(f"Job submitted. ID: {submit_response.transcription_id}") + self._logger.info(f'Job submitted. ID: {submit_response.transcription_id}') return submit_response.transcription_id - except ApiError as e: - self.logger.error(f"API error during job submission: {e.body}") + self._logger.error(f'API error during job submission: {e.body}') raise def __poll_for_results(self, transcription_id: str): - self.logger.info(f"Polling for results (ID: {transcription_id})...") - + self._logger.info(f'Polling for results (ID: {transcription_id})...') max_attempts = settings.elevenlabs.max_attempts attempt = 0 - while attempt < max_attempts: try: - result = self.client.speech_to_text.transcripts.get( - transcription_id=transcription_id, - ) - - self.logger.info("Transcription complete!") + result = self.client.speech_to_text.transcripts.get(transcription_id=transcription_id) + self._logger.info('Transcription complete!') return result - except ApiError as e: if e.status_code == 404: - self.logger.info(" ...Processing. Waiting...") + self._logger.info(' ...Processing. 
Waiting...') time.sleep(self.polling_interval) attempt += 1 else: - self.logger.error(f"API error during polling: {e.body}") + self._logger.error(f'API error during polling: {e.body}') raise - - raise TimeoutError(f"Transcription timeout after {max_attempts} attempts") + raise TimeoutError(f'Transcription timeout after {max_attempts} attempts') @staticmethod def __convert_to_unified_format(result) -> Dict[str, Any]: - unified_data = { - "text": result.text, - "language_code": result.language_code, - "segments": [], - } - + unified_data = {'text': result.text, 'language_code': result.language_code, 'segments': []} if result.additional_formats: for fmt in result.additional_formats: - if fmt.requested_format == "segmented_json": + if fmt.requested_format == 'segmented_json': segmented_data = json.loads(fmt.content) - - for seg in segmented_data.get("segments", []): - words = seg.get("words", []) + for seg in segmented_data.get('segments', []): + words = seg.get('words', []) if not words: continue - - non_spacing_words = [w for w in words if w.get("type") != "spacing"] - - segment = { - "text": seg.get("text", "").strip(), - "words": words, - } - + non_spacing_words = [w for w in words if w.get('type') != 'spacing'] + segment = {'text': seg.get('text', '').strip(), 'words': words} if non_spacing_words: first_word = non_spacing_words[0] last_word = non_spacing_words[-1] - - segment["start"] = first_word.get("start") - segment["end"] = last_word.get("end") - segment["speaker"] = first_word.get("speaker_id") - - unified_data["segments"].append(segment) - + segment['start'] = first_word.get('start') + segment['end'] = last_word.get('end') + segment['speaker'] = first_word.get('speaker_id') + unified_data['segments'].append(segment) break - return unified_data def get_name(self) -> str: - return "ElevenLabs" + return 'ElevenLabs' diff --git a/preprocessor/lib/transcription/engines/whisper_engine.py b/preprocessor/lib/transcription/engines/whisper_engine.py new file mode 
100644 index 000000000..9cb84427c --- /dev/null +++ b/preprocessor/lib/transcription/engines/whisper_engine.py @@ -0,0 +1,49 @@ +import gc +from pathlib import Path +from typing import ( + Any, + Dict, +) + +from faster_whisper import WhisperModel +import torch + +from preprocessor.lib.transcription.engines.base_engine import TranscriptionEngine +from preprocessor.lib.transcription.whisper import WhisperUtils +from preprocessor.lib.ui.console import console + + +class WhisperEngine(TranscriptionEngine): + + def __init__(self, model: str='large-v3-turbo', language: str='Polish', device: str='cuda'): + self.model_name = model + self.language = language + self.device = device + if device != 'cuda': + raise ValueError(f'Only GPU (cuda) is supported, got device={device}') + compute_type = 'float16' + console.print(f'[cyan]Loading Whisper model: {model} on {device} with compute_type={compute_type}[/cyan]') + self.model = WhisperModel(model, device=device, compute_type=compute_type) + console.print('[green]✓ Whisper model loaded[/green]') + + def transcribe(self, audio_path: Path) -> Dict[str, Any]: + console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') + if not audio_path.exists(): + raise FileNotFoundError(f'Audio file not found: {audio_path}') + language_code = WhisperUtils.get_language_code(self.language) + segments, info = self.model.transcribe(str(audio_path), language=language_code, beam_size=10, word_timestamps=True, condition_on_previous_text=False) + result = WhisperUtils.build_transcription_result(segments, language=info.language) + console.print(f'[green]✓ Transcription completed: {audio_path.name}[/green]') + return result + + def get_name(self) -> str: + return f'Whisper-{self.model_name}' + + def cleanup(self) -> None: + console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') + if hasattr(self, 'model'): + del self.model + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + 
console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') diff --git a/preprocessor/embeddings/__init__.py b/preprocessor/lib/transcription/generators/__init__.py similarity index 100% rename from preprocessor/embeddings/__init__.py rename to preprocessor/lib/transcription/generators/__init__.py diff --git a/preprocessor/transcription/generators/base_generator.py b/preprocessor/lib/transcription/generators/base_generator.py similarity index 61% rename from preprocessor/transcription/generators/base_generator.py rename to preprocessor/lib/transcription/generators/base_generator.py index c1b825933..95143e82d 100644 --- a/preprocessor/transcription/generators/base_generator.py +++ b/preprocessor/lib/transcription/generators/base_generator.py @@ -9,37 +9,30 @@ Dict, ) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.lib.core.logging import ErrorHandlingLogger class BaseTranscriptionGenerator(ABC): - def __init__( - self, - input_dir: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - ): + + def __init__(self, input_dir: Path, output_dir: Path, logger: ErrorHandlingLogger): self.input_dir = input_dir self.output_dir = output_dir self.logger = logger def generate(self) -> None: self.output_dir.mkdir(parents=True, exist_ok=True) - - for json_file in self.input_dir.rglob("*.json"): + for json_file in self.input_dir.rglob('*.json'): try: - with open(json_file, "r", encoding="utf-8") as f: + with open(json_file, 'r', encoding='utf-8') as f: data = json.load(f) - self._process_file(json_file, data) - except Exception as e: - self.logger.error(f"Failed to generate output for {json_file}: {e}") + self.logger.error(f'Failed to generate output for {json_file}: {e}') @abstractmethod def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass + ... @abstractmethod def _get_output_filename(self, json_file: Path) -> str: - pass + ... 
diff --git a/preprocessor/lib/transcription/generators/json_generator.py b/preprocessor/lib/transcription/generators/json_generator.py new file mode 100644 index 000000000..3bd6fc34a --- /dev/null +++ b/preprocessor/lib/transcription/generators/json_generator.py @@ -0,0 +1,74 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + Literal, +) + +from preprocessor.config.constants import ( + FILE_EXTENSIONS, + FILE_SUFFIXES, +) +from preprocessor.lib.transcription.generators.base_generator import BaseTranscriptionGenerator +from preprocessor.lib.transcription.utils import TranscriptionUtils + + +class JsonGenerator(BaseTranscriptionGenerator): + + def __init__(self, format_type: Literal['full', 'simple', 'segmented'], *args, **kwargs): + super().__init__(*args, **kwargs) + self.format_type = format_type + + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + ... + + def _get_output_filename(self, json_file: Path) -> str: + if self.format_type == 'full': + return json_file.name + suffix = FILE_SUFFIXES[self.format_type] + return json_file.name.replace(FILE_EXTENSIONS['json'], f"{suffix}{FILE_EXTENSIONS['json']}") + + def convert(self, data: Dict[str, Any]) -> Dict[str, Any]: + if self.format_type == 'full': + return self.convert_to_full_format(data) + if self.format_type == 'simple': + return self.convert_to_simple_format(data) + if self.format_type == 'segmented': + return self.convert_to_segmented_format(data) + raise ValueError(f'Unknown format type: {self.format_type}') + + @staticmethod + def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: + segments = data.get('segments', []) + full_text = ' '.join((seg.get('text', '').strip() for seg in segments)) + language_code = data.get('language', 'pol') + if language_code in {'Polish', 'polish'}: + language_code = 'pol' + words = [] + for seg in segments: + seg_words = seg.get('words', []) + words.extend(TranscriptionUtils.convert_words_list(seg_words)) + return 
{'language_code': language_code, 'language_probability': 1.0, 'text': full_text, 'words': words} + + @staticmethod + def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: + segments = data.get('segments', []) + result_segments = [] + for seg in segments: + text = seg.get('text', '').strip() + seg_words = seg.get('words', []) + speaker = 'speaker_unknown' + if seg_words: + speaker = seg_words[0].get('speaker_id', 'speaker_unknown') + result_segments.append({'speaker': speaker, 'text': text}) + return {'segments': result_segments} + + @staticmethod + def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: + segments = data.get('segments', []) + result_segments = [] + for seg in segments: + text = seg.get('text', '').strip() + seg_words = seg.get('words', []) + result_segments.append({'text': text, 'words': TranscriptionUtils.convert_words_list(seg_words)}) + return {'segments': result_segments} diff --git a/preprocessor/lib/transcription/generators/multi_format_generator.py b/preprocessor/lib/transcription/generators/multi_format_generator.py new file mode 100644 index 000000000..2c368a6c4 --- /dev/null +++ b/preprocessor/lib/transcription/generators/multi_format_generator.py @@ -0,0 +1,156 @@ +import json +from pathlib import Path +from typing import ( + Any, + Dict, + Optional, +) + +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) +from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.episodes import EpisodeManager +from preprocessor.lib.transcription.generators.json_generator import JsonGenerator +from preprocessor.lib.transcription.generators.srt_generator import SrtGenerator +from preprocessor.lib.transcription.generators.txt_generator import TxtGenerator + + +class MultiFormatGenerator: + + def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_base_path: Path, logger: ErrorHandlingLogger, series_name: str=''): + self.jsons_dir = jsons_dir + 
self.output_base_path = output_base_path + self.logger = logger + self.series_name = series_name.lower() if series_name else 'unknown' + self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, logger) + + def __call__(self) -> None: + self.generate() + + def generate(self) -> None: + for transcription_file in self.jsons_dir.rglob('*.json'): + self.__process_file(transcription_file) + + def __load_transcription(self, transcription_file: Path) -> Optional[Dict[str, Any]]: + try: + with open(transcription_file, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + self.logger.error(f'Failed to load transcription {transcription_file}: {e}') + return None + + def __check_if_already_processed(self, episode_info) -> bool: + filename = self.episode_manager.path_manager.build_filename( + episode_info, extension='json', + ) + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + main_output_file = ( + get_base_output_dir(self.series_name) + / settings.output_subdirs.transcriptions + / season_code + / episode_code + / 'raw' + / filename + ) + if main_output_file.exists(): + self.logger.info( + f'Skipping (already exists): {episode_info.episode_code()}', + ) + return True + return False + + def __generate_all_formats( + self, transcription: Dict[str, Any], episode_info, + ) -> None: + episode_metadata = EpisodeManager.get_metadata(episode_info) + transcription_with_info = {'episode_info': episode_metadata, **transcription} + self.__generate_full_json(transcription_with_info, episode_info) + self.__generate_segmented_json(transcription, episode_info) + self.__generate_simple_json(transcription, episode_info) + self.__generate_srt(transcription, episode_info) + self.__generate_txt(transcription, episode_info) + + def __process_file(self, transcription_file: Path) -> None: + try: + transcription = self.__load_transcription(transcription_file) + if not transcription: + return + episode_info = 
self.episode_manager.parse_filename(transcription_file) + if not episode_info: + self.logger.error( + f'Cannot extract episode info from {transcription_file.name}', + ) + return + if self.__check_if_already_processed(episode_info): + return + self.__generate_all_formats(transcription, episode_info) + except Exception as e: + self.logger.error(f'Error processing file {transcription_file}: {e}') + + def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: + filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json') + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename + output_file.parent.mkdir(parents=True, exist_ok=True) + generator = JsonGenerator('full', Path(''), output_file.parent, self.logger) + full_json = generator.convert_to_full_format(data) + full_json['episode_info'] = data.get('episode_info', {}) + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(full_json, f, indent=2, ensure_ascii=False) + self.logger.info(f'Generated full JSON: {output_file}') + + def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: + filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json', suffix='segmented') + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename + output_file.parent.mkdir(parents=True, exist_ok=True) + generator = JsonGenerator('segmented', Path(''), output_file.parent, self.logger) + segmented_json = generator.convert_to_segmented_format(data) + segmented_json['episode_info'] = {'season': episode_info.season, 'episode_number': episode_info.relative_episode} + with open(output_file, 'w', 
encoding='utf-8') as f: + json.dump(segmented_json, f, indent=2, ensure_ascii=False) + self.logger.info(f'Generated segmented JSON: {output_file}') + + def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: + filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json', suffix='simple') + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename + output_file.parent.mkdir(parents=True, exist_ok=True) + generator = JsonGenerator('simple', Path(''), output_file.parent, self.logger) + simple_json = generator.convert_to_simple_format(data) + simple_json['episode_info'] = {'season': episode_info.season, 'episode_number': episode_info.relative_episode} + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(simple_json, f, indent=2, ensure_ascii=False) + self.logger.info(f'Generated simple JSON: {output_file}') + + def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: + filename = self.episode_manager.path_manager.build_filename(episode_info, extension='srt') + season_code = episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.parent.mkdir(parents=True, exist_ok=True) + generator = SrtGenerator(Path(''), output_file.parent, self.logger) + srt_content = generator.convert_to_srt_format(data) + with open(output_file, 'w', encoding='utf-8') as f: + f.write(srt_content) + self.logger.info(f'Generated SRT: {output_file}') + + def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: + filename = self.episode_manager.path_manager.build_filename(episode_info, extension='txt') + season_code = 
episode_info.season_code() + episode_code = episode_info.episode_num() + output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename + output_file.parent.mkdir(parents=True, exist_ok=True) + output_file.parent.mkdir(parents=True, exist_ok=True) + generator = TxtGenerator(Path(''), output_file.parent, self.logger) + txt_content = generator.convert_to_txt_format(data) + with open(output_file, 'w', encoding='utf-8') as f: + f.write(txt_content) + self.logger.info(f'Generated TXT: {output_file}') diff --git a/preprocessor/lib/transcription/generators/srt_generator.py b/preprocessor/lib/transcription/generators/srt_generator.py new file mode 100644 index 000000000..1b7d23bc9 --- /dev/null +++ b/preprocessor/lib/transcription/generators/srt_generator.py @@ -0,0 +1,44 @@ +from pathlib import Path +from typing import ( + Any, + Dict, +) + +from preprocessor.config.constants import FILE_EXTENSIONS +from preprocessor.lib.transcription.generators.base_generator import BaseTranscriptionGenerator + + +class SrtGenerator(BaseTranscriptionGenerator): + + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + ... 
+ + def _get_output_filename(self, json_file: Path) -> str: + return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['srt']) + + def convert_to_srt_format(self, data: Dict[str, Any]) -> str: + segments = data.get('segments', []) + srt_lines = [] + index = 1 + for seg in segments: + start = seg.get('start', 0.0) + end = seg.get('end', 0.0) + text = seg.get('text', '').strip() + if not text: + continue + start_time = self.__format_timestamp(start) + end_time = self.__format_timestamp(end) + srt_lines.append(f'{index}') + srt_lines.append(f'{start_time} --> {end_time}') + srt_lines.append(text) + srt_lines.append('') + index += 1 + return '\n'.join(srt_lines) + + @staticmethod + def __format_timestamp(seconds: float) -> str: + hours = int(seconds // 3600) + minutes = int(seconds % 3600 // 60) + secs = int(seconds % 60) + millis = int(seconds % 1 * 1000) + return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' diff --git a/preprocessor/transcription/generators/txt_generator.py b/preprocessor/lib/transcription/generators/txt_generator.py similarity index 54% rename from preprocessor/transcription/generators/txt_generator.py rename to preprocessor/lib/transcription/generators/txt_generator.py index b966db2d5..aee720036 100644 --- a/preprocessor/transcription/generators/txt_generator.py +++ b/preprocessor/lib/transcription/generators/txt_generator.py @@ -4,25 +4,24 @@ Dict, ) -from preprocessor.core.constants import FILE_EXTENSIONS -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator +from preprocessor.config.constants import FILE_EXTENSIONS +from preprocessor.lib.transcription.generators.base_generator import BaseTranscriptionGenerator class TxtGenerator(BaseTranscriptionGenerator): + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass + ... 
def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], FILE_EXTENSIONS["txt"]) + return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['txt']) @staticmethod def convert_to_txt_format(data: Dict[str, Any]) -> str: - segments = data.get("segments", []) - + segments = data.get('segments', []) text_parts = [] for seg in segments: - text = seg.get("text", "").strip() + text = seg.get('text', '').strip() if text: text_parts.append(text) - - return " ".join(text_parts) + return ' '.join(text_parts) diff --git a/preprocessor/lib/transcription/processors/__init__.py b/preprocessor/lib/transcription/processors/__init__.py new file mode 100644 index 000000000..2f74aca49 --- /dev/null +++ b/preprocessor/lib/transcription/processors/__init__.py @@ -0,0 +1,7 @@ +from preprocessor.lib.transcription.processors.audio_normalizer import AudioNormalizer +from preprocessor.lib.transcription.processors.episode_info_processor import EpisodeInfoProcessor +from preprocessor.lib.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor +from preprocessor.lib.transcription.processors.sound_separator import SoundEventSeparator +from preprocessor.lib.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer + +__all__ = ['AudioNormalizer', 'EpisodeInfoProcessor', 'NormalizedAudioProcessor', 'SoundEventSeparator', 'TranscriptionUnicodeFixer'] diff --git a/preprocessor/transcription/processors/audio_normalizer.py b/preprocessor/lib/transcription/processors/audio_normalizer.py similarity index 54% rename from preprocessor/transcription/processors/audio_normalizer.py rename to preprocessor/lib/transcription/processors/audio_normalizer.py index e0b46cc32..aaf601f7e 100644 --- a/preprocessor/transcription/processors/audio_normalizer.py +++ b/preprocessor/lib/transcription/processors/audio_normalizer.py @@ -7,24 +7,17 @@ ) from preprocessor.core.base_processor import BaseProcessor -from 
preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.lib.core.logging import ErrorHandlingLogger class AudioNormalizer: SUPPORTED_VIDEO_EXTENSIONS = BaseProcessor.SUPPORTED_VIDEO_EXTENSIONS - def __init__( - self, - input_videos: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - video_files: Optional[List[Path]] = None, - ): + def __init__(self, input_videos: Path, output_dir: Path, logger: ErrorHandlingLogger, video_files: Optional[List[Path]]=None): self.__input_videos: Path = input_videos self.__output_dir: Path = output_dir self.__logger: ErrorHandlingLogger = logger self.__video_files: Optional[List[Path]] = video_files - self.__output_dir.mkdir(parents=True, exist_ok=True) def __call__(self) -> None: @@ -32,68 +25,40 @@ def __call__(self) -> None: for video in self.__video_files: self.__process_video(video) else: - for video in self.__input_videos.rglob("*"): + for video in self.__input_videos.rglob('*'): if video.suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS: self.__process_video(video) def __process_video(self, video: Path) -> None: try: - output_path = self.__output_dir / video.with_suffix(".wav").name - + output_path = self.__output_dir / video.with_suffix('.wav').name if output_path.exists(): return - audio_idx = self.__get_best_audio_stream(video) if audio_idx is None: self.__logger.error(f"Cannot find audio stream for file: '{video}'") return - self.__normalize(video=video, audio_idx=audio_idx, output=output_path) - except Exception as e: - self.__logger.error(f"Error processing video {video}: {e}") + self.__logger.error(f'Error processing video {video}: {e}') def __get_best_audio_stream(self, video: Path) -> Optional[int]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "a", - "-show_entries", "stream=index,bit_rate", - "-of", "json", - str(video), - ] + cmd = ['ffprobe', '-v', 'error', '-select_streams', 'a', '-show_entries', 'stream=index,bit_rate', '-of', 'json', str(video)] result = 
subprocess.run(cmd, capture_output=True, text=True, check=True) - streams = json.loads(result.stdout).get("streams", []) - + streams = json.loads(result.stdout).get('streams', []) if not streams: - self.__logger.error(f"No audio streams found in file: {video}") + self.__logger.error(f'No audio streams found in file: {video}') return None - - best_stream = max(streams, key=lambda s: int(s.get("bit_rate", 0))) - return best_stream["index"] + best_stream = max(streams, key=lambda s: int(s.get('bit_rate', 0))) + return best_stream['index'] def __normalize(self, video: Path, audio_idx: int, output: Path) -> None: - tmp_output = output.with_name(output.stem + "_temp.wav") - - extract_cmd = [ - "ffmpeg", "-y", - "-i", str(video), - "-map", f"0:{audio_idx}", - "-acodec", "pcm_s16le", - "-ar", "48000", - "-ac", "1", - str(output), - ] + tmp_output = output.with_name(output.stem + '_temp.wav') + extract_cmd = ['ffmpeg', '-y', '-i', str(video), '-map', f'0:{audio_idx}', '-acodec', 'pcm_s16le', '-ar', '48000', '-ac', '1', str(output)] subprocess.run(extract_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.__logger.info(f"Converted audio: {output}") - - normalize_cmd = [ - "ffmpeg", "-y", - "-i", str(output), - "-af", "dynaudnorm", - str(tmp_output), - ] + self.__logger.info(f'Converted audio: {output}') + normalize_cmd = ['ffmpeg', '-y', '-i', str(output), '-af', 'dynaudnorm', str(tmp_output)] subprocess.run(normalize_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.__logger.info(f"Normalized audio: {tmp_output}") - + self.__logger.info(f'Normalized audio: {tmp_output}') tmp_output.replace(output) - self.__logger.info(f"Replaced original file with normalized audio: {video} -> {output}") + self.__logger.info(f'Replaced original file with normalized audio: {video} -> {output}') diff --git a/preprocessor/transcription/processors/episode_info_processor.py b/preprocessor/lib/transcription/processors/episode_info_processor.py 
similarity index 56% rename from preprocessor/transcription/processors/episode_info_processor.py rename to preprocessor/lib/transcription/processors/episode_info_processor.py index 866044576..f9ec102b0 100644 --- a/preprocessor/transcription/processors/episode_info_processor.py +++ b/preprocessor/lib/transcription/processors/episode_info_processor.py @@ -6,36 +6,25 @@ Tuple, ) -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.episodes import EpisodeManager class EpisodeInfoProcessor: - def __init__( - self, - jsons_dir: Path, - episodes_info_json: Path, - output_path: Path, - logger: ErrorHandlingLogger, - series_name: str = "", - ): + + def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_path: Path, logger: ErrorHandlingLogger, series_name: str=''): self.__jsons_dir: Path = jsons_dir self.__output_path: Path = output_path self.__logger: ErrorHandlingLogger = logger - if not series_name: series_name = self.__output_path.parent.name.lower() - self.__logger.warning( - f"No series name provided. Using fallback from folder name: '{series_name}'", - ) - + self.__logger.warning(f"No series name provided. 
Using fallback from folder name: '{series_name}'") self.__series_name: str = series_name.lower() self.__output_path.mkdir(parents=True, exist_ok=True) - - self.__episode_manager = EpisodeManager(episodes_info_json, self.__series_name) + self.__episode_manager = EpisodeManager(episodes_info_json, self.__series_name, self.__logger) def __call__(self) -> None: - for transcription_file in self.__jsons_dir.rglob("*.json"): + for transcription_file in self.__jsons_dir.rglob('*.json'): self.__process_file(transcription_file) def __process_file(self, transcription_file: Path) -> None: @@ -43,43 +32,35 @@ def __process_file(self, transcription_file: Path) -> None: transcription = self.__load_transcription(transcription_file) episode_info = self.__episode_manager.parse_filename(transcription_file) if not episode_info: - self.__logger.error(f"Cannot extract episode info from {transcription_file.name}") + self.__logger.error(f'Cannot extract episode info from {transcription_file.name}') return - _, new_json_name = self.__write_episode_json(transcription, episode_info) self.__rename_original_file(transcription_file, new_json_name) - except Exception as e: - self.__logger.error(f"Error processing file {transcription_file}: {e}") + self.__logger.error(f'Error processing file {transcription_file}: {e}') @staticmethod def __load_transcription(path: Path) -> Dict[str, Any]: - with path.open("r", encoding="utf-8") as f: + with path.open('r', encoding='utf-8') as f: return json.load(f) def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: - new_json_name = self.__episode_manager.path_manager.build_filename(episode_info, extension="json") + new_json_name = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') season_dir = self.__output_path / episode_info.season_code() output_path = season_dir / new_json_name output_path.parent.mkdir(parents=True, exist_ok=True) - - result = { - "episode_info": 
EpisodeManager.get_metadata(episode_info), - "segments": transcription.get("segments", []), - } - - with output_path.open("w", encoding="utf-8") as f: + result = {'episode_info': EpisodeManager.get_metadata(episode_info), 'segments': transcription.get('segments', [])} + with output_path.open('w', encoding='utf-8') as f: json.dump(result, f, ensure_ascii=False, indent=4) - - self.__logger.info(f"Created episode info {output_path}.") - return output_path, new_json_name + self.__logger.info(f'Created episode info {output_path}.') + return (output_path, new_json_name) def __rename_original_file(self, original_path: Path, new_name: str) -> None: new_src = original_path.parent / new_name if original_path.name == new_name: - self.__logger.info(f"File {original_path} already has correct name.") + self.__logger.info(f'File {original_path} already has correct name.') elif new_src.exists(): - self.__logger.error(f"Cannot rename {original_path} -> {new_src}, file already exists!") + self.__logger.error(f'Cannot rename {original_path} -> {new_src}, file already exists!') else: original_path.rename(new_src) - self.__logger.info(f"Renamed source transcription file: {original_path} -> {new_src}") + self.__logger.info(f'Renamed source transcription file: {original_path} -> {new_src}') diff --git a/preprocessor/transcription/processors/normalized_audio_processor.py b/preprocessor/lib/transcription/processors/normalized_audio_processor.py similarity index 59% rename from preprocessor/transcription/processors/normalized_audio_processor.py rename to preprocessor/lib/transcription/processors/normalized_audio_processor.py index c304462ae..d3d77395c 100644 --- a/preprocessor/transcription/processors/normalized_audio_processor.py +++ b/preprocessor/lib/transcription/processors/normalized_audio_processor.py @@ -10,15 +10,12 @@ from faster_whisper import WhisperModel import torch -from preprocessor.transcription.whisper_utils import ( - build_transcription_result, - get_language_code, -) 
-from preprocessor.utils.error_handling_logger import ErrorHandlingLogger +from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.transcription.whisper import WhisperUtils class NormalizedAudioProcessor: - SUPPORTED_AUDIO_EXTENSIONS: Tuple[str, str] = (".wav", ".mp3") + SUPPORTED_AUDIO_EXTENSIONS: Tuple[str, str] = ('.wav', '.mp3') def __init__( self, @@ -34,37 +31,36 @@ def __init__( self.__output_dir: Path = output_dir self.__logger: ErrorHandlingLogger = logger self.__audio_files: Optional[List[Path]] = audio_files - self.__language: str = language - self.__input_audios.mkdir(parents=True, exist_ok=True) self.__output_dir.mkdir(parents=True, exist_ok=True) - - if device != "cuda": - raise ValueError(f"Only GPU (cuda) is supported, got device={device}") - - compute_type = "float16" - self.__logger.info(f"Loading Whisper model {model} on {device} with compute_type={compute_type}") - self.__whisper_model = WhisperModel(model, device=device, compute_type=compute_type) + if device != 'cuda': + raise ValueError(f'Only GPU (cuda) is supported, got device={device}') + compute_type = 'float16' + self.__logger.info( + f'Loading Whisper model {model} on {device} with compute_type={compute_type}', + ) + self.__whisper_model = WhisperModel( + model, + device=device, + compute_type=compute_type, + ) def __call__(self) -> None: if self.__audio_files is not None: for audio in self.__audio_files: self.__process_normalized_audio(audio) else: - for audio in self.__input_audios.rglob("*"): + for audio in self.__input_audios.rglob('*'): if audio.suffix.lower() in self.SUPPORTED_AUDIO_EXTENSIONS: self.__process_normalized_audio(audio) def __process_normalized_audio(self, normalized_audio: Path) -> None: try: - output_file = self.__output_dir / normalized_audio.with_suffix(".json").name - + output_file = self.__output_dir / normalized_audio.with_suffix('.json').name if output_file.exists(): return - - language_code = get_language_code(self.__language) - + 
language_code = WhisperUtils.get_language_code(self.__language) segments, info = self.__whisper_model.transcribe( str(normalized_audio), language=language_code, @@ -74,24 +70,23 @@ def __process_normalized_audio(self, normalized_audio: Path) -> None: temperature=0.0, compression_ratio_threshold=None, ) - - result = build_transcription_result(segments, language=info.language) - - for segment_dict in result["segments"]: - segment_dict["temperature"] = 0.0 - - with open(output_file, "w", encoding="utf-8") as f: + result = WhisperUtils.build_transcription_result( + segments, + language=info.language, + ) + for segment_dict in result['segments']: + segment_dict['temperature'] = 0.0 + with open(output_file, 'w', encoding='utf-8') as f: json.dump(result, f, ensure_ascii=False, indent=2) - - self.__logger.info(f"Processed: {normalized_audio}") + self.__logger.info(f'Processed: {normalized_audio}') except Exception as e: - self.__logger.error(f"Error processing file {normalized_audio}: {e}") + self.__logger.error(f'Error processing file {normalized_audio}: {e}') def cleanup(self) -> None: - self.__logger.info("Unloading Whisper model and clearing GPU memory...") + self.__logger.info('Unloading Whisper model and clearing GPU memory...') if hasattr(self, '_NormalizedAudioProcessor__whisper_model'): del self.__whisper_model gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - self.__logger.info("Whisper model unloaded, GPU memory cleared") + self.__logger.info('Whisper model unloaded, GPU memory cleared') diff --git a/preprocessor/lib/transcription/processors/sound_separator.py b/preprocessor/lib/transcription/processors/sound_separator.py new file mode 100644 index 000000000..13ec57d69 --- /dev/null +++ b/preprocessor/lib/transcription/processors/sound_separator.py @@ -0,0 +1,267 @@ +import json +from pathlib import Path +import re +from typing import ( + Any, + Dict, + List, + Tuple, +) + +from preprocessor.config.config import settings +from 
class SoundEventSeparator(BaseProcessor):
    """Split segmented transcriptions into dialogue and sound-event outputs.

    For every ``*_segmented.json`` file found below a ``raw`` directory the
    processor classifies each segment with ``classify_segment`` and writes
    JSON, TXT and SRT files for the dialogue ("clean") part and for the
    sound-event part into sibling sub-directories of the episode directory.
    """

    def __init__(self, args: Dict[str, Any]) -> None:
        """Configure the processor from the argument dictionary.

        Reads ``loglevel`` (default 20), ``transcription_dir`` (defaults to
        the per-series transcription output directory) and
        ``episodes_info_json``.
        """
        super().__init__(
            args=args,
            class_name=self.__class__.__name__,
            error_exit_code=2,
            loglevel=args.get('loglevel', 20),
        )
        self.transcription_dir = Path(
            self._args.get('transcription_dir', settings.transcription.get_output_dir(self.series_name)),
        )
        episodes_info_json = self._args.get('episodes_info_json')
        self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, self.logger)

    def _validate_args(self, args: Dict[str, Any]) -> None:
        """Accept any arguments; this processor performs no validation."""

    def get_output_subdir(self) -> str:
        """Return the configured transcriptions output sub-directory."""
        return settings.output_subdirs.transcriptions

    def _get_processing_items(self) -> List[ProcessingItem]:
        """Build one processing item per parseable ``*_segmented.json`` file.

        Files whose name cannot be mapped to an episode are logged and skipped.
        """
        items: List[ProcessingItem] = []
        for trans_file in self.transcription_dir.rglob('**/raw/*_segmented.json'):
            episode_info = self.episode_manager.parse_filename(trans_file)
            if not episode_info:
                self.logger.warning(f'Cannot parse episode info from {trans_file.name}')
                continue
            episode_id = EpisodeManager.get_episode_id_for_state(episode_info)
            items.append(
                ProcessingItem(
                    episode_id=episode_id,
                    input_path=trans_file,
                    metadata={'episode_info': episode_info},
                ),
            )
        return items

    def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]:
        """List the eight files that must exist for ``item`` to count as done.

        NOTE(review): these names are built from ``FILE_SUFFIXES`` /
        ``FILE_EXTENSIONS`` while ``_process_item`` writes hard-coded names;
        the two only agree if the constants match those literals - confirm.
        """
        base_name = item.input_path.stem.replace(FILE_SUFFIXES['segmented'], '')
        episode_dir = item.input_path.parent.parent
        clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean
        sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events
        clean_json = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}"
        sound_json = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}"
        clean_segmented_json = clean_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}"
        sound_segmented_json = sound_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}"
        clean_txt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}"
        sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}"
        clean_srt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}"
        sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}"
        return [
            OutputSpec(path=clean_json, required=True),
            OutputSpec(path=sound_json, required=True),
            OutputSpec(path=clean_segmented_json, required=True),
            OutputSpec(path=sound_segmented_json, required=True),
            OutputSpec(path=clean_txt, required=True),
            OutputSpec(path=sound_txt, required=True),
            OutputSpec(path=clean_srt, required=True),
            OutputSpec(path=sound_srt, required=True),
        ]

    def _process_item(  # pylint: disable=too-many-locals
        self, item: ProcessingItem, missing_outputs: List[OutputSpec],
    ) -> None:
        """Separate one segmented transcription and write all output files.

        ``missing_outputs`` is accepted for interface compatibility and is not
        consulted: every output is regenerated on each call.
        """
        with open(item.input_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        episode_info = data.get('episode_info', {})

        dialogue_segments: List[Dict[str, Any]] = []
        sound_event_segments: List[Dict[str, Any]] = []
        for segment in data.get('segments', []):
            classification = classify_segment(segment)
            if classification == 'dialogue':
                dialogue_segments.append(self.__clean_segment_text(segment))
            elif classification == 'sound_event':
                sound_event_segments.append(self.__enrich_sound_event(self.__clean_segment_text(segment)))
            elif classification == 'mixed':
                dialogue_parts, sound_parts = self.__split_mixed_segment(segment)
                dialogue_segments.extend(dialogue_parts)
                sound_event_segments.extend(self.__enrich_sound_event(s) for s in sound_parts)
        dialogue_segments = self.__renumber_segments(dialogue_segments)
        sound_event_segments = self.__renumber_segments(sound_event_segments)

        base_name = item.input_path.stem.replace(FILE_SUFFIXES['segmented'], '')
        episode_dir = item.input_path.parent.parent
        clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean
        sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events
        clean_dir.mkdir(parents=True, exist_ok=True)
        sound_dir.mkdir(parents=True, exist_ok=True)
        clean_json = clean_dir / f'{base_name}_clean_transcription.json'
        sound_json = sound_dir / f'{base_name}_sound_events.json'
        clean_segmented_json = clean_dir / f'{base_name}_segmented_clean.json'
        sound_segmented_json = sound_dir / f'{base_name}_segmented_sound_events.json'
        clean_txt = clean_dir / f'{base_name}_clean_transcription.txt'
        sound_txt = sound_dir / f'{base_name}_sound_events.txt'
        clean_srt = clean_dir / f'{base_name}_clean_transcription.srt'
        sound_srt = sound_dir / f'{base_name}_sound_events.srt'
        raw_txt = episode_dir / settings.output_subdirs.transcription_subdirs.raw / f'{base_name}.txt'

        # The "simple" files drop the word lists; the "segmented" files keep them.
        json_outputs = (
            (clean_json, self.__convert_to_simple_format(dialogue_segments)),
            (sound_json, self.__convert_to_simple_format(sound_event_segments)),
            (clean_segmented_json, dialogue_segments),
            (sound_segmented_json, sound_event_segments),
        )
        for json_path, payload_segments in json_outputs:
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(
                    {'episode_info': episode_info, 'segments': payload_segments},
                    f,
                    ensure_ascii=False,
                    indent=4,
                )

        self.__generate_txt_files(raw_txt, clean_txt, sound_txt)
        self.__generate_srt_files(dialogue_segments, sound_event_segments, clean_srt, sound_srt)
        self.logger.info(f'Separated {item.episode_id}: {len(dialogue_segments)} dialogue, {len(sound_event_segments)} sound events')

    def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """Split a segment mixing speech and sound events into homogeneous runs.

        Consecutive words of the same kind form one run. Spacing words are
        attached to the run in progress; spacing before the first real word is
        dropped.

        Returns:
            ``(dialogue_segments, sound_segments)``.
        """
        words = segment.get('words', [])
        dialogue_sequences: List[Dict[str, Any]] = []
        sound_sequences: List[Dict[str, Any]] = []
        current_type = None
        current_words: List[Dict[str, Any]] = []
        for word in words:
            if word.get(WordKeys.TYPE) == WordTypeValues.SPACING:
                if current_words:
                    current_words.append(word)
                continue
            word_type = 'sound' if is_sound_event(word) else 'dialogue'
            if word_type != current_type:
                # Kind changed: close the run in progress and start a new one.
                if current_words:
                    self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment)
                current_type = word_type
                current_words = [word]
            else:
                current_words.append(word)
        if current_words:
            self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment)
        return (dialogue_sequences, sound_sequences)

    @staticmethod
    def __time_bounds(words: List[Dict], fallback_start: Any = None, fallback_end: Any = None) -> Tuple[Any, Any]:
        """Return ``(earliest start, latest end)`` over ``words``.

        Words whose timestamp is ``None`` are ignored rather than counted as
        0, so one untimed word cannot drag the start to the beginning of the
        recording. When no word has a timestamp the fallback is used, and 0
        when the fallback is missing too.
        """
        starts = [w['start'] for w in words if w.get('start') is not None]
        ends = [w['end'] for w in words if w.get('end') is not None]
        start = min(starts) if starts else (fallback_start or 0)
        end = max(ends) if ends else (fallback_end or 0)
        return (start, end)

    @staticmethod
    def __finalize_sequence(
        seq_type: str,
        words: List[Dict],
        dialogue_sequences: List[Dict],
        sound_sequences: List[Dict],
        original_segment: Dict[str, Any],
    ) -> None:
        """Turn a run of words into a segment and append it to the right list.

        Runs that are empty or consist only of spacing are discarded. Every
        key of ``original_segment`` other than text/start/end/words is copied
        onto the new segment.
        """
        if not words:
            return
        if all(w.get(WordKeys.TYPE) == WordTypeValues.SPACING for w in words):
            return
        text = re.sub(r'\s+', ' ', ''.join(w.get('text', '') for w in words)).strip()
        start_time, end_time = SoundEventSeparator.__time_bounds(
            words,
            original_segment.get('start'),
            original_segment.get('end'),
        )
        new_segment = {'text': text, 'start': start_time, 'end': end_time, 'words': words}
        for key, value in original_segment.items():
            if key not in ('text', 'start', 'end', 'words'):
                new_segment[key] = value
        if seq_type == 'dialogue':
            dialogue_sequences.append(new_segment)
        else:
            sound_sequences.append(new_segment)

    @staticmethod
    def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy with collapsed whitespace and filled-in times.

        When the segment lacks ``start`` or ``end``, both are recomputed from
        the timestamps of its words (where available).
        """
        cleaned = segment.copy()
        if 'text' in cleaned:
            cleaned['text'] = re.sub(r'\s+', ' ', cleaned['text']).strip()
        if cleaned.get('start') is None or cleaned.get('end') is None:
            words = cleaned.get('words', [])
            starts = [w['start'] for w in words if w.get('start') is not None]
            ends = [w['end'] for w in words if w.get('end') is not None]
            if starts:
                cleaned['start'] = min(starts)
            if ends:
                cleaned['end'] = max(ends)
        return cleaned

    @staticmethod
    def __enrich_sound_event(segment: Dict[str, Any]) -> Dict[str, Any]:
        """Return a shallow copy of ``segment`` tagged with ``sound_type``."""
        enriched = segment.copy()
        enriched['sound_type'] = 'sound'
        return enriched

    @staticmethod
    def __renumber_segments(segments: List[Dict]) -> List[Dict]:
        """Assign consecutive zero-based ``id`` values in place and return the list."""
        for i, segment in enumerate(segments):
            segment['id'] = i
        return segments

    @staticmethod
    def __convert_to_simple_format(segments: List[Dict]) -> List[Dict]:
        """Project segments onto id/text/start/end (plus ``sound_type`` if set)."""
        simple_segments = []
        for seg in segments:
            simple_seg = {
                'id': seg.get('id'),
                'text': seg.get('text', ''),
                'start': seg.get('start') or 0.0,
                'end': seg.get('end') or 0.0,
            }
            if 'sound_type' in seg:
                simple_seg['sound_type'] = seg['sound_type']
            simple_segments.append(simple_seg)
        return simple_segments

    def __generate_txt_files(self, original_txt: Path, clean_txt: Path, sound_txt: Path) -> None:
        """Derive the clean and sound-event TXT files from the raw TXT file.

        Parenthesised spans are treated as sound events: they are removed from
        the clean text and collected, space-separated, into the sound text.
        Nothing is written when the raw file is missing (a warning is logged).
        """
        if not original_txt.exists():
            self.logger.warning(f'Original TXT file not found: {original_txt}')
            return
        with open(original_txt, 'r', encoding='utf-8') as f:
            original_content = f.read()
        clean_content = re.sub(r'\([^)]*\)', '', original_content)
        clean_content = re.sub(r'\s+', ' ', clean_content).strip()
        sound_content = ' '.join(re.findall(r'\([^)]*\)', original_content))
        with open(clean_txt, 'w', encoding='utf-8') as f:
            f.write(clean_content)
        with open(sound_txt, 'w', encoding='utf-8') as f:
            f.write(sound_content)

    @staticmethod
    def __generate_srt_files(dialogue_segments: List[Dict], sound_segments: List[Dict], clean_srt: Path, sound_srt: Path) -> None:
        """Write SubRip files for the dialogue and the sound-event segments."""

        def format_timestamp(seconds: float) -> str:
            # Work in whole milliseconds: truncating the float remainder would
            # render e.g. 1.001 s as ",000" because of binary rounding error.
            total_ms = max(0, int(round(seconds * 1000)))
            hours, remainder = divmod(total_ms, 3_600_000)
            minutes, remainder = divmod(remainder, 60_000)
            secs, millis = divmod(remainder, 1000)
            return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}'

        def _write_srt(segments: List[Dict], output_path: Path) -> None:
            # Count only the cues actually written so numbering stays
            # consecutive even when segments are skipped.
            cue_number = 0
            with open(output_path, 'w', encoding='utf-8') as f:
                for seg in segments:
                    words = seg.get('words', [])
                    text = seg.get('text', '').strip()
                    if not text or not words:
                        continue
                    non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING]
                    if not non_spacing_words:
                        continue
                    start_time, end_time = SoundEventSeparator.__time_bounds(
                        non_spacing_words,
                        seg.get('start'),
                        seg.get('end'),
                    )
                    cue_number += 1
                    f.write(f'{cue_number}\n')
                    f.write(f'{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n')
                    f.write(f'{text}\n\n')

        _write_srt(dialogue_segments, clean_srt)
        _write_srt(sound_segments, sound_srt)

    def _get_progress_description(self) -> str:
        """Return the label shown while this processor runs."""
        return 'Separating sound events from dialogues'
preprocessor.utils.transcription_utils import fix_transcription_file_unicode +from preprocessor.lib.episodes import EpisodeManager +from preprocessor.lib.transcription.utils import TranscriptionUtils class TranscriptionUnicodeFixer(BaseProcessor): + def __init__(self, args: Dict[str, Any]) -> None: super().__init__( args=args, class_name=self.__class__.__name__, error_exit_code=2, - loglevel=args.get("loglevel", 20), + loglevel=args.get('loglevel', 20), ) - + default_dir = settings.transcription.get_output_dir(self.series_name) self.transcription_jsons = Path( - self._args.get("transcription_jsons", settings.transcription.get_output_dir(self.series_name)), + self._args.get('transcription_jsons', default_dir), ) - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) + episodes_info_json = self._args.get('episodes_info_json') + self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, self.logger) def _validate_args(self, args: Dict[str, Any]) -> None: - pass + ... 
def get_output_subdir(self) -> str: return settings.output_subdirs.transcriptions def _get_processing_items(self) -> List[ProcessingItem]: - transcription_files = list(self.transcription_jsons.rglob("*.json")) - + transcription_files = list(self.transcription_jsons.rglob('*.json')) return [ ProcessingItem( - episode_id=f"unicode_fix_{i}", + episode_id=f'unicode_fix_{i}', input_path=trans_file, - metadata={"file": trans_file}, + metadata={'file': trans_file}, ) for i, trans_file in enumerate(transcription_files) ] @@ -52,13 +52,12 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: return [OutputSpec(path=item.input_path, required=True)] def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - trans_file = item.metadata["file"] - + trans_file = item.metadata['file'] try: - was_fixed = fix_transcription_file_unicode(trans_file) + was_fixed = TranscriptionUtils.fix_transcription_file_unicode(trans_file) if was_fixed: - self.logger.info(f"Fixed unicode escapes in: {trans_file.name}") + self.logger.info(f'Fixed unicode escapes in: {trans_file.name}') else: - self.logger.debug(f"No unicode escapes found in: {trans_file.name}") + self.logger.debug(f'No unicode escapes found in: {trans_file.name}') except Exception as e: - self.logger.error(f"Error fixing unicode in {trans_file.name}: {e}") + self.logger.error(f'Error fixing unicode in {trans_file.name}: {e}') diff --git a/preprocessor/lib/transcription/sound_classification.py b/preprocessor/lib/transcription/sound_classification.py new file mode 100644 index 000000000..6b22f8076 --- /dev/null +++ b/preprocessor/lib/transcription/sound_classification.py @@ -0,0 +1,37 @@ +import re +from typing import ( + Any, + Dict, +) + +from preprocessor.config.types import ( + WordKeys, + WordTypeValues, +) + + +def is_sound_event(word: Dict[str, Any]) -> bool: + if word.get(WordKeys.TYPE) == WordTypeValues.AUDIO_EVENT: + return True + text = word.get(WordKeys.TEXT, 
class TranscriptionUtils:
    """File and word-list helpers for transcription JSON documents."""

    @staticmethod
    def fix_unicode(file_path: Path) -> None:
        """Rewrite a JSON file with literal (non-escaped) characters.

        The file is re-serialised with ``ensure_ascii=False`` and a two-space
        indent. A missing file is silently ignored.
        """
        if not file_path.exists():
            return
        with open(file_path, 'r', encoding='utf-8') as source:
            document: Dict[str, Any] = json.load(source)
        with open(file_path, 'w', encoding='utf-8') as target:
            json.dump(document, target, ensure_ascii=False, indent=2)

    @staticmethod
    def fix_transcription_file_unicode(file_path: Path) -> bool:
        """Normalise a JSON file and report whether its content changed.

        Returns:
            ``True`` when the re-serialised text differed from what was on
            disk and the file was rewritten; ``False`` when the file is
            missing or already in normalised form.
        """
        if not file_path.exists():
            return False
        current_text = file_path.read_text(encoding='utf-8')
        document: Dict[str, Any] = json.loads(current_text)
        normalized_text = json.dumps(document, ensure_ascii=False, indent=2)
        if normalized_text == current_text:
            return False
        file_path.write_text(normalized_text, encoding='utf-8')
        return True

    @staticmethod
    def convert_words_list(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Map word dictionaries onto the word/start/end/probability/speaker_id layout.

        ``text`` takes precedence over ``word`` and ``probability`` over
        ``confidence``; absent values fall back to ``''``, ``0.0``, ``1.0``
        and ``'speaker_unknown'`` respectively.
        """
        converted: List[Dict[str, Any]] = []
        for entry in words:
            token = entry['text'] if 'text' in entry else entry.get('word', '')
            if 'probability' in entry:
                score = entry['probability']
            else:
                score = entry.get('confidence', 1.0)
            converted.append({
                'word': token,
                'start': entry.get('start', 0.0),
                'end': entry.get('end', 0.0),
                'probability': score,
                'speaker_id': entry.get('speaker_id', 'speaker_unknown'),
            })
        return converted
class WhisperUtils:
    """Helpers that turn Whisper segments into plain dictionaries."""

    # Full language names mapped to the short codes used for transcription.
    LANGUAGE_MAP: Dict[str, str] = {
        'polish': 'pl',
        'english': 'en',
        'german': 'de',
        'french': 'fr',
        'spanish': 'es',
    }

    @staticmethod
    def get_language_code(language: str) -> str:
        """Return the short code for ``language``.

        The lookup is case-insensitive; names not present in ``LANGUAGE_MAP``
        are returned lower-cased and otherwise unchanged.
        """
        lowered = language.lower()
        if lowered in WhisperUtils.LANGUAGE_MAP:
            return WhisperUtils.LANGUAGE_MAP[lowered]
        return lowered

    @staticmethod
    def __process_segment(segment: Any) -> Dict[str, Any]:
        """Convert one segment object into a dictionary.

        ``seek`` and ``tokens`` are always emitted as ``0`` and ``[]``; word
        entries are included only when the segment exposes a non-empty
        ``words`` attribute.
        """
        has_words = hasattr(segment, 'words') and segment.words
        source_words = segment.words if has_words else []
        word_entries = [
            {
                'word': item.word,
                'start': item.start,
                'end': item.end,
                'probability': item.probability,
            }
            for item in source_words
        ]
        return {
            'id': segment.id,
            'seek': 0,
            'start': segment.start,
            'end': segment.end,
            'text': segment.text,
            'tokens': [],
            'avg_logprob': segment.avg_logprob,
            'compression_ratio': segment.compression_ratio,
            'no_speech_prob': segment.no_speech_prob,
            'words': word_entries,
        }

    @staticmethod
    def build_transcription_result(segments: Any, language: str=None) -> Dict[str, Any]:
        """Assemble the transcription dictionary from an iterable of segments.

        ``segments`` is consumed exactly once, so a generator is fine. The
        result holds the concatenated ``text``, the per-segment dictionaries
        and, when ``language`` is truthy, a ``language`` entry.
        """
        segment_dicts = []
        text_parts = []
        for segment in segments:
            segment_dicts.append(WhisperUtils.__process_segment(segment))
            text_parts.append(segment.text)
        result: Dict[str, Any] = {'text': ''.join(text_parts), 'segments': segment_dicts}
        if language:
            result['language'] = language
        return result
int = beam_size + self.temperature: float = temperature + self._model: Optional[WhisperModel] = None + + def _load_model(self) -> WhisperModel: + if self._model is not None: + return self._model + if self.device != 'cuda': + raise ValueError(f'Only GPU (cuda) is supported, got device={self.device}') + compute_type = 'float16' + console.print(f'[cyan]Loading Whisper model: {self.model_name} on {self.device} with compute_type={compute_type}[/cyan]') + self._model = WhisperModel(self.model_name, device=self.device, compute_type=compute_type) + console.print('[green]✓ Whisper model loaded[/green]') + return self._model + + def transcribe(self, audio_path: Path) -> Dict[str, Any]: + console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') + if not audio_path.exists(): + raise FileNotFoundError(f'Audio file not found: {audio_path}') + model = self._load_model() + language_code = WhisperUtils.get_language_code(self.language) + segments, info = model.transcribe( + str(audio_path), + language=language_code, + beam_size=self.beam_size, + word_timestamps=True, + condition_on_previous_text=False, + temperature=self.temperature, + ) + result = WhisperUtils.build_transcription_result(segments, language=info.language) + console.print(f'[green]✓ Transcription completed: {audio_path.name}[/green]') + return result + + def cleanup(self) -> None: + console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') + if self._model is not None: + del self._model + self._model = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') diff --git a/preprocessor/lib/ui/__init__.py b/preprocessor/lib/ui/__init__.py new file mode 100644 index 000000000..cf8c5a9c5 --- /dev/null +++ b/preprocessor/lib/ui/__init__.py @@ -0,0 +1,10 @@ +from preprocessor.lib.ui.console import ( + SimpleProgress, + console, +) +from preprocessor.lib.ui.progress import ( + OperationTracker, + 
ProgressTracker, +) + +__all__ = ['console', 'SimpleProgress', 'ProgressTracker', 'OperationTracker'] diff --git a/preprocessor/utils/console.py b/preprocessor/lib/ui/console.py similarity index 72% rename from preprocessor/utils/console.py rename to preprocessor/lib/ui/console.py index a23e4718e..b3fb82251 100644 --- a/preprocessor/utils/console.py +++ b/preprocessor/lib/ui/console.py @@ -4,25 +4,27 @@ from rich.console import Console -from preprocessor.utils.time_utils import format_time_hms +from preprocessor.lib.core.time import TimeFormatter _console_instance = None - def _get_console() -> Console: global _console_instance # pylint: disable=global-statement if _console_instance is None: - in_docker = os.path.exists('/.dockerenv') or os.getenv('DOCKER_CONTAINER', 'false') == 'true' - + in_docker = ( + os.path.exists('/.dockerenv') or + os.getenv('DOCKER_CONTAINER', 'false') == 'true' + ) + color_system = 'standard' if in_docker else 'auto' _console_instance = Console( force_terminal=True, file=sys.stderr, - color_system="standard" if in_docker else "auto", + color_system=color_system, ) return _console_instance - class SimpleProgress: + def __init__(self): self.tasks = {} self.task_counter = 0 @@ -41,13 +43,11 @@ def add_task(self, description: str, total: int): self.__print_progress(task_id) return task_id - def advance(self, task_id: int, advance: int = 1): + def advance(self, task_id: int, advance: int=1): if task_id not in self.tasks: return - task = self.tasks[task_id] task['completed'] += advance - current_time = time.time() if current_time - task['last_print'] >= 1.0 or task['completed'] >= task['total']: self.__print_progress(task_id) @@ -57,20 +57,21 @@ def __print_progress(self, task_id: int): task = self.tasks[task_id] completed = task['completed'] total = task['total'] - percent = (completed / total * 100) if total > 0 else 0 - + percent = completed / total * 100 if total > 0 else 0 elapsed = time.time() - task['start_time'] if 0 < completed < 
total: - eta_seconds = (elapsed / completed) * (total - completed) - eta = format_time_hms(eta_seconds) + eta_seconds = elapsed / completed * (total - completed) + eta = TimeFormatter.format_hms(eta_seconds) elif completed >= total: - eta = "0:00:00" + eta = '0:00:00' else: - eta = "-:--:--" - + eta = '-:--:--' bar_width = 30 filled = int(bar_width * completed / total) if total > 0 else 0 - progress_bar = "━" * filled + "╸" + "─" * (bar_width - filled - 1) if filled < bar_width else "━" * bar_width + if filled < bar_width: + progress_bar = '━' * filled + '╸' + '─' * (bar_width - filled - 1) + else: + progress_bar = '━' * bar_width console.print( f"[bold blue]{task['description']}[/bold blue] " @@ -86,10 +87,4 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): pass - - -def create_progress() -> SimpleProgress: - return SimpleProgress() - - console = _get_console() diff --git a/preprocessor/utils/progress_tracker.py b/preprocessor/lib/ui/progress.py similarity index 53% rename from preprocessor/utils/progress_tracker.py rename to preprocessor/lib/ui/progress.py index b5ff79d18..bb61b86ea 100644 --- a/preprocessor/utils/progress_tracker.py +++ b/preprocessor/lib/ui/progress.py @@ -2,14 +2,12 @@ import time from typing import Optional -from preprocessor.utils.console import console -from preprocessor.utils.time_utils import ( - format_time_hms, - format_time_human, -) +from preprocessor.lib.core.time import TimeFormatter +from preprocessor.lib.ui.console import console class ProgressTracker: + def __init__(self): self.current_operation: Optional[str] = None self.start_time: Optional[float] = None @@ -18,26 +16,17 @@ def __init__(self): def track_operation(self, operation_name: str, total: int): self.current_operation = operation_name self.start_time = time.time() - console.print(f" [cyan]{operation_name} (total: {total})...[/cyan]") - - tracker = OperationTracker( - operation_name=operation_name, - total=total, - start_time=self.start_time, - ) - + 
console.print(f' [cyan]{operation_name} (total: {total})...[/cyan]') + tracker = OperationTracker(operation_name=operation_name, total=total, start_time=self.start_time) try: yield tracker finally: if tracker.completed > 0: elapsed = time.time() - self.start_time - console.print( - f" [green]✓ {operation_name} completed: " - f"{tracker.completed}/{total} in {format_time_human(elapsed)}[/green]", - ) - + console.print(f' [green]✓ {operation_name} completed: {tracker.completed}/{total} in {TimeFormatter.format_human(elapsed)}[/green]') class OperationTracker: + def __init__(self, operation_name: str, total: int, start_time: float): self.operation_name = operation_name self.total = total @@ -45,34 +34,23 @@ def __init__(self, operation_name: str, total: int, start_time: float): self.start_time = start_time self.last_report = 0 - def update(self, completed: int, interval: int = 10): + def update(self, completed: int, interval: int=10): self.completed = completed - - should_report = ( - completed % interval == 0 or - completed == self.total or - completed == 1 - ) - + should_report = completed % interval == 0 or completed == self.total or completed == 1 if should_report and completed != self.last_report: self.__report_progress() self.last_report = completed def __report_progress(self): elapsed = time.time() - self.start_time - percent = (self.completed / self.total * 100) if self.total > 0 else 0 - + percent = self.completed / self.total * 100 if self.total > 0 else 0 if 0 < self.completed < self.total: rate = self.completed / elapsed if elapsed > 0 else 0 remaining = self.total - self.completed eta_seconds = remaining / rate if rate > 0 else 0 - eta = format_time_hms(eta_seconds) if eta_seconds > 0 else "0:00:00" + eta = TimeFormatter.format_hms(eta_seconds) if eta_seconds > 0 else '0:00:00' elif self.completed >= self.total: - eta = "0:00:00" + eta = '0:00:00' else: - eta = "-:--:--" - - console.print( - f" [dim]{self.operation_name}: {self.completed}/{self.total} " - 
f"({percent:.0f}%) ETA: {eta}[/dim]", - ) + eta = '-:--:--' + console.print(f' [dim]{self.operation_name}: {self.completed}/{self.total} ({percent:.0f}%) ETA: {eta}[/dim]') diff --git a/preprocessor/lib/validation/__init__.py b/preprocessor/lib/validation/__init__.py new file mode 100644 index 000000000..154276d32 --- /dev/null +++ b/preprocessor/lib/validation/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.lib.validation.file_validators import FileValidator + +__all__ = ['FileValidator'] diff --git a/preprocessor/validation/base_result.py b/preprocessor/lib/validation/base_result.py similarity index 68% rename from preprocessor/validation/base_result.py rename to preprocessor/lib/validation/base_result.py index a752cc7d7..662f3059b 100644 --- a/preprocessor/validation/base_result.py +++ b/preprocessor/lib/validation/base_result.py @@ -16,11 +16,10 @@ class ValidationStatusMixin: @property def status(self) -> str: if self.errors: - return "FAIL" + return 'FAIL' if self.warnings: - return "WARNING" - return "PASS" - + return 'WARNING' + return 'PASS' @dataclass class BaseValidationResult(ValidationStatusMixin): @@ -29,9 +28,4 @@ class BaseValidationResult(ValidationStatusMixin): stats: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: - return { - "status": self.status, - "errors": self.errors, - "warnings": self.warnings, - "stats": self.stats, - } + return {'status': self.status, 'errors': self.errors, 'warnings': self.warnings, 'stats': self.stats} diff --git a/preprocessor/lib/validation/file_validators.py b/preprocessor/lib/validation/file_validators.py new file mode 100644 index 000000000..4ff1dfbb0 --- /dev/null +++ b/preprocessor/lib/validation/file_validators.py @@ -0,0 +1,173 @@ +from dataclasses import dataclass +import json +from pathlib import Path +import subprocess +from typing import ( + Any, + Dict, + Optional, +) +import zipfile + +from PIL import Image + +from preprocessor.config.constants import 
@dataclass
class ValidationResult:
    """Outcome of validating a single file."""

    # True when the file passed every check of the validator that produced it.
    is_valid: bool
    # Human-readable failure reason; left as None on success.
    error_message: Optional[str] = None
    # Validator-specific facts about the file (size, dimensions, ...).
    metadata: Optional[Dict[str, Any]] = None


class FileValidator:
    """Static validators for the file types produced by the pipeline.

    Every validator returns a ``ValidationResult`` and never raises: any
    failure is reported through ``is_valid=False`` and ``error_message``.
    """

    @staticmethod
    def _check_file_exists(path: Path) -> Optional[ValidationResult]:
        """Return a failure result when ``path`` does not exist, else ``None``."""
        if not path.exists():
            return ValidationResult(is_valid=False, error_message=f'File does not exist: {path}')
        return None

    @staticmethod
    def validate_json_file(path: Path) -> ValidationResult:
        """Check that ``path`` parses as JSON; metadata carries the byte size."""
        if (error := FileValidator._check_file_exists(path)):
            return error
        try:
            with open(path, 'r', encoding='utf-8') as f:
                json.load(f)
            return ValidationResult(
                is_valid=True,
                metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size},
            )
        except json.JSONDecodeError as e:
            return ValidationResult(is_valid=False, error_message=f'Invalid JSON: {e}')
        except Exception as e:
            return ValidationResult(is_valid=False, error_message=f'Error reading file: {e}')

    @staticmethod
    def validate_jsonl_file(path: Path) -> ValidationResult:
        """Check that every non-blank line of ``path`` parses as JSON.

        Validation stops at the first bad line, whose 1-based number is
        reported. On success the metadata carries the byte size and the
        number of non-blank lines.
        """
        if (error := FileValidator._check_file_exists(path)):
            return error
        try:
            line_count = 0
            with open(path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        json.loads(line)
                        line_count += 1
                    except json.JSONDecodeError as e:
                        return ValidationResult(
                            is_valid=False,
                            error_message=f'Invalid JSON at line {line_num}: {e}',
                        )
            return ValidationResult(
                is_valid=True,
                metadata={
                    ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size,
                    ValidationMetadataKeys.LINE_COUNT: line_count,
                },
            )
        except Exception as e:
            return ValidationResult(is_valid=False, error_message=f'Error reading file: {e}')

    @staticmethod
    def validate_image_file(path: Path) -> ValidationResult:
        """Check that ``path`` is a readable image; metadata has size and format."""
        if (error := FileValidator._check_file_exists(path)):
            return error
        try:
            with Image.open(path) as img:
                img.verify()
            # The image is opened a second time to read its properties after
            # the integrity check.
            with Image.open(path) as img:
                width, height = img.size
                format_type = img.format
            size_mb = path.stat().st_size / (1024 * 1024)
            return ValidationResult(
                is_valid=True,
                metadata={
                    ValidationMetadataKeys.WIDTH: width,
                    ValidationMetadataKeys.HEIGHT: height,
                    ValidationMetadataKeys.FORMAT: format_type,
                    ValidationMetadataKeys.SIZE_MB: round(size_mb, 2),
                },
            )
        except Exception as e:
            return ValidationResult(is_valid=False, error_message=f'Invalid image: {e}')

    @staticmethod
    def validate_video_file(path: Path) -> ValidationResult:
        """Probe ``path`` with ``ffprobe`` and report its first video stream.

        The duration is taken from the stream and falls back to the container
        duration. A file without any video stream is reported as invalid with
        an explicit message.
        """
        if (error := FileValidator._check_file_exists(path)):
            return error
        try:
            result = subprocess.run(
                [
                    'ffprobe', '-v', 'error', '-select_streams', 'v:0',
                    '-show_entries', 'stream=codec_name,width,height,duration',
                    '-show_entries', 'format=duration,size',
                    '-of', 'json', str(path),
                ],
                capture_output=True,
                text=True,
                check=True,
            )
            probe_data = json.loads(result.stdout)
            streams = probe_data.get(FfprobeKeys.STREAMS) or []
            if not streams:
                # Without this guard an empty stream list surfaced as the
                # opaque "list index out of range".
                return ValidationResult(is_valid=False, error_message=f'No video stream found: {path}')
            stream = streams[0]
            format_info = probe_data.get(FfprobeKeys.FORMAT, {})
            stream_duration = stream.get(FfprobeStreamKeys.DURATION)
            format_duration = format_info.get(FfprobeFormatKeys.DURATION, 0)
            duration = float(stream_duration or format_duration)
            size_bytes = int(format_info.get(FfprobeFormatKeys.SIZE, 0))
            size_mb = size_bytes / (1024 * 1024)
            return ValidationResult(
                is_valid=True,
                metadata={
                    ValidationMetadataKeys.CODEC: stream.get(FfprobeStreamKeys.CODEC_NAME),
                    ValidationMetadataKeys.WIDTH: stream.get(FfprobeStreamKeys.WIDTH),
                    ValidationMetadataKeys.HEIGHT: stream.get(FfprobeStreamKeys.HEIGHT),
                    ValidationMetadataKeys.DURATION: round(duration, 2),
                    ValidationMetadataKeys.SIZE_MB: round(size_mb, 2),
                },
            )
        except subprocess.CalledProcessError as e:
            return ValidationResult(is_valid=False, error_message=f'ffprobe error: {e.stderr}')
        except Exception as e:
            return ValidationResult(is_valid=False, error_message=f'Error validating video: {e}')

    @staticmethod
    def validate_archive_file(path: Path) -> ValidationResult:
        """Check the integrity of a ZIP archive and summarise its contents.

        ``compression_ratio`` is the percentage of space saved, and 0 for an
        archive whose members are all empty.
        """
        if (error := FileValidator._check_file_exists(path)):
            return error
        try:
            with zipfile.ZipFile(path, 'r') as zip_ref:
                bad_file = zip_ref.testzip()
                if bad_file:
                    return ValidationResult(is_valid=False, error_message=f'Corrupt file in archive: {bad_file}')
                # Read the directory once and derive every figure from it.
                entries = zip_ref.infolist()
                file_count = len(entries)
                compressed_size = sum(info.compress_size for info in entries)
                uncompressed_size = sum(info.file_size for info in entries)
                compression_ratio = 0
                if uncompressed_size > 0:
                    compression_ratio = (1 - compressed_size / uncompressed_size) * 100
                return ValidationResult(
                    is_valid=True,
                    metadata={
                        ValidationMetadataKeys.SIZE_MB: round(
                            path.stat().st_size / (1024 * 1024), 2,
                        ),
                        'file_count': file_count,
                        'compressed_size_mb': round(compressed_size / (1024 * 1024), 2),
                        'uncompressed_size_mb': round(uncompressed_size / (1024 * 1024), 2),
                        'compression_ratio': round(compression_ratio, 2),
                    },
                )
        except zipfile.BadZipFile as e:
            return ValidationResult(is_valid=False, error_message=f'Invalid ZIP file: {e}')
        except Exception as e:
            return ValidationResult(is_valid=False, error_message=f'Error validating archive: {e}')
b/preprocessor/lib/video/emotion_utils.py @@ -0,0 +1,112 @@ +from typing import ( + Dict, + List, + Optional, + Tuple, +) + +from hsemotion_onnx.facial_emotions import HSEmotionRecognizer +import numpy as np + +from preprocessor.config.config import settings +from preprocessor.lib.core.logging import ErrorHandlingLogger + +EMOTION_LABELS = ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise'] + +class EmotionDetector: + + @staticmethod + def init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: + model_name = settings.emotion_detection.model_name + if logger: + logger.info(f'Loading HSEmotion model: {model_name}...') + try: + fer = HSEmotionRecognizer(model_name=model_name) + if logger: + logger.info(f'HSEmotion model loaded: {model_name}') + return fer + except Exception as e: + raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e + + @staticmethod + def _process_emotion_result( + emotion: str, + scores: np.ndarray, + ) -> Tuple[str, float, Dict[str, float]]: + emotion_scores = { + EMOTION_LABELS[i]: float(scores[i]) + for i in range(len(EMOTION_LABELS)) + } + confidence = float(max(scores)) + dominant_emotion = emotion.lower() + return (dominant_emotion, confidence, emotion_scores) + + @staticmethod + def detect( + face_image: np.ndarray, + model: HSEmotionRecognizer, + ) -> Tuple[str, float, Dict[str, float]]: + try: + emotion, scores = model.predict_emotions(face_image, logits=False) + return EmotionDetector._process_emotion_result(emotion, scores) + except Exception as e: + raise RuntimeError(f'Emotion detection failed: {e}') from e + + @staticmethod + def _clip_bbox( + x1: int, + y1: int, + x2: int, + y2: int, + width: int, + height: int, + ) -> Tuple[int, int, int, int]: + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(width, x2) + y2 = min(height, y2) + return (x1, y1, x2, y2) + + @staticmethod + def crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: 
+ try: + x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) + height, width = frame.shape[:2] + x1, y1, x2, y2 = EmotionDetector._clip_bbox(x1, y1, x2, y2, width, height) + if x2 <= x1 or y2 <= y1: + return None + face_crop = frame[y1:y2, x1:x2] + return face_crop if face_crop.size > 0 else None + except Exception: + return None + + @staticmethod + def detect_batch( + face_images: List[np.ndarray], + model: HSEmotionRecognizer, + batch_size: int = 32, + logger: Optional[ErrorHandlingLogger] = None, + ) -> List[Tuple[str, float, Dict[str, float]]]: + results = [] + total = len(face_images) + for batch_start in range(0, total, batch_size): + batch_end = min(batch_start + batch_size, total) + batch = face_images[batch_start:batch_end] + progress_pct = int(batch_end / total * 100) + if logger: + logger.info( + f'Processing emotion batch {batch_start}-{batch_end}/{total} ' + f'({progress_pct}%)', + ) + try: + batch_results = model.predict_multi_emotions(batch, logits=False) + for emotion, scores in batch_results: + results.append(EmotionDetector._process_emotion_result(emotion, scores)) + except Exception: + for face_img in batch: + try: + emotion, scores = model.predict_emotions(face_img, logits=False) + results.append(EmotionDetector._process_emotion_result(emotion, scores)) + except Exception: + results.append(None) + return results diff --git a/preprocessor/lib/video/frame_utils.py b/preprocessor/lib/video/frame_utils.py new file mode 100644 index 000000000..616cbea15 --- /dev/null +++ b/preprocessor/lib/video/frame_utils.py @@ -0,0 +1,32 @@ +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +from PIL import Image + + +class FrameLoader: + + @staticmethod + def _load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: + if 'frame_path' in request: + frame_path = frames_dir / request['frame_path'] + else: + frame_num = request['frame_number'] + 
frame_path = frames_dir / f'frame_{frame_num:06d}.jpg' + if frame_path.exists(): + img = Image.open(frame_path) + if convert_rgb and img.mode != 'RGB': + img = img.convert('RGB') + return img + return Image.new('RGB', (1, 1)) + + @staticmethod + def load_from_requests(frames_dir: Path, frame_requests: List[Dict[str, Any]], convert_rgb: bool=False, num_workers: int=4) -> List[Image.Image]: + with ThreadPoolExecutor(max_workers=num_workers) as executor: + images = list(executor.map(lambda req: FrameLoader._load_single(frames_dir, req, convert_rgb), frame_requests)) + return images diff --git a/preprocessor/lib/video/image_hasher.py b/preprocessor/lib/video/image_hasher.py new file mode 100644 index 000000000..b84248a25 --- /dev/null +++ b/preprocessor/lib/video/image_hasher.py @@ -0,0 +1,36 @@ +from typing import Optional + +import torch +from torch import nn +import torch.nn.functional as F +from torchvision import models +from torchvision.models import ResNet18_Weights + + +class PerceptualHasher: + + def __init__(self) -> None: + base_model = models.resnet18(weights=ResNet18_Weights.DEFAULT) + self.model: Optional[nn.Module] = nn.Sequential(*list(base_model.children())[:-1]) + self.model.eval() + if torch.cuda.is_available(): + self.model = self.model.cuda() + + def compute_hash(self, image_tensor: torch.Tensor) -> int: + if self.model is None: + raise RuntimeError('Model not initialized or already cleaned up') + with torch.no_grad(): + features = self.model(image_tensor) + features = F.adaptive_avg_pool2d(features, (1, 1)) + features = features.flatten() + hash_bits = (features > features.median()).int() + hash_val = int(''.join([str(bit) for bit in hash_bits.tolist()[:64]]), 2) + return hash_val + + def cleanup(self) -> None: + if self.model is not None: + del self.model + self.model = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() +__all__ = ['PerceptualHasher'] diff --git a/preprocessor/processors/__init__.py b/preprocessor/modules/__init__.py 
similarity index 100% rename from preprocessor/processors/__init__.py rename to preprocessor/modules/__init__.py diff --git a/preprocessor/scraping/__init__.py b/preprocessor/modules/audio/__init__.py similarity index 100% rename from preprocessor/scraping/__init__.py rename to preprocessor/modules/audio/__init__.py diff --git a/preprocessor/modules/audio/extraction.py b/preprocessor/modules/audio/extraction.py new file mode 100644 index 000000000..23f99864a --- /dev/null +++ b/preprocessor/modules/audio/extraction.py @@ -0,0 +1,62 @@ +from pathlib import Path +import subprocess + +from preprocessor.config.step_configs import AudioExtractionConfig +from preprocessor.core.artifacts import ( + AudioArtifact, + SourceVideo, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext + + +class AudioExtractionStep(PipelineStep[SourceVideo, AudioArtifact, AudioExtractionConfig]): + + @property + def name(self) -> str: + return 'audio_extraction' + + def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioArtifact: + episode_code = input_data.episode_info.episode_code() + output_filename: str = ( + f'{context.series_name}_{episode_code}.{self.config.format}' + ) + output_path: Path = context.get_output_path( + input_data.episode_info, + 'extracted_audio', + output_filename, + ) + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached audio)') + return AudioArtifact( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + format=self.config.format, + ) + context.logger.info(f'Extracting audio for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + command: list[str] = [ + 'ffmpeg', '-y', '-v', 'error', + '-i', str(input_data.path), + '-vn', + '-acodec', 'pcm_s16le', + '-ar', 
str(self.config.sample_rate), + '-ac', str(self.config.channels), + str(output_path), + ] + try: + subprocess.run(command, check=True) + except subprocess.CalledProcessError as e: + context.logger.error(f'FFmpeg audio extraction failed: {e}') + if output_path.exists(): + output_path.unlink() + raise + context.mark_step_completed(self.name, input_data.episode_id) + return AudioArtifact( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + format=self.config.format, + ) diff --git a/preprocessor/modules/audio/separation.py b/preprocessor/modules/audio/separation.py new file mode 100644 index 000000000..95cdb4bd6 --- /dev/null +++ b/preprocessor/modules/audio/separation.py @@ -0,0 +1,244 @@ +import json +from pathlib import Path +import re +from typing import ( + Any, + Dict, + List, + Tuple, +) + +from preprocessor.config.constants import ( + FILE_EXTENSIONS, + FILE_SUFFIXES, +) +from preprocessor.config.step_configs import SoundSeparationConfig +from preprocessor.config.types import ( + WordKeys, + WordTypeValues, +) +from preprocessor.core.artifacts import TranscriptionData +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.io.files import atomic_write_json +from preprocessor.lib.transcription.sound_classification import ( + classify_segment, + is_sound_event, +) + + +class SoundSeparationStep(PipelineStep[TranscriptionData, TranscriptionData, SoundSeparationConfig]): + + @property + def name(self) -> str: + return 'sound_separation' + + def execute( # pylint: disable=too-many-locals + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> TranscriptionData: + base_name = input_data.path.stem.replace(FILE_SUFFIXES['segmented'], '') + episode_dir = input_data.path.parent.parent + clean_dir = episode_dir / 'clean' + sound_dir = episode_dir / 'sound_events' + clean_dir.mkdir(parents=True, exist_ok=True) + 
sound_dir.mkdir(parents=True, exist_ok=True) + clean_json = ( + clean_dir / + f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" + ) + sound_json = ( + sound_dir / + f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}" + ) + if clean_json.exists() and sound_json.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached)') + return TranscriptionData( + path=clean_json, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + language=input_data.language, + model=input_data.model, + format='json', + ) + context.mark_step_started(self.name, input_data.episode_id) + with open(input_data.path, 'r', encoding='utf-8') as f: + data = json.load(f) + episode_info_dict = data.get('episode_info', {}) + segments = data.get('segments', []) + dialogue_segments = [] + sound_segments = [] + for segment in segments: + classification = classify_segment(segment) + if classification == 'dialogue': + cleaned = self._clean_segment_text(segment) + dialogue_segments.append(cleaned) + elif classification == 'sound_event': + cleaned = self._clean_segment_text(segment) + cleaned['sound_type'] = 'sound' + sound_segments.append(cleaned) + elif classification == 'mixed': + dialogue_parts, sound_parts = self._split_mixed_segment(segment) + dialogue_segments.extend(dialogue_parts) + sound_segments.extend(sound_parts) + dialogue_segments = self._renumber_segments(dialogue_segments) + sound_segments = self._renumber_segments(sound_segments) + clean_data = {'episode_info': episode_info_dict, 'segments': dialogue_segments} + sound_data = {'episode_info': episode_info_dict, 'segments': sound_segments} + atomic_write_json(clean_json, clean_data) + atomic_write_json(sound_json, sound_data) + clean_segmented = ( + clean_dir / + f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}" + ) + sound_segmented = ( + sound_dir / + 
f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}" + ) + atomic_write_json(clean_segmented, clean_data) + atomic_write_json(sound_segmented, sound_data) + clean_txt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}" + sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}" + clean_srt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}" + sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}" + self._generate_txt_file(clean_json, clean_txt) + self._generate_txt_file(sound_json, sound_txt) + self._generate_srt_file(dialogue_segments, clean_srt) + self._generate_srt_file(sound_segments, sound_srt) + context.mark_step_completed(self.name, input_data.episode_id) + return TranscriptionData( + path=clean_json, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + language=input_data.language, + model=input_data.model, + format='json', + ) + + @staticmethod + def _is_sound_event_text(text: str) -> bool: + return bool(re.match(r'^\(.*\)$', text.strip())) + + def _split_mixed_segment( + self, + segment: Dict[str, Any], + ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + words = segment.get(WordKeys.WORDS, []) + dialogue_parts = [] + sound_parts = [] + current_type = None + current_words = [] + current_start = None + for word in words: + word_type = 'sound' if is_sound_event(word) else 'dialogue' + if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: + if current_words: + current_words.append(word) + continue + if word_type != current_type: + if current_words and current_type: + self._finalize_sequence( + current_type, + current_words, + current_start, + dialogue_parts, + sound_parts, + ) + current_type = word_type + current_words = [word] + current_start = word.get(WordKeys.START) + else: + current_words.append(word) + if current_words and current_type: + self._finalize_sequence( + 
current_type, + current_words, + current_start, + dialogue_parts, + sound_parts, + ) + return (dialogue_parts, sound_parts) + + @staticmethod + def _finalize_sequence( + seq_type: str, + words: List[Dict[str, Any]], + start: float, + dialogue_parts: List[Dict[str, Any]], + sound_parts: List[Dict[str, Any]], + ) -> None: + non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] + if not non_spacing: + return + text = ''.join((w.get(WordKeys.TEXT, '') for w in words)) + end = words[-1].get(WordKeys.END, start) + new_segment = { + 'id': 0, + 'text': text, + WordKeys.START: start, + WordKeys.END: end, + WordKeys.WORDS: words, + } + if seq_type == 'sound': + new_segment['sound_type'] = 'sound' + sound_parts.append(new_segment) + else: + dialogue_parts.append(new_segment) + + @staticmethod + def _clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: + cleaned = segment.copy() + text = cleaned.get('text', '') + text = re.sub('\\s+', ' ', text) + cleaned['text'] = text.strip() + words = cleaned.get(WordKeys.WORDS, []) + if words: + non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] + if non_spacing: + cleaned[WordKeys.START] = min((w.get(WordKeys.START, 0) for w in non_spacing)) + cleaned[WordKeys.END] = max((w.get(WordKeys.END, 0) for w in non_spacing)) + return cleaned + + @staticmethod + def _renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for i, seg in enumerate(segments): + seg['id'] = i + return segments + + @staticmethod + def _generate_txt_file(json_path: Path, txt_path: Path) -> None: + with open(json_path, 'r', encoding='utf-8') as f: + data = json.load(f) + segments = data.get('segments', []) + text_lines = [] + for seg in segments: + text = seg.get('text', '').strip() + text = re.sub('\\([^)]*\\)', '', text) + text = re.sub('\\s+', ' ', text).strip() + if text: + text_lines.append(text) + with open(txt_path, 'w', encoding='utf-8') as f: + f.write(' 
'.join(text_lines)) + + @staticmethod + def _generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: + with open(srt_path, 'w', encoding='utf-8') as f: + for idx, seg in enumerate(segments, 1): + start = seg.get('start', 0) + end = seg.get('end', 0) + text = seg.get('text', '').strip() + start_time = SoundSeparationStep._format_srt_time(start) + end_time = SoundSeparationStep._format_srt_time(end) + f.write(f'{idx}\n') + f.write(f'{start_time} --> {end_time}\n') + f.write(f'{text}\n\n') + + @staticmethod + def _format_srt_time(seconds: float) -> str: + hours = int(seconds // 3600) + minutes = int(seconds % 3600 // 60) + secs = int(seconds % 60) + millis = int(seconds % 1 * 1000) + return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' diff --git a/preprocessor/modules/packaging/__init__.py b/preprocessor/modules/packaging/__init__.py new file mode 100644 index 000000000..79212c302 --- /dev/null +++ b/preprocessor/modules/packaging/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.modules.packaging.archives import ArchiveGenerationStep + +__all__ = ['ArchiveGenerationStep'] diff --git a/preprocessor/modules/packaging/archives.py b/preprocessor/modules/packaging/archives.py new file mode 100644 index 000000000..02f259dac --- /dev/null +++ b/preprocessor/modules/packaging/archives.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from preprocessor.config.step_configs import ArchiveConfig +from preprocessor.core.artifacts import ( + ArchiveArtifact, + ProcessedEpisode, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext + + +class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig]): + + @property + def name(self) -> str: + return 'archive_generation' + + def execute(self, input_data: ProcessedEpisode, context: ExecutionContext) -> ArchiveArtifact: + output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' + 
output_path: Path = context.get_output_path(input_data.episode_info, 'archives', output_filename) + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached archive)') + return ArchiveArtifact(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + context.logger.info(f'Generating archive for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + context.mark_step_completed(self.name, input_data.episode_id) + return ArchiveArtifact(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) diff --git a/preprocessor/modules/scraping/__init__.py b/preprocessor/modules/scraping/__init__.py new file mode 100644 index 000000000..2c13627f9 --- /dev/null +++ b/preprocessor/modules/scraping/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.modules.scraping.base_scraper import BaseScraper +from preprocessor.modules.scraping.character_scraper import CharacterScraper +from preprocessor.modules.scraping.episode_scraper import EpisodeScraper +from preprocessor.modules.scraping.reference_processor import CharacterReferenceProcessor + +__all__ = ['BaseScraper', 'CharacterReferenceProcessor', 'CharacterScraper', 'EpisodeScraper'] diff --git a/preprocessor/modules/scraping/base_scraper.py b/preprocessor/modules/scraping/base_scraper.py new file mode 100644 index 000000000..c734dbad9 --- /dev/null +++ b/preprocessor/modules/scraping/base_scraper.py @@ -0,0 +1,91 @@ +from abc import abstractmethod +import json +import logging +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.config import settings +from preprocessor.config.enums import ( + ParserMode, + ScraperMethod, +) +from preprocessor.core.base_processor import BaseProcessor +from preprocessor.lib.ai.llm_provider import LLMProvider +from 
preprocessor.lib.scraping.clipboard import ScraperClipboard +from preprocessor.lib.scraping.crawl4ai import ScraperCrawl4AI +from preprocessor.lib.ui.console import console + + +class BaseScraper(BaseProcessor): + + def __init__(self, args: Dict[str, Any], error_exit_code: int=7): + super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=error_exit_code, loglevel=logging.DEBUG) + self.urls: List[str] = self._args['urls'] + self.output_file: Path = self._args['output_file'] + self.headless: bool = self._args.get('headless', True) + scraper_method_str = self._args.get('scraper_method', 'crawl4ai') + self.scraper_method = ScraperMethod(scraper_method_str) + parser_mode_str = self._args.get('parser_mode', 'normal') + self.parser_mode = ParserMode(parser_mode_str) + self.llm: Optional[LLMProvider] = None + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'urls' not in args or not args['urls']: + raise ValueError('At least one URL is required') + if 'output_file' not in args: + raise ValueError('output_file is required') + + def _execute(self) -> None: + self.llm = LLMProvider(parser_mode=self.parser_mode) + console.print(f'[blue]Scraping {len(self.urls)} URLs...[/blue]') + scraped_pages = self.__scrape_all_urls() + if not scraped_pages: + console.print('[yellow]No pages scraped[/yellow]') + return + console.print(f'[blue]Scraped {len(scraped_pages)} pages, processing with LLM...[/blue]') + try: + self._process_scraped_pages(scraped_pages) + except Exception as e: + self.logger.error(f'LLM processing failed: {e}') + + def __scrape_all_urls(self) -> List[Dict[str, Any]]: + scraped_pages = [] + try: + for i, url in enumerate(self.urls, 1): + console.print(f'[cyan]Fetching page {i}/{len(self.urls)}[/cyan]') + try: + page_text = self.__scrape_url(url) + if page_text: + scraped_pages.append({'url': url, 'markdown': page_text}) + console.print(f'[green]✓[/green] {url}: {len(page_text)} chars') + else: + self.logger.error(f'Failed to scrape 
{url}') + except Exception as e: + self.logger.error(f'Error scraping {url}: {e}') + except KeyboardInterrupt: + console.print('\n[yellow]Scraping interrupted[/yellow]') + raise + return scraped_pages + + def __scrape_url(self, url: str) -> Optional[str]: + console.print(f'[cyan]Scraping method: {self.scraper_method.value}[/cyan]') + if self.scraper_method == ScraperMethod.CLIPBOARD: + return ScraperClipboard.scrape(url, headless=self.headless, logger=self.logger) + if self.scraper_method == ScraperMethod.CRAWL4AI: + return ScraperCrawl4AI.scrape(url, save_markdown=True, output_dir=settings.scraper.get_output_dir(self.series_name), logger=self.logger) + self.logger.error(f'Unknown scraper method: {self.scraper_method}') + return None + + def _save_result(self, result: Dict[str, Any]) -> None: + self.output_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2, ensure_ascii=False) + + @abstractmethod + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: + pass diff --git a/preprocessor/modules/scraping/character_scraper.py b/preprocessor/modules/scraping/character_scraper.py new file mode 100644 index 000000000..9d2a2bbc0 --- /dev/null +++ b/preprocessor/modules/scraping/character_scraper.py @@ -0,0 +1,25 @@ +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.lib.ui.console import console +from preprocessor.modules.scraping.base_scraper import BaseScraper + + +class CharacterScraper(BaseScraper): + + def __init__(self, args: Dict[str, Any]) -> None: + super().__init__(args) + self.series_name: str = self._args.get('series_name', '') + + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: + characters = self.llm.extract_characters(scraped_pages, self.series_name) + if not characters: + self.logger.error('LLM failed to extract any character data') + return + result = {'sources': [item['url'] for item in 
scraped_pages], 'characters': [char.model_dump() for char in characters]} + self._save_result(result) + console.print(f'[green]✓ Extracted {len(characters)} characters[/green]') + console.print(f'[green]✓ Saved to: {self.output_file}[/green]') diff --git a/preprocessor/modules/scraping/character_scraper_step.py b/preprocessor/modules/scraping/character_scraper_step.py new file mode 100644 index 000000000..e1734a387 --- /dev/null +++ b/preprocessor/modules/scraping/character_scraper_step.py @@ -0,0 +1,55 @@ +from pathlib import Path +from typing import Optional + +from preprocessor.config.step_configs import CharacterScraperConfig +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.modules.scraping.character_scraper import CharacterScraper + + +class CharacterScraperStep( + PipelineStep[SourceVideo, SourceVideo, CharacterScraperConfig], +): + def __init__(self, config: CharacterScraperConfig) -> None: + super().__init__(config) + self._executed = False + + @property + def name(self) -> str: + return "scrape_characters" + + def execute( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> Optional[SourceVideo]: + if self._executed: + return input_data + + output_path = Path(self.config.output_file) + + if output_path.exists() and not context.force_rerun: + context.logger.info(f"Characters metadata already exists: {output_path}") + self._executed = True + return input_data + + context.logger.info(f"Scraping characters from {len(self.config.urls)} URLs") + + scraper = CharacterScraper( # pylint: disable=abstract-class-instantiated + { + "urls": self.config.urls, + "output_file": output_path, + "headless": self.config.headless, + "scraper_method": self.config.scraper_method, + "parser_mode": self.config.parser_mode, + }, + ) + + exit_code = scraper.work() + + if exit_code != 0: + raise RuntimeError(f"Character scraper failed 
with exit code {exit_code}") + + context.logger.info(f"Characters metadata saved to: {output_path}") + + self._executed = True + return input_data diff --git a/preprocessor/modules/scraping/episode_scraper.py b/preprocessor/modules/scraping/episode_scraper.py new file mode 100644 index 000000000..26f36ebec --- /dev/null +++ b/preprocessor/modules/scraping/episode_scraper.py @@ -0,0 +1,86 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from preprocessor.lib.ui.console import console +from preprocessor.modules.scraping.base_scraper import BaseScraper + + +class EpisodeScraper(BaseScraper): + + def __init__(self, args: Dict[str, Any]) -> None: + super().__init__(args) + self.merge_sources: bool = self._args.get('merge_sources', True) + self.expected_episodes_count: Optional[int] = self._args.get('expected_episodes_count') + self.videos_dir: Optional[Path] = self._args.get('videos_dir') + + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: + all_seasons = self.llm.extract_all_seasons(scraped_pages) + if not all_seasons: + self.logger.error('LLM failed to extract any season data') + return + result = {'sources': [item['url'] for item in scraped_pages], 'seasons': [season.model_dump() for season in all_seasons]} + self._save_result(result) + total_episodes = sum((len(season.episodes) for season in all_seasons)) + console.print(f'[green]✓ Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]') + console.print(f'[green]✓ Saved to: {self.output_file}[/green]') + self.__validate_and_report_coverage(total_episodes) + + def __validate_and_report_coverage(self, scraped_episodes_count: int) -> None: + expected_count = self.__get_expected_episodes_count() + if expected_count is None: + self.__print_no_validation_warning(scraped_episodes_count) + return + status, message = self.__get_coverage_status(scraped_episodes_count, expected_count) + 
self.__print_coverage_report(scraped_episodes_count, expected_count, status, message) + + @staticmethod + def __print_no_validation_warning(scraped_count: int) -> None: + console.print('\n[yellow]⚠ Coverage validation:[/yellow]') + console.print(f' [cyan]Scraped episodes: {scraped_count}[/cyan]') + console.print(' [yellow]No video directory provided - unable to validate coverage[/yellow]') + console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]') + console.print(' [dim]You can add more --scrape-urls if needed[/dim]\n') + + @staticmethod + def __get_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: + if scraped < expected: + return ('missing', f'Missing {expected - scraped} episodes') + if scraped > expected: + return ('extra', f'Scraped {scraped - expected} more episodes than video files') + return ('perfect', 'Perfect coverage') + + @staticmethod + def __print_coverage_report(scraped: int, expected: int, status: str, message: str) -> None: + coverage_pct = scraped / expected * 100 if expected > 0 else 0 + console.print('\n[yellow]⚠ Episode coverage validation:[/yellow]') + console.print(f' [cyan]Scraped episodes: {scraped}[/cyan]') + console.print(f' [cyan]Video files found: {expected}[/cyan]') + console.print(f' [cyan]Coverage: {coverage_pct:.1f}%[/cyan]') + if status == 'missing': + console.print(f'\n[red]✗ WARNING: {message}![/red]') + console.print(' [yellow]Consider adding more URLs to --scrape-urls[/yellow]') + console.print(' [dim]Not all video files will have metadata available[/dim]\n') + elif status == 'extra': + console.print(f'\n[yellow]⚠ Note: {message}[/yellow]') + console.print(' [dim]This is OK if you plan to add more videos later[/dim]\n') + else: + console.print('\n[green]✓ Perfect coverage - all video files have metadata![/green]\n') + + def __get_expected_episodes_count(self) -> Optional[int]: + if self.expected_episodes_count is not None: + return self.expected_episodes_count + if self.videos_dir 
and self.videos_dir.exists(): + return self.__count_video_files(self.videos_dir) + return None + + def __count_video_files(self, directory: Path) -> int: + count = 0 + for ext in self.SUPPORTED_VIDEO_EXTENSIONS: + count += len(list(directory.rglob(f'*{ext}'))) + return count diff --git a/preprocessor/modules/scraping/episode_scraper_step.py b/preprocessor/modules/scraping/episode_scraper_step.py new file mode 100644 index 000000000..d9d234f08 --- /dev/null +++ b/preprocessor/modules/scraping/episode_scraper_step.py @@ -0,0 +1,56 @@ +from pathlib import Path +from typing import Optional + +from preprocessor.config.step_configs import EpisodeScraperConfig +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.modules.scraping.episode_scraper import EpisodeScraper + + +class EpisodeScraperStep( + PipelineStep[SourceVideo, SourceVideo, EpisodeScraperConfig], +): + def __init__(self, config: EpisodeScraperConfig) -> None: + super().__init__(config) + self._executed = False + + @property + def name(self) -> str: + return "scrape_episodes" + + def execute( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> Optional[SourceVideo]: + if self._executed: + return input_data + + output_path = Path(self.config.output_file) + + if output_path.exists() and not context.force_rerun: + context.logger.info(f"Episodes metadata already exists: {output_path}") + self._executed = True + return input_data + + context.logger.info(f"Scraping episodes from {len(self.config.urls)} URLs") + + scraper = EpisodeScraper( # pylint: disable=abstract-class-instantiated + { + "urls": self.config.urls, + "output_file": output_path, + "headless": self.config.headless, + "merge_sources": self.config.merge_sources, + "scraper_method": self.config.scraper_method, + "parser_mode": self.config.parser_mode, + }, + ) + + exit_code = scraper.work() + + if exit_code != 0: 
+ raise RuntimeError(f"Episode scraper failed with exit code {exit_code}") + + context.logger.info(f"Episodes metadata saved to: {output_path}") + + self._executed = True + return input_data diff --git a/preprocessor/characters/reference_processor.py b/preprocessor/modules/scraping/reference_processor.py similarity index 61% rename from preprocessor/characters/reference_processor.py rename to preprocessor/modules/scraping/reference_processor.py index 93ce1eb5f..cc6be995a 100644 --- a/preprocessor/characters/reference_processor.py +++ b/preprocessor/modules/scraping/reference_processor.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass from datetime import datetime import json import logging @@ -15,65 +14,42 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.characters.face_detection import init_face_detection from preprocessor.config.config import settings from preprocessor.core.base_processor import ( BaseProcessor, OutputSpec, ProcessingItem, ) -from preprocessor.utils.console import console - -warnings.filterwarnings( - "ignore", - message=".*estimate.*is deprecated.*", - category=FutureWarning, - module="insightface", +from preprocessor.lib.characters.face_detection import FaceDetector +from preprocessor.lib.characters.models import ( + CandidateFace, + FaceData, ) +from preprocessor.lib.ui.console import console - -@dataclass -class FaceData: - bbox: np.ndarray - face_vector: np.ndarray - source_image_path: Path - source_image_idx: int - face_img: np.ndarray - - -@dataclass -class CandidateFace: - faces: List[FaceData] - avg_similarity: float - +warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') class CharacterReferenceProcessor(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name="CharacterReferenceProcessor", - error_exit_code=20, - loglevel=logging.INFO, - ) - - self.characters_dir = 
args["characters_dir"] - self.output_dir = args["output_dir"] - self.similarity_threshold = args["similarity_threshold"] - self.interactive = args["interactive"] + def __init__(self, args: Dict[str, Any]): + super().__init__(args=args, class_name='CharacterReferenceProcessor', error_exit_code=20, loglevel=logging.INFO) + self.characters_dir = args['characters_dir'] + self.output_dir = args['output_dir'] + self.similarity_threshold = args['similarity_threshold'] + self.interactive = args['interactive'] self.face_app: Optional[FaceAnalysis] = None def _validate_args(self, args: Dict[str, Any]) -> None: - required = ["characters_dir", "output_dir", "similarity_threshold", "interactive"] + required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] for key in required: if key not in args: - raise ValueError(f"Missing required argument: {key}") + raise ValueError(f'Missing required argument: {key}') def get_output_subdir(self) -> str: - return "character_references" + return 'character_references' def _load_resources(self) -> bool: - self.face_app = init_face_detection() + self.face_app = FaceDetector.init() return True @staticmethod @@ -85,85 +61,62 @@ def __safe_resize(img: np.ndarray, target_size: tuple) -> Optional[np.ndarray]: try: return cv2.resize(img, target_size) except cv2.error as e: - logging.error(f"OpenCV resize error: {e}") + logging.error(f'OpenCV resize error: {e}') return None def _get_processing_items(self) -> List[ProcessingItem]: items = [] - if not self.characters_dir.exists(): - console.print(f"[red]Characters directory not found: {self.characters_dir}[/red]") + console.print(f'[red]Characters directory not found: {self.characters_dir}[/red]') return items - for char_dir in sorted(self.characters_dir.iterdir()): if not char_dir.is_dir(): continue - - items.append( - ProcessingItem( - episode_id=char_dir.name, - input_path=char_dir, - metadata={"char_name": char_dir.name}, - ), - ) - + 
items.append(ProcessingItem(episode_id=char_dir.name, input_path=char_dir, metadata={'char_name': char_dir.name})) return items def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: char_output_dir = self.output_dir / item.episode_id - return [ - OutputSpec(path=char_output_dir / "metadata.json", required=True), - OutputSpec(path=char_output_dir / "face_vector.npy", required=True), + OutputSpec(path=char_output_dir / 'metadata.json', required=True), + OutputSpec(path=char_output_dir / 'face_vector.npy', required=True), ] def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: char_dir = item.input_path - char_name = item.metadata["char_name"] - - console.print(f"[blue]Processing character: {char_name}[/blue]") - - reference_images = sorted(char_dir.glob("*.jpg")) - + char_name = item.metadata['char_name'] + console.print(f'[blue]Processing character: {char_name}[/blue]') + reference_images = sorted(char_dir.glob('*.jpg')) if len(reference_images) < 2: - console.print(f"[yellow]Skipping {char_name}: need at least 2 reference images, found {len(reference_images)}[/yellow]") + console.print(f'[yellow]Skipping {char_name}: need at least 2 reference images, found {len(reference_images)}[/yellow]') return - all_faces = self.__detect_faces_in_references(reference_images) - if not all_faces or not all_faces[0]: - console.print(f"[yellow]Skipping {char_name}: no faces detected in reference images[/yellow]") + console.print(f'[yellow]Skipping {char_name}: no faces detected in reference images[/yellow]') return - selected_faces = self.__find_common_face(all_faces, char_name, reference_images) - if not selected_faces: - console.print(f"[yellow]Skipping {char_name}: could not identify common face[/yellow]") + console.print(f'[yellow]Skipping {char_name}: could not identify common face[/yellow]') return - self.__save_processed_references(char_name, selected_faces, reference_images) - console.print(f"[green]✓ Processed 
{char_name}[/green]") + console.print(f'[green]✓ Processed {char_name}[/green]') def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[FaceData]]: all_faces = [] - for idx, img_path in enumerate(image_paths): img = cv2.imread(str(img_path)) if img is None: - console.print(f"[yellow]Warning: Could not read {img_path}[/yellow]") + console.print(f'[yellow]Warning: Could not read {img_path}[/yellow]') all_faces.append([]) continue - - console.print(f"[dim] {img_path.name}: detecting faces (image size: {img.shape[1]}x{img.shape[0]})...[/dim]") + console.print(f'[dim] {img_path.name}: detecting faces (image size: {img.shape[1]}x{img.shape[0]})...[/dim]') faces = self.face_app.get(img) - console.print(f"[dim] Found {len(faces)} face(s)[/dim]") - + console.print(f'[dim] Found {len(faces)} face(s)[/dim]') faces_data = [] for face in faces: bbox = face.bbox.astype(int) x1, y1, x2, y2 = bbox face_img = img[y1:y2, x1:x2] - faces_data.append( FaceData( bbox=bbox, @@ -173,9 +126,7 @@ def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[Fac face_img=face_img, ), ) - all_faces.append(faces_data) - return all_faces def __find_common_face( @@ -186,43 +137,38 @@ def __find_common_face( ) -> Optional[List[FaceData]]: first_image_faces = all_faces[0] remaining_images = all_faces[1:] - candidates = [] - for first_face in first_image_faces: matched_faces = [first_face] similarities = [] - for other_image_faces in remaining_images: if not other_image_faces: break - best_match = None - best_similarity = -1.0 - + best_similarity: float = -1.0 for other_face in other_image_faces: - similarity = np.dot(first_face.face_vector, other_face.face_vector) + similarity: float = float( + np.dot( + first_face.face_vector, + other_face.face_vector, + ), + ) if similarity > best_similarity: - best_similarity = similarity # pylint: disable=redefined-variable-type + best_similarity = similarity best_match = other_face - if best_match: 
matched_faces.append(best_match) similarities.append(best_similarity) if best_similarity < self.similarity_threshold: - console.print(f"[yellow]Warning: Low similarity {best_similarity:.2f} < {self.similarity_threshold:.2f}[/yellow]") + console.print( + f'[yellow]Warning: Low similarity {best_similarity:.2f} < ' + f'{self.similarity_threshold:.2f}[/yellow]', + ) else: break - if len(matched_faces) == len(all_faces): avg_similarity = np.mean(similarities) if similarities else 1.0 - candidates.append( - CandidateFace( - faces=matched_faces, - avg_similarity=avg_similarity, - ), - ) - + candidates.append(CandidateFace(faces=matched_faces, avg_similarity=avg_similarity)) if len(candidates) == 0: if self.interactive: return self.__ask_user_to_select_initial_face( @@ -244,21 +190,17 @@ def __ask_user_to_select_candidate( candidates: List[CandidateFace], char_name: str, ) -> Optional[List[FaceData]]: - console.print(f"[yellow]Character: {char_name}[/yellow]") - console.print(f"[yellow]Found {len(candidates)} possible matches across all reference images.[/yellow]") - + console.print(f'[yellow]Character: {char_name}[/yellow]') + console.print(f'[yellow]Found {len(candidates)} possible matches across all reference images.[/yellow]') for idx, candidate in enumerate(candidates, 1): - console.print(f"Candidate {idx}: avg similarity = {candidate.avg_similarity:.2f}") - - grid_path = self.__create_selection_grid(candidates, "candidates", char_name) - console.print(f"[blue]Grid image saved to: {grid_path}[/blue]") - + console.print(f'Candidate {idx}: avg similarity = {candidate.avg_similarity:.2f}') + grid_path = self.__create_selection_grid(candidates, 'candidates', char_name) + console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') while True: - user_input = input(f"Select the correct character (1-{len(candidates)}) or skip (s): ").strip().lower() # pylint: disable=bad-builtin - + prompt = f'Select the correct character (1-{len(candidates)}) or skip (s): ' + user_input 
= input(prompt).strip().lower() # pylint: disable=bad-builtin if user_input == 's': return None - try: selection = int(user_input) if 1 <= selection <= len(candidates): @@ -274,20 +216,23 @@ def __ask_user_to_select_initial_face( char_name: str, reference_images: List[Path], ) -> Optional[List[FaceData]]: - console.print(f"[yellow]Character: {char_name}[/yellow]") - console.print("[yellow]No common face found across all reference images.[/yellow]") - console.print("[yellow]Manual selection mode: Please select the correct face from the first image.[/yellow]") - console.print(f"[yellow]Found {len(first_image_faces)} faces in first reference image.[/yellow]") - - grid_path = self.__create_selection_grid(first_image_faces, "manual", char_name) - console.print(f"[blue]Grid image saved to: {grid_path}[/blue]") - + console.print(f'[yellow]Character: {char_name}[/yellow]') + console.print('[yellow]No common face found across all reference images.[/yellow]') + console.print( + '[yellow]Manual selection mode: Please select the correct face ' + 'from the first image.[/yellow]', + ) + console.print( + f'[yellow]Found {len(first_image_faces)} faces in ' + 'first reference image.[/yellow]', + ) + grid_path = self.__create_selection_grid(first_image_faces, 'manual', char_name) + console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') while True: - user_input = input(f"Select the correct face (1-{len(first_image_faces)}) or skip (s): ").strip().lower() # pylint: disable=bad-builtin - + prompt = f'Select the correct face (1-{len(first_image_faces)}) or skip (s): ' + user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin if user_input == 's': return None - try: selection = int(user_input) if 1 <= selection <= len(first_image_faces): @@ -298,7 +243,9 @@ def __ask_user_to_select_initial_face( [selected_face], reference_images, ) - console.print(f"[red]Invalid selection. 
Please enter 1-{len(first_image_faces)} or 's'[/red]") + console.print( + f"[red]Invalid selection. Please enter 1-{len(first_image_faces)} or 's'[/red]", + ) except ValueError: console.print("[red]Invalid input. Please enter a number or 's'[/red]") @@ -311,115 +258,79 @@ def __find_matching_faces_for_reference( ) -> Optional[List[FaceData]]: for img_idx, other_image_faces in enumerate(remaining_images, 1): if not other_image_faces: - console.print( - f"[red]No faces found in image {img_idx + 1}: {reference_images[img_idx]}[/red]", - ) + img_path = reference_images[img_idx] + console.print(f'[red]No faces found in image {img_idx + 1}: {img_path}[/red]') return None - best_match = None - best_sim = -1.0 - + best_sim: float = -1.0 for other_face in other_image_faces: - similarity = np.dot(reference_vector, other_face.face_vector) + similarity: float = float(np.dot(reference_vector, other_face.face_vector)) if similarity > best_sim: - best_sim = similarity # pylint: disable=redefined-variable-type + best_sim = similarity best_match = other_face - if best_match: matched_faces.append(best_match) if best_sim < self.similarity_threshold: + img_path = reference_images[img_idx] console.print( - f"[yellow]Warning: Low similarity in image {img_idx + 1}: {reference_images[img_idx]} " - f"(similarity: {best_sim:.2f} < threshold: {self.similarity_threshold:.2f})[/yellow]", + f'[yellow]Warning: Low similarity in image {img_idx + 1}: ' + f'{img_path} (similarity: {best_sim:.2f} < ' + f'threshold: {self.similarity_threshold:.2f})[/yellow]', ) else: console.print( - f"[red]No faces detected in image {img_idx + 1}: {reference_images[img_idx]}[/red]", + f'[red]No faces detected in image {img_idx + 1}: ' + f'{reference_images[img_idx]}[/red]', ) return None - return matched_faces - def __create_selection_grid( # pylint: disable=too-many-locals - self, - data, - mode: str, - char_name: str, - ) -> Path: - if mode == "candidates": + def __create_selection_grid(self, data, mode: str, 
char_name: str) -> Path: # pylint: disable=too-many-locals + if mode == 'candidates': candidates = data num_refs = len(candidates[0].faces) num_candidates = len(candidates) - face_size = 150 padding = 10 label_height = 30 - grid_width = num_refs * (face_size + padding) + padding grid_height = num_candidates * (face_size + label_height + padding) + padding + label_height - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - for col_idx in range(num_refs): - label = f"Ref {col_idx + 1}" + label = f'Ref {col_idx + 1}' x = padding + col_idx * (face_size + padding) - cv2.putText( - grid, - label, - (x + 10, 20), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (0, 0, 0), - 1, + cv2.putText( grid, label, (x + 10, 20), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, ) - for cand_idx, candidate in enumerate(candidates): y_base = label_height + padding + cand_idx * (face_size + label_height + padding) - for face_idx, face_data in enumerate(candidate.faces): x = padding + face_idx * (face_size + padding) y = y_base - face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) if face_resized is not None: grid[y:y + face_size, x:x + face_size] = face_resized - - label = f"Candidate {cand_idx + 1}" - cv2.putText( - grid, - label, - (5, y_base + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (0, 0, 255), - 1, - ) - + label = f'Candidate {cand_idx + 1}' + cv2.putText( grid, label, (5, y_base + face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1, ) else: faces_data = data num_faces = len(faces_data) cols = min(3, num_faces) rows = (num_faces + cols - 1) // cols - face_size = 150 padding = 10 - grid_width = cols * (face_size + padding) + padding grid_height = rows * (face_size + padding) + padding - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - for idx, face_data in enumerate(faces_data): row = idx // cols col = idx % cols - x = padding + col * (face_size + padding) y = padding + row * (face_size + padding) - 
face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) if face_resized is not None: grid[y:y + face_size, x:x + face_size] = face_resized - label = str(idx + 1) cv2.putText( grid, @@ -430,13 +341,10 @@ def __create_selection_grid( # pylint: disable=too-many-locals (0, 0, 255), 2, ) - - selection_grids_dir = self.output_dir.parent / "character_selection_grids" + selection_grids_dir = self.output_dir.parent / 'character_selection_grids' selection_grids_dir.mkdir(parents=True, exist_ok=True) - output_path = selection_grids_dir / f"{char_name.replace(' ', '_').lower()}_selection.jpg" cv2.imwrite(str(output_path), grid) - return output_path def __save_processed_references( # pylint: disable=too-many-locals @@ -447,84 +355,64 @@ def __save_processed_references( # pylint: disable=too-many-locals ) -> None: char_output_dir = self.output_dir / char_name char_output_dir.mkdir(parents=True, exist_ok=True) - face_vectors = [] for idx, face_data in enumerate(selected_faces): - face_normalized = self.__safe_resize( - face_data.face_img, - settings.character.normalized_face_size, - ) - + face_normalized = self.__safe_resize(face_data.face_img, settings.character.normalized_face_size) if face_normalized is None: - self.logger.warning( - f"Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)", - ) + self.logger.warning(f'Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)') continue - - face_output_path = char_output_dir / f"face_{idx:02d}.jpg" + face_output_path = char_output_dir / f'face_{idx:02d}.jpg' cv2.imwrite(str(face_output_path), face_normalized) - face_vectors.append(face_data.face_vector) - mean_vector = np.mean(face_vectors, axis=0) - vector_path = char_output_dir / "face_vector.npy" + vector_path = char_output_dir / 'face_vector.npy' np.save(vector_path, mean_vector) - total_faces_detected = [] for faces_list in self.__detect_faces_in_references(reference_images): 
total_faces_detected.append(len(faces_list)) - similarities = [] if len(selected_faces) > 1: for i in range(len(selected_faces) - 1): similarity = np.dot(selected_faces[i].face_vector, selected_faces[i + 1].face_vector) similarities.append(similarity) - metadata = { - "character_name": char_name.replace("_", " ").title(), - "source_images": [str(img) for img in reference_images], - "processed_at": datetime.now().isoformat(), - "processing_params": { - "similarity_threshold": self.similarity_threshold, - "face_model": settings.face_recognition.model_name, - "normalized_face_size": list(settings.character.normalized_face_size), + 'character_name': char_name.replace('_', ' ').title(), + 'source_images': [str(img) for img in reference_images], + 'processed_at': datetime.now().isoformat(), + 'processing_params': { + 'similarity_threshold': self.similarity_threshold, + 'face_model': settings.face_recognition.model_name, + 'normalized_face_size': list(settings.character.normalized_face_size), }, - "detection_stats": { - "total_faces_detected": total_faces_detected, - "candidates_found": 1, - "selection_method": "automatic" if len(selected_faces) == len(reference_images) else "manual", + 'detection_stats': { + 'total_faces_detected': total_faces_detected, + 'candidates_found': 1, + 'selection_method': 'automatic' if len(selected_faces) == len(reference_images) else 'manual', }, - "selected_face_indices": [face.source_image_idx for face in selected_faces], - "average_similarity": float(np.mean(similarities)) if similarities else 1.0, - "face_vector_dim": int(mean_vector.shape[0]), + 'selected_face_indices': [face.source_image_idx for face in selected_faces], + 'average_similarity': float(np.mean(similarities)) if similarities else 1.0, + 'face_vector_dim': int(mean_vector.shape[0]), } - - metadata_path = char_output_dir / "metadata.json" - with open(metadata_path, "w", encoding="utf-8") as f: + metadata_path = char_output_dir / 'metadata.json' + with open(metadata_path, 
'w', encoding='utf-8') as f: json.dump(metadata, f, indent=2, ensure_ascii=False) def _get_progress_description(self) -> str: - return "Processing character references" + return 'Processing character references' def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,too-many-statements - output_path = self.output_dir / "validation_grid.png" - + output_path = self.output_dir / 'validation_grid.png' if output_path.exists(): - console.print(f"[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]") + console.print(f'[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]') return - - console.print("\n[blue]Generating validation grid...[/blue]") - + console.print('\n[blue]Generating validation grid...[/blue]') if not self.output_dir.exists(): - console.print("[yellow]No processed references found, skipping validation grid[/yellow]") + console.print('[yellow]No processed references found, skipping validation grid[/yellow]') return - processed_chars = sorted([d for d in self.output_dir.iterdir() if d.is_dir()]) - if not processed_chars: - console.print("[yellow]No processed characters found, skipping validation grid[/yellow]") + console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') return - face_size = 280 padding = 15 row_height = face_size + padding * 2 @@ -534,17 +422,13 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t stats_col_width = 200 face_col_width = face_size + padding faces_per_char = 3 - - grid_width = label_col_width + stats_col_width + (faces_per_char * face_col_width) + padding * 2 - grid_height = header_height + (len(processed_chars) * row_height) + footer_height - + grid_width = label_col_width + stats_col_width + faces_per_char * face_col_width + padding * 2 + grid_height = header_height + len(processed_chars) * row_height + footer_height bg_color = (250, 252, 255) grid = np.full((grid_height, grid_width, 3), bg_color, 
dtype=np.uint8) - header_bg_color = (45, 55, 72) cv2.rectangle(grid, (0, 0), (grid_width, header_height), header_bg_color, -1) - - title_text = "FACIAL REFERENCE VALIDATION REPORT" + title_text = 'FACIAL REFERENCE VALIDATION REPORT' cv2.putText( grid, title_text, @@ -555,8 +439,7 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 2, cv2.LINE_AA, ) - - subtitle = "InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis" + subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' cv2.putText( grid, subtitle, @@ -567,25 +450,17 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - metadata_all = [] for char_dir in processed_chars: - metadata_file = char_dir / "metadata.json" + metadata_file = char_dir / 'metadata.json' if metadata_file.exists(): - with open(metadata_file, "r", encoding="utf-8") as f: + with open(metadata_file, 'r', encoding='utf-8') as f: metadata_all.append(json.load(f)) - total_chars = len(processed_chars) - avg_similarity = np.mean([m.get("average_similarity", 0) for m in metadata_all]) if metadata_all else 0 + avg_similarity = np.mean([m.get('average_similarity', 0) for m in metadata_all]) if metadata_all else 0 threshold = self.similarity_threshold - stats_y = 115 - stats_items = [ - f"Total Subjects: {total_chars}", - f"Avg Similarity: {avg_similarity:.4f}", - f"Threshold: {threshold:.2f}", - ] - + stats_items = [f'Total Subjects: {total_chars}', f'Avg Similarity: {avg_similarity:.4f}', f'Threshold: {threshold:.2f}'] for idx, stat in enumerate(stats_items): x_pos = padding * 3 + idx * 280 cv2.putText( @@ -598,18 +473,15 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - table_header_y = header_height + 1 cv2.line(grid, (0, table_header_y), (grid_width, table_header_y), (180, 190, 200), 2) - col_headers = [ - ("CHARACTER NAME", label_col_width // 2, 0), - 
("STATISTICS", label_col_width + stats_col_width // 2, 0), - ("REFERENCE IMAGE 1", label_col_width + stats_col_width + face_col_width // 2, 0), - ("REFERENCE IMAGE 2", label_col_width + stats_col_width + face_col_width * 3 // 2, 0), - ("REFERENCE IMAGE 3", label_col_width + stats_col_width + face_col_width * 5 // 2, 0), + ('CHARACTER NAME', label_col_width // 2, 0), + ('STATISTICS', label_col_width + stats_col_width // 2, 0), + ('REFERENCE IMAGE 1', label_col_width + stats_col_width + face_col_width // 2, 0), + ('REFERENCE IMAGE 2', label_col_width + stats_col_width + face_col_width * 3 // 2, 0), + ('REFERENCE IMAGE 3', label_col_width + stats_col_width + face_col_width * 5 // 2, 0), ] - header_row_height = 40 for text, x_center, _ in col_headers: text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] @@ -624,7 +496,6 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - cv2.line( grid, (0, table_header_y + header_row_height), @@ -632,52 +503,31 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t (200, 210, 220), 1, ) - y_offset = header_height + header_row_height + padding - for idx, char_dir in enumerate(processed_chars): - char_name = char_dir.name.replace("_", " ").title() - metadata_file = char_dir / "metadata.json" - + char_name = char_dir.name.replace('_', ' ').title() + metadata_file = char_dir / 'metadata.json' if idx % 2 == 0: row_bg = (245, 248, 252) else: row_bg = bg_color - - cv2.rectangle( - grid, - (0, y_offset - padding), - (grid_width, y_offset + face_size + padding), - row_bg, - -1, - ) - - cv2.putText( - grid, - char_name, - (padding * 2, y_offset + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (30, 40, 50), - 1, - cv2.LINE_AA, + cv2.rectangle( grid, (0, y_offset - padding), + (grid_width, y_offset + face_size + padding), row_bg, -1, ) - + cv2.putText( grid, char_name, (padding * 2, y_offset + face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, 
0.55, (30, 40, 50), 1, cv2.LINE_AA, ) if metadata_file.exists(): - with open(metadata_file, "r", encoding="utf-8") as f: + with open(metadata_file, 'r', encoding='utf-8') as f: metadata = json.load(f) - - similarity = metadata.get("average_similarity", 0.0) - method = metadata.get("detection_stats", {}).get("selection_method", "unknown") - faces_detected = metadata.get("detection_stats", {}).get("total_faces_detected", []) - + similarity = metadata.get('average_similarity', 0.0) + method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') + faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) stats_x = label_col_width + padding stats_y_base = y_offset + face_size // 2 - 30 - sim_color = (0, 150, 0) if similarity >= threshold else (180, 100, 0) cv2.putText( grid, - f"Similarity: {similarity:.4f}", + f'Similarity: {similarity:.4f}', (stats_x, stats_y_base), cv2.FONT_HERSHEY_SIMPLEX, 0.45, @@ -685,11 +535,10 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - - method_color = (50, 120, 200) if method == "automatic" else (180, 100, 50) + method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) cv2.putText( grid, - f"Method: {method}", + f'Method: {method}', (stats_x, stats_y_base + 25), cv2.FONT_HERSHEY_SIMPLEX, 0.42, @@ -697,11 +546,10 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - - faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f"[{len(faces_detected)} imgs]" + faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' cv2.putText( grid, - f"Detected: {faces_str}", + f'Detected: {faces_str}', (stats_x, stats_y_base + 50), cv2.FONT_HERSHEY_SIMPLEX, 0.38, @@ -709,23 +557,17 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - - face_files = sorted(char_dir.glob("face_*.jpg")) - + 
face_files = sorted(char_dir.glob('face_*.jpg')) for face_idx, face_file in enumerate(face_files[:faces_per_char]): face_img = cv2.imread(str(face_file)) if face_img is None: continue - face_resized = CharacterReferenceProcessor.__safe_resize(face_img, (face_size, face_size)) if face_resized is None: continue - x = label_col_width + stats_col_width + face_idx * face_col_width + padding y = y_offset - grid[y:y + face_size, x:x + face_size] = face_resized - border_color = (180, 190, 200) cv2.rectangle( grid, @@ -734,16 +576,15 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t border_color, 1, ) - y_offset += row_height - footer_y = grid_height - footer_height + 20 cv2.line(grid, (0, footer_y - 20), (grid_width, footer_y - 20), (200, 210, 220), 1) - - footer_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " \ - f"Model: {settings.face_recognition.model_name} | " \ - f"Normalized Size: {settings.character.normalized_face_size[0]}x{settings.character.normalized_face_size[1]}px" - + footer_text = ( + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " + f"Model: {settings.face_recognition.model_name} | " + f"Normalized Size: {settings.character.normalized_face_size[0]}x" + f"{settings.character.normalized_face_size[1]}px" + ) cv2.putText( grid, footer_text, @@ -754,13 +595,11 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) - legend_y = footer_y + 30 legend_items = [ - ("Automatic: Face found on all references", (50, 120, 200)), - ("Manual: User-selected reference", (180, 100, 50)), + ('Automatic: Face found on all references', (50, 120, 200)), + ('Manual: User-selected reference', (180, 100, 50)), ] - for idx, (text, color) in enumerate(legend_items): x_pos = padding * 3 + idx * 380 cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) @@ -774,10 +613,12 @@ def generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,t 1, cv2.LINE_AA, ) 
- - cv2.imwrite(str(output_path), grid, [cv2.IMWRITE_PNG_COMPRESSION, 6]) - - console.print(f"[green]✓ Validation grid saved to: {output_path}[/green]") - console.print(f"[green] Grid size: {grid_width}x{grid_height}px[/green]") - console.print(f"[green] Characters: {len(processed_chars)}[/green]") - console.print(f"[green] Average similarity: {avg_similarity:.4f}[/green]") + cv2.imwrite( + str(output_path), + grid, + [cv2.IMWRITE_PNG_COMPRESSION, 6], + ) + console.print(f'[green]✓ Validation grid saved to: {output_path}[/green]') + console.print(f'[green] Grid size: {grid_width}x{grid_height}px[/green]') + console.print(f'[green] Characters: {len(processed_chars)}[/green]') + console.print(f'[green] Average similarity: {avg_similarity:.4f}[/green]') diff --git a/preprocessor/modules/scraping/reference_processor_step.py b/preprocessor/modules/scraping/reference_processor_step.py new file mode 100644 index 000000000..7ab632766 --- /dev/null +++ b/preprocessor/modules/scraping/reference_processor_step.py @@ -0,0 +1,63 @@ +from pathlib import Path +from typing import Optional + +from preprocessor.config.step_configs import CharacterReferenceConfig +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.modules.scraping.reference_processor import CharacterReferenceProcessor + + +class CharacterReferenceStep( + PipelineStep[SourceVideo, SourceVideo, CharacterReferenceConfig], +): + def __init__(self, config: CharacterReferenceConfig) -> None: + super().__init__(config) + self._executed = False + + @property + def name(self) -> str: + return "process_character_references" + + def execute( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> Optional[SourceVideo]: + if self._executed: + return input_data + + characters_path = Path(self.config.characters_file) + output_dir = Path(self.config.output_dir) + + if not 
characters_path.exists(): + raise FileNotFoundError( + f"Characters file not found: {characters_path}. " + f"Run scrape_characters first.", + ) + + if output_dir.exists() and any(output_dir.iterdir()) and not context.force_rerun: + context.logger.info(f"Character references already exist in: {output_dir}") + self._executed = True + return input_data + + context.logger.info(f"Processing character references from {characters_path}") + + processor = CharacterReferenceProcessor( + { + "characters_file": characters_path, + "output_dir": output_dir, + "search_engine": self.config.search_engine, + "images_per_character": self.config.images_per_character, + }, + ) + + exit_code = processor.work() + + if exit_code != 0: + raise RuntimeError( + f"Character reference processor failed with exit code {exit_code}", + ) + + context.logger.info(f"Character references saved to: {output_dir}") + + self._executed = True + return input_data diff --git a/preprocessor/transcription/engines/__init__.py b/preprocessor/modules/search/__init__.py similarity index 100% rename from preprocessor/transcription/engines/__init__.py rename to preprocessor/modules/search/__init__.py diff --git a/preprocessor/modules/search/clients/__init__.py b/preprocessor/modules/search/clients/__init__.py new file mode 100644 index 000000000..a927c8764 --- /dev/null +++ b/preprocessor/modules/search/clients/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.modules.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.modules.search.clients.embedding_service import EmbeddingService +from preprocessor.modules.search.clients.hash_service import HashService +from preprocessor.modules.search.clients.result_formatters import ResultFormatter + +__all__ = ['ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] diff --git a/preprocessor/modules/search/clients/elasticsearch_queries.py b/preprocessor/modules/search/clients/elasticsearch_queries.py new file mode 100644 index 
000000000..86fa95638 --- /dev/null +++ b/preprocessor/modules/search/clients/elasticsearch_queries.py @@ -0,0 +1,411 @@ +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from elasticsearch import AsyncElasticsearch + +from preprocessor.modules.search.clients.embedding_service import EmbeddingService + + +class ElasticsearchQueries: + + def __init__(self, embedding_service: EmbeddingService) -> None: + self._embedding_service = embedding_service + + @staticmethod + def _build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: + filters = [] + if season is not None: + filters.append({'term': {'episode_metadata.season': season}}) + if episode is not None: + filters.append({'term': {'episode_metadata.episode_number': episode}}) + return filters + + async def search_text_query( + self, + es_client: AsyncElasticsearch, + query: str, + season: Optional[int]=None, + episode: Optional[int]=None, + limit: int=20, + ) -> Dict[str, Any]: + must_clauses = [ + {'multi_match': {'query': query, 'fields': ['text^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, + ] + must_clauses.extend(self._build_episode_filters(season, episode)) + query_body = {'bool': {'must': must_clauses}} + return await es_client.search( + index='ranczo_segments', + query=query_body, + size=limit, + _source=[ + 'episode_id', 'segment_id', 'text', 'start_time', 'end_time', 'speaker', + 'video_path', 'episode_metadata', 'scene_info', + ], + ) + + async def search_text_semantic( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int]=None, + episode: Optional[int]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + filter_clauses = self._build_episode_filters(season, episode) + knn_query: Dict[str, Any] = { + 'field': 'text_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, + } + if filter_clauses: + knn_query['filter'] = 
filter_clauses + return await es_client.search( + index='ranczo_text_embeddings', + knn=knn_query, + size=limit, + _source=[ + 'episode_id', 'embedding_id', 'text', 'segment_range', + 'video_path', 'episode_metadata', 'scene_info', + ], + ) + + async def search_video_semantic( + self, + es_client: AsyncElasticsearch, + image_path: str, + season: Optional[int]=None, + episode: Optional[int]=None, + character: Optional[str]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_image_embedding(image_path) + filter_clauses = self._build_episode_filters(season, episode) + if character: + filter_clauses.append({ + 'nested': { + 'path': 'character_appearances', + 'query': {'term': {'character_appearances.name': character}}, + }, + }) + knn_query: Dict[str, Any] = { + 'field': 'video_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, + } + if filter_clauses: + knn_query['filter'] = filter_clauses + return await es_client.search( + index='ranczo_video_frames', + knn=knn_query, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', + 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', + ], + ) + + async def search_text_to_video( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int]=None, + episode: Optional[int]=None, + character: Optional[str]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + filter_clauses = self._build_episode_filters(season, episode) + if character: + filter_clauses.append({ + 'nested': { + 'path': 'character_appearances', + 'query': {'term': {'character_appearances.name': character}}, + }, + }) + knn_query: Dict[str, Any] = { + 'field': 'video_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, + } + if filter_clauses: + knn_query['filter'] = filter_clauses + return await 
es_client.search( + index='ranczo_video_frames', + knn=knn_query, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', + 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', + ], + ) + + @staticmethod + async def search_by_character( + es_client: AsyncElasticsearch, + character: str, + season: Optional[int]=None, + episode: Optional[int]=None, + limit: int=20, + ) -> Dict[str, Any]: + must_clauses = [{ + 'nested': { + 'path': 'character_appearances', + 'query': {'term': {'character_appearances.name': character}}, + }, + }] + must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) + return await es_client.search( + index='ranczo_video_frames', + query={'bool': {'must': must_clauses}}, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'video_path', + 'episode_metadata', 'character_appearances', 'scene_info', + ], + ) + + @staticmethod + async def search_by_emotion( + es_client: AsyncElasticsearch, + emotion: str, + season: Optional[int]=None, + episode: Optional[int]=None, + character: Optional[str]=None, + limit: int=20, + ) -> Dict[str, Any]: + nested_must = [{'term': {'character_appearances.emotion.label': emotion}}] + if character: + nested_must.append({'term': {'character_appearances.name': character}}) + must_clauses = [{'nested': {'path': 'character_appearances', 'query': {'bool': {'must': nested_must}}}}] + must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) + nested_filter: Dict[str, Any] = {'term': {'character_appearances.emotion.label': emotion}} + if character: + nested_filter = { + 'bool': { + 'must': [ + {'term': {'character_appearances.emotion.label': emotion}}, + {'term': {'character_appearances.name': character}}, + ], + }, + } + return await es_client.search( + index='ranczo_video_frames', + query={'bool': {'must': must_clauses}}, + sort=[{ + 'character_appearances.emotion.confidence': { 
+ 'order': 'desc', + 'nested': {'path': 'character_appearances', 'filter': nested_filter}, + }, + }], + track_scores=True, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'video_path', + 'episode_metadata', 'character_appearances', 'scene_info', + ], + ) + + @staticmethod + async def search_by_object( + es_client: AsyncElasticsearch, + object_query: str, + season: Optional[int]=None, + episode: Optional[int]=None, + limit: int=20, + ) -> Dict[str, Any]: + filter_clauses = ElasticsearchQueries._build_episode_filters(season, episode) + must_clauses: List[Dict[str, Any]] = [] + if ':' in object_query: + object_class, count_filter = object_query.split(':', 1) + object_class = object_class.strip() + if count_filter.endswith('+'): + min_count = int(count_filter[:-1]) + must_clauses.append({ + 'nested': { + 'path': 'detected_objects', + 'query': { + 'bool': { + 'must': [ + {'term': {'detected_objects.class': object_class}}, + {'range': {'detected_objects.count': {'gte': min_count}}}, + ], + }, + }, + }, + }) + elif '-' in count_filter: + min_c, max_c = count_filter.split('-') + must_clauses.append({ + 'nested': { + 'path': 'detected_objects', + 'query': { + 'bool': { + 'must': [ + {'term': {'detected_objects.class': object_class}}, + {'range': {'detected_objects.count': {'gte': int(min_c), 'lte': int(max_c)}}}, + ], + }, + }, + }, + }) + else: + exact_count = int(count_filter) + must_clauses.append({ + 'nested': { + 'path': 'detected_objects', + 'query': { + 'bool': { + 'must': [ + {'term': {'detected_objects.class': object_class}}, + {'term': {'detected_objects.count': exact_count}}, + ], + }, + }, + }, + }) + else: + must_clauses.append({ + 'nested': { + 'path': 'detected_objects', + 'query': {'term': {'detected_objects.class': object_query.strip()}}, + }, + }) + query_body = {'bool': {'must': must_clauses, 'filter': filter_clauses}} + object_class = object_query.split(':')[0].strip() if ':' in object_query else object_query.strip() + return await 
es_client.search( + index='ranczo_video_frames', + query=query_body, + sort=[{ + 'detected_objects.count': { + 'order': 'desc', + 'nested': { + 'path': 'detected_objects', + 'filter': {'term': {'detected_objects.class': object_class}}, + }, + }, + }], + track_scores=True, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'detected_objects', 'character_appearances', + 'video_path', 'episode_metadata', 'scene_info', + ], + ) + + @staticmethod + async def search_perceptual_hash( + es_client: AsyncElasticsearch, + phash: str, + limit: int=10, + ) -> Dict[str, Any]: + return await es_client.search( + index='ranczo_video_frames', + query={'term': {'perceptual_hash': phash}}, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'video_path', + 'episode_metadata', 'perceptual_hash', 'scene_info', + ], + ) + + @staticmethod + async def list_characters(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + result = await es_client.search( + index='ranczo_video_frames', + size=0, + aggs={ + 'characters_nested': { + 'nested': {'path': 'character_appearances'}, + 'aggs': { + 'character_names': { + 'terms': {'field': 'character_appearances.name', 'size': 1000}, + }, + }, + }, + }, + ) + buckets = result['aggregations']['characters_nested']['character_names']['buckets'] + return [(b['key'], b['doc_count']) for b in buckets] + + @staticmethod + async def list_objects(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + result = await es_client.search( + index='ranczo_video_frames', + size=0, + aggs={ + 'objects_nested': { + 'nested': {'path': 'detected_objects'}, + 'aggs': { + 'object_classes': { + 'terms': {'field': 'detected_objects.class', 'size': 1000}, + }, + }, + }, + }, + ) + buckets = result['aggregations']['objects_nested']['object_classes']['buckets'] + return [(b['key'], b['doc_count']) for b in buckets] + + @staticmethod + async def search_episode_name( + es_client: AsyncElasticsearch, + query: str, + season: 
Optional[int]=None, + limit: int=20, + ) -> Dict[str, Any]: + must_clauses = [ + {'multi_match': {'query': query, 'fields': ['title^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, + ] + if season is not None: + must_clauses.append({'term': {'episode_metadata.season': season}}) + query_body = {'bool': {'must': must_clauses}} + return await es_client.search( + index='ranczo_episode_names', + query=query_body, + size=limit, + _source=['episode_id', 'title', 'video_path', 'episode_metadata'], + ) + + async def search_episode_name_semantic( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + filter_clauses = [] + if season is not None: + filter_clauses.append({'term': {'episode_metadata.season': season}}) + knn_query: Dict[str, Any] = { + 'field': 'title_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, + } + if filter_clauses: + knn_query['filter'] = filter_clauses + return await es_client.search( + index='ranczo_episode_names', + knn=knn_query, + size=limit, + _source=['episode_id', 'title', 'video_path', 'episode_metadata'], + ) + + @staticmethod + async def get_stats(es_client: AsyncElasticsearch) -> Dict[str, int]: + return { + 'segments': (await es_client.count(index='ranczo_segments'))['count'], + 'text_embeddings': (await es_client.count(index='ranczo_text_embeddings'))['count'], + 'video_embeddings': (await es_client.count(index='ranczo_video_frames'))['count'], + 'episode_names': (await es_client.count(index='ranczo_episode_names'))['count'], + } diff --git a/preprocessor/search/embedding_service.py b/preprocessor/modules/search/clients/embedding_service.py similarity index 62% rename from preprocessor/search/embedding_service.py rename to preprocessor/modules/search/clients/embedding_service.py index 0b757e179..0cab1a81a 100644 --- a/preprocessor/search/embedding_service.py +++ 
b/preprocessor/modules/search/clients/embedding_service.py @@ -18,6 +18,7 @@ class EmbeddingService: + def __init__(self) -> None: self._model: Optional[AutoModelForVision2Seq] = None self._processor: Optional[AutoProcessor] = None @@ -25,75 +26,38 @@ def __init__(self) -> None: def _load_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: if self._model is not None: - return self._model, self._processor, self._device - - click.echo("Loading embedding model...", err=True) + return (self._model, self._processor, self._device) + click.echo('Loading embedding model...', err=True) if not torch.cuda.is_available(): - raise RuntimeError("CUDA is required but not available. This pipeline requires GPU.") - + raise RuntimeError('CUDA is required but not available. This pipeline requires GPU.') model_name = settings.embedding_model.model_name - self._device = "cuda" - - self._model = AutoModelForVision2Seq.from_pretrained( - model_name, - dtype=torch.bfloat16, - device_map="auto", - ) + self._device = 'cuda' + self._model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') self._processor = AutoProcessor.from_pretrained(model_name) - - click.echo(f"Model loaded on {self._device}", err=True) - return self._model, self._processor, self._device + click.echo(f'Model loaded on {self._device}', err=True) + return (self._model, self._processor, self._device) def get_text_embedding(self, text: str) -> List[float]: model, processor, device = self._load_model() - - messages = [{ - "role": "user", - "content": [{"type": "text", "text": text}], - }] - - text_inputs = processor.apply_chat_template( - messages, - add_generation_prompt=True, - tokenize=True, - return_tensors="pt", - ).to(device) - + messages = [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] + text_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors='pt').to(device) with torch.no_grad(): output = 
model(input_ids=text_inputs, output_hidden_states=True) embedding = output.hidden_states[-1][:, -1, :].squeeze(0) embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - return embedding.float().cpu().numpy().tolist() def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: model, processor, device = self._load_model() - - messages = [{ - "role": "user", - "content": [ - {"type": "image", "image": str(image_path)}, - {"type": "text", "text": "Describe this image."}, - ], - }] - + messages = [{'role': 'user', 'content': [{'type': 'image', 'image': str(image_path)}, {'type': 'text', 'text': 'Describe this image.'}]}] text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) image_inputs, video_inputs = process_vision_info(messages) - - inputs = processor( - text=[text], - images=image_inputs, - videos=video_inputs, - padding=True, - return_tensors="pt", - ) + inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt') inputs = inputs.to(device) - with torch.no_grad(): output = model(**inputs, output_hidden_states=True) embedding = output.hidden_states[-1][:, -1, :].squeeze(0) embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - return embedding.float().cpu().numpy().tolist() def cleanup(self) -> None: diff --git a/preprocessor/search/hash_service.py b/preprocessor/modules/search/clients/hash_service.py similarity index 60% rename from preprocessor/search/hash_service.py rename to preprocessor/modules/search/clients/hash_service.py index 8be829556..c2b03af81 100644 --- a/preprocessor/search/hash_service.py +++ b/preprocessor/modules/search/clients/hash_service.py @@ -8,29 +8,28 @@ import click import torch -from preprocessor.utils.image_hasher import PerceptualHasher +from preprocessor.lib.video.image_hasher import PerceptualHasher class HashService: + def __init__(self) -> None: self._hasher: Optional[PerceptualHasher] = None def 
_load_hasher(self) -> PerceptualHasher: if self._hasher is not None: return self._hasher - - click.echo("Loading perceptual hasher...", err=True) + click.echo('Loading perceptual hasher...', err=True) if not torch.cuda.is_available(): - raise RuntimeError("CUDA is required but not available. This pipeline requires GPU.") - - self._hasher = PerceptualHasher(device="cuda", hash_size=8) - click.echo("Hasher loaded on cuda", err=True) + raise RuntimeError('CUDA is required but not available. This pipeline requires GPU.') + self._hasher = PerceptualHasher(device='cuda', hash_size=8) # pylint: disable=unexpected-keyword-arg + click.echo('Hasher loaded on cuda', err=True) return self._hasher def get_perceptual_hash(self, image_path: Union[str, Path]) -> Optional[str]: hasher = self._load_hasher() - image = Image.open(image_path).convert("RGB") - hashes = hasher.compute_phash_batch([image]) + image = Image.open(image_path).convert('RGB') + hashes = hasher.compute_phash_batch([image]) # pylint: disable=no-member return hashes[0] if hashes else None def cleanup(self) -> None: diff --git a/preprocessor/modules/search/clients/result_formatters.py b/preprocessor/modules/search/clients/result_formatters.py new file mode 100644 index 000000000..a44c13a2f --- /dev/null +++ b/preprocessor/modules/search/clients/result_formatters.py @@ -0,0 +1,102 @@ +from typing import ( + Any, + Dict, + Optional, +) + +import click + +from preprocessor.config.types import ( + ElasticsearchAggregationKeys, + ElasticsearchKeys, + EpisodeMetadataKeys, +) + + +class ResultFormatter: + + @staticmethod + def format_timestamp(seconds: float) -> str: + minutes = int(seconds // 60) + secs = seconds % 60 + return f'{minutes}m {secs:.1f}s' + + @staticmethod + def _format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: + if not scene_info: + return '' + start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0)) + end = 
ResultFormatter.format_timestamp(scene_info.get('scene_end_time', 0)) + return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" + + @staticmethod + def __format_character_appearances(appearances: list) -> str: + """Format character appearances with emotions.""" + chars_strs = [] + for char in appearances: + char_str = char.get('name', 'Unknown') + if char.get('emotion'): + emotion_label = char['emotion'].get('label', '?') + emotion_conf = char['emotion'].get('confidence', 0) + char_str += f' ({emotion_label} {emotion_conf:.2f})' + chars_strs.append(char_str) + return ', '.join(chars_strs) + + @staticmethod + def __format_detected_objects(objects: list) -> str: + """Format detected objects list.""" + return ', '.join([f"{obj['class']}:{obj['count']}" for obj in objects]) + + @staticmethod + def __print_text_result(source: Dict[str, Any], scene_ctx: str) -> None: + """Print text search result.""" + click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") + start_time = ResultFormatter.format_timestamp(source['start_time']) + end_time = ResultFormatter.format_timestamp(source['end_time']) + click.echo(f'Time: {start_time} - {end_time}{scene_ctx}') + click.echo(f"Speaker: {source.get('speaker', 'N/A')}") + click.echo(f"Text: {source['text']}") + + @staticmethod + def __print_video_result(source: Dict[str, Any], scene_ctx: str) -> None: + """Print video/frame search result.""" + timestamp = ResultFormatter.format_timestamp(source['timestamp']) + click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}") + if 'frame_type' in source: + click.echo(f"Type: {source['frame_type']}") + if 'scene_number' in source: + click.echo(f"Scene number: {source['scene_number']}") + if 'perceptual_hash' in source: + click.echo(f"Hash: {source['perceptual_hash']}") + if source.get('character_appearances'): + chars = ResultFormatter.__format_character_appearances(source['character_appearances']) + click.echo(f"Characters: {chars}") + if 
source.get('detected_objects'): + objects = ResultFormatter.__format_detected_objects(source['detected_objects']) + click.echo(f'Objects: {objects}') + + @staticmethod + def print_results(result: Dict[str, Any], result_type: str='text') -> None: + total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] + hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] + click.echo(f'\nZnaleziono: {total} wynikow') + click.echo('=' * 80) + for i, hit in enumerate(hits, 1): + source = hit[ElasticsearchKeys.SOURCE] + score = hit[ElasticsearchKeys.SCORE] + meta = source[EpisodeMetadataKeys.EPISODE_METADATA] + scene_ctx = ResultFormatter._format_scene_context(source.get('scene_info')) + click.echo(f'\n[{i}] Score: {score:.2f}') + season_code = 'S00' if meta['season'] == 0 else f"S{meta['season']:02d}" + click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") + if result_type == 'text': + ResultFormatter.__print_text_result(source, scene_ctx) + elif result_type == 'text_semantic': + click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") + click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") + click.echo(f"Text: {source['text']}") + elif result_type == 'episode_name': + click.echo(f"Episode Title: {source.get('title', 'N/A')}") + else: + ResultFormatter.__print_video_result(source, scene_ctx) + click.echo(f"Path: {source['video_path']}") diff --git a/preprocessor/modules/search/document_generation.py b/preprocessor/modules/search/document_generation.py new file mode 100644 index 000000000..f15310e6a --- /dev/null +++ b/preprocessor/modules/search/document_generation.py @@ -0,0 +1,88 @@ +import json +from pathlib import Path +from typing import ( + Any, + Dict, +) + +from preprocessor.config.step_configs import DocumentGenerationConfig +from preprocessor.core.artifacts import ( + Artifact, + ElasticDocuments, +) +from 
preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.io.files import load_json + + +class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): + + @property + def name(self) -> str: + return 'document_generation' + + def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDocuments: + if not hasattr(input_data, 'episode_info'): + raise ValueError('Input artifact must have episode_info') + episode_info = getattr(input_data, 'episode_info') + episode_id = getattr(input_data, 'episode_id') + output_dir = context.get_output_path(episode_info, 'elastic_documents', '') + if output_dir.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, episode_id): + return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=0) + context.logger.info(f'Generating Elasticsearch documents for {episode_id}') + context.mark_step_started(self.name, episode_id) + data = self._gather_input_data(episode_info, context) + generated_files = [] + total_docs = 0 + if self.config.generate_segments and 'transcription' in data: + path, count = self._generate_segments_jsonl(data, episode_info, context) + generated_files.append(path) + total_docs += count + context.mark_step_completed(self.name, episode_id) + return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=total_docs) + + @staticmethod + def _gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: + data = {} + clean_filename = f'{context.series_name}_{episode_info.episode_code()}_clean_transcription.json' + clean_path = context.get_output_path(episode_info, 'transcriptions/clean', clean_filename) + if clean_path.exists(): + data['transcription'] = load_json(clean_path) + text_emb_filename = f'{context.series_name}_{episode_info.episode_code()}_embeddings_text.json' 
+ text_emb_path = context.get_output_path(episode_info, 'embeddings', text_emb_filename) + if text_emb_path.exists(): + data['text_embeddings'] = load_json(text_emb_path) + scene_filename = f'{context.series_name}_{episode_info.episode_code()}_scenes.json' + scene_path = context.get_output_path(episode_info, 'scene_timestamps', scene_filename) + if scene_path.exists(): + data['scenes'] = load_json(scene_path) + return data + + def _generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext) -> tuple[Path, int]: + output_filename = f'{context.series_name}_{episode_info.episode_code()}_text_segments.jsonl' + output_path = context.get_output_path(episode_info, 'elastic_documents/text_segments', output_filename) + segments = data['transcription'].get('segments', []) + episode_metadata = self._build_episode_metadata(episode_info, context) + filename = f'{context.series_name}_{episode_info.episode_code()}.mp4' + video_bot_path = f'bot/{context.series_name.upper()}-WIDEO/{episode_info.season_code()}/{filename}' + count = 0 + with open(output_path, 'w', encoding='utf-8') as f: + for i, segment in enumerate(segments): + doc = { + 'episode_id': episode_info.episode_code(), + 'episode_metadata': episode_metadata, + 'segment_id': i, + 'text': segment.get('text', '').strip(), + 'start_time': segment.get('start', 0.0), + 'end_time': segment.get('end', 0.0), + 'speaker': segment.get('speaker', 'unknown'), + 'video_path': video_bot_path, + } + f.write(json.dumps(doc, ensure_ascii=False) + '\n') + count += 1 + return (output_path, count) + + @staticmethod + def _build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: + return {'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name} diff --git a/preprocessor/modules/search/indexing.py b/preprocessor/modules/search/indexing.py new file mode 100644 index 000000000..c96780ae8 --- /dev/null +++ 
b/preprocessor/modules/search/indexing.py @@ -0,0 +1,116 @@ +import asyncio +import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.step_configs import ElasticsearchConfig +from preprocessor.core.artifacts import ( + ElasticDocuments, + IndexingResult, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.search.elasticsearch import ElasticsearchWrapper + + +class ElasticsearchIndexerStep(PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig]): + + def __init__(self, config: ElasticsearchConfig) -> None: + super().__init__(config) + self._es: Optional[ElasticsearchWrapper] = None + + @property + def name(self) -> str: + return 'elasticsearch_indexing' + + def execute(self, input_data: List[ElasticDocuments], context: ExecutionContext) -> IndexingResult: + return asyncio.run(self._execute_async(input_data, context)) + + async def _execute_async( + self, + input_data: List[ElasticDocuments], + context: ExecutionContext, + ) -> IndexingResult: + if not input_data: + context.logger.warning('No documents to index.') + return IndexingResult( + index_name=self.config.index_name, + document_count=0, + success=True, + ) + + docs_by_type: Dict[str, List[Path]] = {} + for doc_artifact in input_data: + doc_type: str = doc_artifact.path.parent.name + if doc_type not in docs_by_type: + docs_by_type[doc_type] = [] + docs_by_type[doc_type].append(doc_artifact.path) + + total_indexed: int = 0 + for doc_type, paths in docs_by_type.items(): + index_name: str = f'{self.config.index_name}_{doc_type}' + context.logger.info(f'Indexing {len(paths)} files into {index_name}') + + if self._es is None or self._es.index_name != index_name: + if self._es is not None: + await self._es.close() + self._es = ElasticsearchWrapper( + index_name=index_name, + host=self.config.host, + dry_run=self.config.dry_run, + ) + + try: + if 
not self.config.append: + await self._es.delete_index() + + mapping: Optional[Dict[str, Any]] = self._get_mapping_for_type(doc_type) + if mapping: + await self._es.create_index(mapping) + + documents: List[Dict[str, Any]] = [] + for path in paths: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + documents.append(json.loads(line)) + + if documents: + if not self.config.dry_run: + await self._es.bulk_index(documents) + total_indexed += len(documents) + else: + context.logger.info( + f'Dry-run: would index {len(documents)} docs to {index_name}', + ) + except Exception as e: + context.logger.error(f'Elasticsearch indexing failed for {index_name}: {e}') + return IndexingResult( + index_name=self.config.index_name, + document_count=total_indexed, + success=False, + ) + + return IndexingResult( + index_name=self.config.index_name, + document_count=total_indexed, + success=True, + ) + + @staticmethod + def _get_mapping_for_type( + doc_type: str, # pylint: disable=unused-argument + ) -> Optional[Dict[str, Any]]: + """Get Elasticsearch mapping for document type.""" + # TODO: Load mappings from config or separate file # pylint: disable=fixme + return None + + def cleanup(self) -> None: + if self._es: + asyncio.run(self._es.close()) + self._es = None diff --git a/preprocessor/modules/text/__init__.py b/preprocessor/modules/text/__init__.py new file mode 100644 index 000000000..551160158 --- /dev/null +++ b/preprocessor/modules/text/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.modules.text.analysis import TextAnalysisStep +from preprocessor.modules.text.embeddings import TextEmbeddingStep +from preprocessor.modules.text.import_step import TranscriptionImportStep +from preprocessor.modules.text.transcription import TranscriptionStep + +__all__ = ['TextAnalysisStep', 'TextEmbeddingStep', 'TranscriptionImportStep', 'TranscriptionStep'] diff --git a/preprocessor/modules/text/analysis.py b/preprocessor/modules/text/analysis.py new file mode 
100644 index 000000000..88a28a941 --- /dev/null +++ b/preprocessor/modules/text/analysis.py @@ -0,0 +1,50 @@ +from datetime import datetime + +from preprocessor.config.step_configs import TextAnalysisConfig +from preprocessor.core.artifacts import ( + TextAnalysisResults, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.io.files import ( + atomic_write_json, + load_json, +) +from preprocessor.lib.text.text_statistics import TextStatistics + + +class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): + + @property + def name(self) -> str: + return 'text_analysis' + + def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> TextAnalysisResults: + output_filename = input_data.path.stem + '_text_stats.json' + output_path = input_data.path.parent / output_filename + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached)') + stats_data = load_json(output_path) + return TextAnalysisResults(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path, statistics=stats_data) + context.logger.info(f'Analyzing text for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + txt_path = input_data.path + if input_data.format != 'txt': + txt_path = input_data.path.with_suffix('.txt') + if not txt_path.exists(): + raise FileNotFoundError(f'Transcription text file not found: {txt_path}') + stats = TextStatistics.from_file(txt_path, language=self.config.language) + result_data = { + 'metadata': { + 'episode_id': input_data.episode_id, + 'language': self.config.language, + 'source_file': txt_path.name, + 'analyzed_at': datetime.now().isoformat(), + }, + **stats.to_dict(), + } + atomic_write_json(output_path, result_data) + 
def execute(  # pylint: disable=too-many-locals
    self,
    input_data: TranscriptionData,
    context: ExecutionContext,
) -> EmbeddingCollection:
    """Embed overlapping sentence chunks of an episode transcription.

    The segment texts are joined, split into sentences, grouped into chunks
    of ``text_sentences_per_chunk`` sentences that advance by
    ``text_sentences_per_chunk - text_chunk_overlap`` sentences, and encoded
    in batches of ``batch_size``. The result is written to
    ``<series>_<episode_code>_embeddings_text.json``.

    Args:
        input_data: Transcription artifact of the episode.
        context: Execution context (paths, logger, step state).

    Returns:
        An embedding collection describing the written (or cached) file.

    Raises:
        ValueError: if the configured overlap is not smaller than the number
            of sentences per chunk (the chunk window would never advance).
    """
    episode_code = input_data.episode_info.episode_code()
    output_filename: str = f'{context.series_name}_{episode_code}_embeddings_text.json'
    output_path: Path = context.get_output_path(
        input_data.episode_info,
        'embeddings',
        output_filename,
    )
    if output_path.exists() and (not context.force_rerun):
        if context.is_step_completed(self.name, input_data.episode_id):
            context.logger.info(
                f'Skipping {input_data.episode_id} (cached text embeddings)',
            )
            emb_data: Dict[str, Any] = load_json(output_path)
            # The file is written below with results_key='text_embeddings',
            # so count that list; 'results' is kept as a fallback for files
            # that may have been produced with the older key.
            cached_results = emb_data.get('text_embeddings')
            if cached_results is None:
                cached_results = emb_data.get('results', [])
            return self._create_embedding_collection(
                input_data,
                output_path,
                len(cached_results),
            )
    transcription: Dict[str, Any] = self._load_clean_transcription(input_data, context)
    segments: List[Dict[str, Any]] = transcription.get('segments', [])
    if not segments:
        context.logger.warning(f'No text segments for embedding in {input_data.episode_id}')
        return self._create_embedding_collection(input_data, output_path, 0)

    sentences_per_chunk: int = self.config.text_sentences_per_chunk
    step: int = sentences_per_chunk - self.config.text_chunk_overlap
    if step <= 0:
        # A zero step makes range() raise an opaque error and a negative one
        # silently yields no chunks; fail with an actionable message instead.
        raise ValueError(
            'text_chunk_overlap must be smaller than text_sentences_per_chunk '
            f'(got overlap={self.config.text_chunk_overlap}, '
            f'sentences_per_chunk={sentences_per_chunk})',
        )

    if self._model is None:
        # Lazily created so that cached episodes never load the model.
        self._model = EmbeddingModelWrapper(
            self.config.model_name,
            self.config.device,
            self.config.batch_size,
        )
    context.logger.info(f'Generating text embeddings for {input_data.episode_id}')
    context.mark_step_started(self.name, input_data.episode_id)

    full_text: str = ' '.join(seg.get('text', '') for seg in segments)
    sentences: List[str] = self._split_into_sentences(full_text)

    # sentence_offsets[i] is the character offset of sentence i when the
    # sentences are joined by single spaces (prefix sums instead of
    # re-summing all preceding sentences for every chunk).
    sentence_offsets: List[int] = [0]
    for sentence in sentences:
        sentence_offsets.append(sentence_offsets[-1] + len(sentence) + 1)

    text_chunks: List[str] = []
    chunk_metadata: List[Dict[str, Any]] = []
    for i in range(0, len(sentences), step):
        chunk_text: str = ' '.join(sentences[i:i + sentences_per_chunk]).strip()
        if not chunk_text:
            continue
        char_start: int = sentence_offsets[i]
        char_end: int = char_start + len(chunk_text)
        # NOTE(review): offsets are measured on the re-joined sentences, not
        # on full_text, so the segment mapping is approximate - confirm this
        # is acceptable for consumers of 'segment_range'.
        start_seg_id: int = self._find_segment_at_position(segments, char_start)
        end_seg_id: int = self._find_segment_at_position(segments, char_end)
        text_chunks.append(chunk_text)
        chunk_metadata.append({'segment_range': [start_seg_id, end_seg_id], 'text': chunk_text})

    results: List[Dict[str, Any]] = []
    for i in range(0, len(text_chunks), self.config.batch_size):
        batch_texts: List[str] = text_chunks[i:i + self.config.batch_size]
        batch_meta: List[Dict[str, Any]] = chunk_metadata[i:i + self.config.batch_size]
        batch_embeddings: List[List[float]] = self._model.encode_text(batch_texts)
        for meta, emb in zip(batch_meta, batch_embeddings):
            results.append({**meta, 'embedding': emb})

    output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata(
        episode_info=input_data.episode_info,
        processing_params=self.config.dict(),
        statistics={
            'total_embeddings': len(results),
            'embedding_dimension': len(results[0]['embedding']) if results else 0,
        },
        results_key='text_embeddings',
        results_data=results,
    )
    atomic_write_json(output_path, output_data)
    context.mark_step_completed(self.name, input_data.episode_id)
    return self._create_embedding_collection(input_data, output_path, len(results))
+ return result + + @staticmethod + def _find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: + cumulative_length: int = 0 + for idx, seg in enumerate(segments): + seg_length: int = len(seg.get('text', '')) + 1 + if cumulative_length <= char_pos < cumulative_length + seg_length: + return idx + cumulative_length += seg_length + return len(segments) - 1 if segments else 0 + + def cleanup(self) -> None: + if self._model: + self._model = None diff --git a/preprocessor/modules/text/import_step.py b/preprocessor/modules/text/import_step.py new file mode 100644 index 000000000..3f2c13d0c --- /dev/null +++ b/preprocessor/modules/text/import_step.py @@ -0,0 +1,154 @@ +import json +from pathlib import Path +import re +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Optional, + Tuple, +) + +from preprocessor.config.step_configs import TranscriptionImportConfig +from preprocessor.core.artifacts import TranscriptionData +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.episodes.episode_manager import EpisodeManager + +if TYPE_CHECKING: + from preprocessor.lib.episodes.episode_manager import EpisodeInfo + +class TranscriptionImportStep(PipelineStep[None, List[TranscriptionData], TranscriptionImportConfig]): + + def __init__(self, config: TranscriptionImportConfig) -> None: + super().__init__(config) + self._episode_manager: Optional[EpisodeManager] = None + + @property + def name(self) -> str: + return 'transcription_import' + + def execute(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: + if self._episode_manager is None: + self._episode_manager = EpisodeManager(None, context.series_name, context.logger) + json_files: List[Path] = self._find_transcription_files() + if not json_files: + context.logger.warning(f'No transcription files found in {self.config.source_dir}') + return [] + context.logger.info(f'Found 
{len(json_files)} transcription files to import') + results: List[TranscriptionData] = [] + for json_file in json_files: + try: + artifact: Optional[TranscriptionData] = self._import_single_file(json_file, context) + if artifact: + results.append(artifact) + except Exception as e: + context.logger.error(f'Failed to import {json_file.name}: {e}') + return results + + def _find_transcription_files(self) -> List[Path]: + pattern: str = '*.json' + if self.config.format_type == '11labs_segmented': + pattern = '*_segmented.json' + files: List[Path] = sorted(self.config.source_dir.rglob(pattern)) + return [f for f in files if not f.name.startswith('.')] + + def _import_single_file(self, json_file: Path, context: ExecutionContext) -> Optional[TranscriptionData]: + episode_info: Optional['EpisodeInfo'] = self._episode_manager.parse_filename(json_file) + if not episode_info: + season_num, episode_num = self._extract_season_episode_fallback(json_file) + episode_info = self._episode_manager.get_episode_by_season_and_relative(season_num, episode_num) + if not episode_info: + context.logger.warning(f'Could not determine episode for {json_file}') + return None + episode_id: str = self._episode_manager.get_episode_id_for_state(episode_info) + output_filename: str = self._episode_manager.path_manager.build_filename(episode_info, extension='json') + output_path: Path = context.get_output_path(episode_info, 'transcriptions', output_filename) + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, episode_id): + context.logger.info(f'Skipping {episode_id} (cached)') + return TranscriptionData(episode_id=episode_id, episode_info=episode_info, path=output_path, language='pl', model='11labs', format='json') + context.logger.info(f'Importing {episode_id} from {json_file.name}') + context.mark_step_started(self.name, episode_id) + with open(json_file, 'r', encoding='utf-8') as f: + source_data: Dict[str, Any] = json.load(f) + if 
self.config.format_type == '11labs_segmented': + converted_data: Dict[str, Any] = self._convert_11labs_segmented(source_data, json_file) + elif self.config.format_type == '11labs': + converted_data = self._convert_11labs_full(source_data, json_file) + else: + raise ValueError(f'Unknown format type: {self.config.format_type}') + converted_data['episode_info'] = EpisodeManager.get_metadata(episode_info) + output_path.parent.mkdir(parents=True, exist_ok=True) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(converted_data, f, indent=2, ensure_ascii=False) + context.mark_step_completed(self.name, episode_id) + return TranscriptionData( + episode_id=episode_id, + episode_info=episode_info, + path=output_path, + language=converted_data.get('transcription', {}).get('language_code', 'pl'), + model=converted_data.get('transcription', {}).get('format', '11labs'), + format='json', + ) + + @staticmethod + def _convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + segments: List[Dict[str, Any]] = [] + for i, segment in enumerate(data.get('segments', [])): + converted_segment: Dict[str, Any] = { + 'id': i, + 'start': segment.get('start'), + 'end': segment.get('end'), + 'text': segment.get('text', ''), + 'speaker': segment.get('speaker', 'unknown'), + 'words': segment.get('words', []), + } + segments.append(converted_segment) + return { + 'transcription': {'format': '11labs_segmented', 'source_file': source_file.name, 'segments': segments}, + 'segments': segments, + } + + @staticmethod + def _convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + segments: List[Dict[str, Any]] = [] + words: List[Dict[str, Any]] = data.get('words', []) + current_segment: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} + for word in words: + if current_segment['start'] is None: + current_segment['start'] = word.get('start') + current_segment['words'].append(word) + 
current_segment['end'] = word.get('end') + if word.get('text', '').endswith(('.', '!', '?')) or len(current_segment['words']) >= 20: + current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) + segments.append(dict(current_segment)) + current_segment = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': word.get('speaker_id', 'unknown')} + if current_segment['words']: + current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) + segments.append(current_segment) + for i, seg in enumerate(segments): + seg['id'] = i + return { + 'transcription': { + 'format': '11labs', + 'source_file': source_file.name, + 'language_code': data.get('language_code', 'pol'), + 'language_probability': data.get('language_probability', 1.0), + }, + 'segments': segments, + } + + @staticmethod + def _extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: + match: Optional[re.Match] = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) + if match: + return (int(match.group(1)), int(match.group(2))) + parent_match: Optional[re.Match] = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) + if parent_match: + season: int = int(parent_match.group(1)) + episode_match: Optional[re.Match] = re.search('E(\\d+)', file_path.name, re.IGNORECASE) + if episode_match: + return (season, int(episode_match.group(1))) + return (1, 1) diff --git a/preprocessor/modules/text/transcription.py b/preprocessor/modules/text/transcription.py new file mode 100644 index 000000000..ed2a976ed --- /dev/null +++ b/preprocessor/modules/text/transcription.py @@ -0,0 +1,90 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + Optional, +) + +from preprocessor.config.step_configs import WhisperTranscriptionConfig +from preprocessor.core.artifacts import ( + AudioArtifact, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from 
def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData:
    """Transcribe the episode audio with Whisper and store the raw JSON.

    The output goes to ``raw/<series>_<episode_code>.json`` inside the
    episode's ``transcriptions`` directory. An existing file is reused when
    a rerun is not forced and the step is recorded as completed. On any
    failure the error is logged, an existing output file is removed and the
    exception is re-raised.
    """
    episode_id = input_data.episode_id
    episode_info = input_data.episode_info

    output_filename: str = (
        f'{context.series_name}_{episode_info.episode_code()}.json'
    )
    target: Path = context.get_output_path(
        episode_info,
        'transcriptions',
        f'raw/{output_filename}',
    )

    def as_artifact(language: Any) -> TranscriptionData:
        return TranscriptionData(
            episode_id=episode_id,
            episode_info=episode_info,
            path=target,
            language=language,
            model=self.config.model,
            format='json',
        )

    may_reuse = target.exists() and (not context.force_rerun)
    if may_reuse and context.is_step_completed(self.name, episode_id):
        context.logger.info(f'Skipping {episode_id} (cached transcription)')
        return as_artifact(self.config.language)

    # The Whisper wrapper is created on first real use only.
    if self._whisper is None:
        self._whisper = Whisper(
            model=self.config.model,
            language=self.config.language,
            device=self.config.device,
            beam_size=self.config.beam_size,
        )

    context.logger.info(
        f'Transcribing {episode_id} using Whisper {self.config.model}',
    )
    context.mark_step_started(self.name, episode_id)

    try:
        transcript: Dict[str, Any] = self._whisper.transcribe(input_data.path)
        transcript['episode_info'] = EpisodeManager.get_metadata(episode_info)
        atomic_write_json(target, transcript)
    except Exception as e:
        context.logger.error(
            f'Whisper transcription failed for {episode_id}: {e}',
        )
        if target.exists():
            target.unlink()
        raise

    context.mark_step_completed(self.name, episode_id)
    return as_artifact(transcript.get('language', self.config.language))
def __validate_transcription(self):
    """Check the episode's transcription files and run the per-file validators.

    Six candidate files are derived from ``<series>_<episode_code>``: three
    raw variants, the clean JSON and txt, and the sound-events JSON. If none
    of them exists a single error is recorded and nothing else is checked.
    """
    subdirs = settings.output_subdirs
    episode_root = PathManager(self.series_name).get_episode_dir(self.episode_info, subdirs.transcriptions)
    stem = f'{self.series_name}_{self.episode_info.episode_code()}'

    raw_root = episode_root / subdirs.transcription_subdirs.raw
    clean_root = episode_root / subdirs.transcription_subdirs.clean
    events_root = episode_root / subdirs.transcription_subdirs.sound_events

    candidates = {
        'main': raw_root / (stem + '.json'),
        'segmented': raw_root / (stem + '_segmented.json'),
        'simple': raw_root / (stem + '_simple.json'),
        'clean': clean_root / (stem + '_clean_transcription.json'),
        'clean_txt': clean_root / (stem + '_clean_transcription.txt'),
        'sound_events': events_root / (stem + '_sound_events.json'),
    }

    found_any = False
    for candidate in candidates.values():
        if candidate.exists():
            found_any = True
            break
    if not found_any:
        self.errors.append('No transcription files found in any format')
        return

    self.__validate_raw_transcription(candidates)
    self.__validate_clean_transcription(candidates['clean'])
    self.__validate_clean_txt(candidates['clean_txt'])
    self.__validate_sound_events(candidates['sound_events'])
def __validate_exported_frames(self):
    """Validate the exported frame images and collect frame statistics.

    Records the number of frame files, the total size of the valid ones
    (rounded to two decimals) and their most common resolution. A missing
    directory or an empty one yields a warning; every invalid frame yields
    an error plus one summary warning.
    """
    # Local import keeps this block self-contained.
    from collections import Counter  # pylint: disable=import-outside-toplevel

    frames_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.frames)
    if not frames_dir.exists():
        self.warnings.append(f'Missing {settings.output_subdirs.frames} directory: {frames_dir}')
        return
    frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS['frame']))
    if not frame_files:
        self.warnings.append(f'No frames found in {settings.output_subdirs.frames}/')
        return
    self.exported_frames_count = len(frame_files)

    total_size = 0
    # Counting while iterating replaces max(set(...), key=list.count), which
    # rescanned the whole list for every distinct resolution.
    resolution_counts = Counter()
    invalid_count = 0
    for frame_file in frame_files:
        result = FileValidator.validate_image_file(frame_file)
        if result.is_valid:
            total_size += result.metadata['size_mb']
            resolution_counts[(result.metadata['width'], result.metadata['height'])] += 1
        else:
            invalid_count += 1
            self.errors.append(f'Invalid frame {frame_file.name}: {result.error_message}')
    if invalid_count > 0:
        self.warnings.append(f'{invalid_count} invalid frames found')
    self.exported_frames_total_size_mb = round(total_size, 2)
    if resolution_counts:
        # Despite the attribute name this is the most frequent resolution;
        # ties resolve to the resolution seen first in sorted file order.
        self.exported_frames_avg_resolution = resolution_counts.most_common(1)[0][0]
def __validate_face_clusters(self):
    """Validate the face-clustering metadata and count clusters and faces.

    Nothing is reported when the face-clusters directory is absent. The
    metadata file is the first ``*_face_clusters.json`` in sorted order, so
    the choice is stable when several files match. ``clusters`` may be a
    dict or a list of per-cluster dicts; faces listed under ``noise`` are
    added to the total.
    """
    clusters_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.face_clusters)
    if not clusters_dir.exists():
        return
    # glob() order is arbitrary; sort so the same file is picked every run.
    metadata_files = sorted(clusters_dir.glob('*_face_clusters.json'))
    metadata_file = metadata_files[0] if metadata_files else None
    if not metadata_file or not metadata_file.exists():
        self.warnings.append('Missing face clustering metadata file')
        return
    result = FileValidator.validate_json_file(metadata_file)
    if not result.is_valid:
        self.errors.append(f'Invalid face clustering metadata: {result.error_message}')
        return
    data = self._load_json_safely(metadata_file)
    # A non-object payload (e.g. a top-level list) cannot be inspected below.
    if not data or not isinstance(data, dict):
        self.errors.append(f'Error reading face clustering metadata: {metadata_file}')
        return

    clusters = data.get('clusters', {})
    if isinstance(clusters, dict):
        cluster_entries = list(clusters.values())
    elif isinstance(clusters, list):
        cluster_entries = clusters
    else:
        self.warnings.append('Unexpected clusters format in face clustering metadata')
        return
    self.face_clusters_count = len(cluster_entries)
    # Entries that are not dicts carry no face count; skip them rather than
    # aborting the whole validation with an AttributeError.
    total_faces = sum(
        entry.get('face_count', 0)
        for entry in cluster_entries
        if isinstance(entry, dict)
    )
    noise_info = data.get('noise', {})
    if isinstance(noise_info, dict) and noise_info:
        total_faces += noise_info.get('face_count', 0)
    self.face_clusters_total_faces = total_faces
encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if not line.strip(): + continue + doc = json.loads(line) + if embedding_field in doc: + embedding = doc[embedding_field] + if isinstance(embedding, list): + actual_dim = len(embedding) + if actual_dim != expected_dim: + error_msg = ( + f'{jsonl_file.name} line {line_num}: ' + f'{embedding_field} has {actual_dim} dimensions, ' + f'expected {expected_dim}' + ) + self.errors.append(error_msg) + return + except Exception as e: + self.errors.append(f'Error validating embeddings in {jsonl_file.name}: {e}') + + def __check_size_anomalies( + self, sizes: List[int], folder_name: str, threshold: float = 0.2, + ): + if len(sizes) < 2: + return + avg_size = sum(sizes) / len(sizes) + if avg_size == 0: + return + for i, size in enumerate(sizes): + deviation = abs(size - avg_size) / avg_size + if deviation > threshold: + warning_msg = ( + f'{folder_name} file #{i + 1} size deviation: ' + f'{deviation * 100:.1f}% from average' + ) + self.warnings.append(warning_msg) + + def __validate_other_files(self): + char_detections_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.character_detections) + detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] + if detections_file.exists(): + result = FileValidator.validate_json_file(detections_file) + if not result.is_valid: + self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") + embeddings_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.embeddings) + if embeddings_dir.exists(): + embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] + if embeddings_file.exists(): + result = FileValidator.validate_json_file(embeddings_file) + if not result.is_valid: + self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}: {result.error_message}") + elastic_subdirs = [ + ELASTIC_SUBDIRS.text_segments, + 
ELASTIC_SUBDIRS.text_embeddings, + ELASTIC_SUBDIRS.video_frames, + ELASTIC_SUBDIRS.episode_names, + ELASTIC_SUBDIRS.text_statistics, + ELASTIC_SUBDIRS.full_episode_embeddings, + ELASTIC_SUBDIRS.sound_events, + ELASTIC_SUBDIRS.sound_event_embeddings, + ] + found_elastic_docs = False + for subdir in elastic_subdirs: + elastic_base = settings.output_subdirs.elastic_documents + elastic_docs_dir = PathManager(self.series_name).get_episode_dir( + self.episode_info, f'{elastic_base}/{subdir}', + ) + if elastic_docs_dir.exists(): + found_elastic_docs = True + for jsonl_file in elastic_docs_dir.glob('*.jsonl'): + result = FileValidator.validate_jsonl_file(jsonl_file) + if not result.is_valid: + self.errors.append(f'Invalid JSONL {jsonl_file.name}: {result.error_message}') + else: + self.__validate_embedding_dimensions(jsonl_file, subdir) + if not found_elastic_docs: + self.warnings.append(f'Missing {settings.output_subdirs.elastic_documents} directory') + transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.transcriptions) + if transcriptions_dir.exists(): + clean_subdir = settings.output_subdirs.transcription_subdirs.clean + clean_dir = transcriptions_dir / clean_subdir + filename = f'{self.series_name}_{self.episode_info.episode_code()}_text_stats.json' + text_stats_file = clean_dir / filename + if text_stats_file.exists(): + result = FileValidator.validate_json_file(text_stats_file) + if not result.is_valid: + self.errors.append(f'Invalid text_stats JSON: {result.error_message}') + else: + self.warnings.append(f'Missing text statistics file: {text_stats_file.name}') + + def to_dict(self) -> Dict[str, Any]: + return { + 'status': self.status, + 'errors': self.errors, + 'warnings': self.warnings, + 'stats': { + 'transcription_chars': self.transcription_chars, + 'transcription_duration': self.transcription_duration, + 'transcription_words': self.transcription_words, + 'exported_frames_count': 
self.exported_frames_count, + 'exported_frames_total_size_mb': self.exported_frames_total_size_mb, + 'exported_frames_avg_resolution': self.exported_frames_avg_resolution, + 'video_size_mb': self.video_size_mb, + 'video_duration': self.video_duration, + 'video_codec': self.video_codec, + 'video_resolution': self.video_resolution, + 'scenes_count': self.scenes_count, + 'scenes_avg_duration': self.scenes_avg_duration, + 'image_hashes_count': self.image_hashes_count, + 'character_visualizations_count': self.character_visualizations_count, + 'face_clusters_count': self.face_clusters_count, + 'face_clusters_total_faces': self.face_clusters_total_faces, + 'object_detections_count': self.object_detections_count, + 'object_visualizations_count': self.object_visualizations_count, + }, + } + + @staticmethod + def _validate_images_in_directory( + directory: Path, + extensions: Tuple[str, ...] = ('*.jpg', '*.png'), + ) -> Tuple[int, int, List[str]]: + if not directory.exists(): + return (0, 0, []) + image_files = [] + for ext in extensions: + image_files.extend(directory.glob(ext)) + if not image_files: + return (0, 0, []) + invalid_count = 0 + errors = [] + for img_file in image_files: + result = FileValidator.validate_image_file(img_file) + if not result.is_valid: + invalid_count += 1 + errors.append(f'Invalid image {img_file.name}: {result.error_message}') + return (len(image_files), invalid_count, errors) + + @staticmethod + def _validate_json_files_in_directory( + directory: Path, exclude_pattern: Optional[str] = None, + ) -> Tuple[int, List[int], List[str]]: + if not directory.exists(): + return (0, [], []) + json_files = [ + f for f in directory.glob('*.json') + if not exclude_pattern or exclude_pattern not in str(f) + ] + if not json_files: + return (0, [], []) + sizes = [] + errors = [] + for json_file in json_files: + result = FileValidator.validate_json_file(json_file) + if not result.is_valid: + errors.append(f'Invalid JSON {json_file.name}: 
{result.error_message}') + else: + sizes.append(json_file.stat().st_size) + return (len(json_files), sizes, errors) + + @staticmethod + def _load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None diff --git a/preprocessor/modules/validation/global_validator.py b/preprocessor/modules/validation/global_validator.py new file mode 100644 index 000000000..2db842626 --- /dev/null +++ b/preprocessor/modules/validation/global_validator.py @@ -0,0 +1,90 @@ +from pathlib import Path +from typing import List + +from preprocessor.lib.validation.base_result import BaseValidationResult +from preprocessor.lib.validation.file_validators import FileValidator + + +class GlobalValidationResult(BaseValidationResult): + pass + +class GlobalValidator: + + def __init__(self, series_name: str, base_output_dir: Path): + self.series_name = series_name + self.base_output_dir = base_output_dir + self.result = GlobalValidationResult() + + def validate(self) -> GlobalValidationResult: + self.__validate_main_json_files() + self.__validate_characters_folder() + self.__validate_processing_metadata() + return self.result + + def __validate_json_file(self, file_path: Path, stats_key: str): + if file_path.exists(): + result = FileValidator.validate_json_file(file_path) + if not result.is_valid: + self.result.errors.append(f'Invalid {file_path.name}: {result.error_message}') + else: + self.result.stats[stats_key] = True + else: + self.result.warnings.append(f'Missing {file_path.name}') + + def __validate_main_json_files(self): + episodes_file = self.base_output_dir / f'{self.series_name}_episodes.json' + self.__validate_json_file(episodes_file, 'episodes_json_valid') + characters_file = self.base_output_dir / f'{self.series_name}_characters.json' + self.__validate_json_file(characters_file, 'characters_json_valid') + + def __validate_characters_folder(self): + characters_dir = 
self.base_output_dir / 'characters' + if not characters_dir.exists(): + self.result.warnings.append('Missing characters/ directory') + return + character_folders = [d for d in characters_dir.iterdir() if d.is_dir()] + if not character_folders: + self.result.warnings.append('No character folders in characters/') + return + self.result.stats['character_folders_count'] = len(character_folders) + total_images = 0 + invalid_images = 0 + characters_without_images: List[str] = [] + for char_folder in character_folders: + image_files = self.__get_character_images(char_folder) + if not image_files: + characters_without_images.append(char_folder.name) + continue + total_images += len(image_files) + for img_file in image_files: + result = FileValidator.validate_image_file(img_file) + if not result.is_valid: + invalid_images += 1 + self.result.errors.append(f'Invalid character image {char_folder.name}/{img_file.name}: {result.error_message}') + self.result.stats['character_images_count'] = total_images + self.result.stats['invalid_character_images'] = invalid_images + if characters_without_images: + self.result.warnings.append(f'{len(characters_without_images)} characters without reference images') + + def __validate_processing_metadata(self): + metadata_dir = self.base_output_dir / 'processing_metadata' + if not metadata_dir.exists(): + self.result.warnings.append('Missing processing_metadata/ directory') + return + json_files = list(metadata_dir.glob('*.json')) + if not json_files: + self.result.warnings.append('No JSON files in processing_metadata/') + return + self.result.stats['processing_metadata_files'] = len(json_files) + for json_file in json_files: + result = FileValidator.validate_json_file(json_file) + if not result.is_valid: + self.result.errors.append(f'Invalid processing metadata {json_file.name}: {result.error_message}') + + @staticmethod + def __get_character_images(char_folder: Path) -> List[Path]: + extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp'] + 
image_files = [] + for ext in extensions: + image_files.extend(char_folder.glob(ext)) + return image_files diff --git a/preprocessor/validation/report_generator.py b/preprocessor/modules/validation/report_generator.py similarity index 58% rename from preprocessor/validation/report_generator.py rename to preprocessor/modules/validation/report_generator.py index d28d7421e..353f8a5b7 100644 --- a/preprocessor/validation/report_generator.py +++ b/preprocessor/modules/validation/report_generator.py @@ -6,11 +6,12 @@ Dict, ) -from preprocessor.validation.episode_stats import EpisodeStats -from preprocessor.validation.season_comparator import SeasonComparison +from preprocessor.modules.validation.episode_stats import EpisodeStats +from preprocessor.modules.validation.season_comparator import SeasonComparison class ReportGenerator: + def __init__(self, season: str, anomaly_threshold: float): self.season = season self.anomaly_threshold = anomaly_threshold @@ -23,19 +24,20 @@ def generate_report( output_path: Path, ): report = { - "validation_timestamp": self.timestamp, - "season": self.season, - "anomaly_threshold": self.anomaly_threshold, - "episodes": {episode_id: stats.to_dict() for episode_id, stats in episodes_stats.items()}, - "season_comparison": season_comparison.to_dict(), + 'validation_timestamp': self.timestamp, + 'season': self.season, + 'anomaly_threshold': self.anomaly_threshold, + 'episodes': { + episode_id: stats.to_dict() + for episode_id, stats in episodes_stats.items() + }, + 'season_comparison': season_comparison.to_dict(), } - self.__save_report(report, output_path) return report @staticmethod def __save_report(report: Dict[str, Any], output_path: Path): output_path.parent.mkdir(parents=True, exist_ok=True) - - with open(output_path, "w", encoding="utf-8") as f: + with open(output_path, 'w', encoding='utf-8') as f: json.dump(report, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/validation/season_comparator.py 
b/preprocessor/modules/validation/season_comparator.py similarity index 71% rename from preprocessor/validation/season_comparator.py rename to preprocessor/modules/validation/season_comparator.py index 4ac621d17..28d7c419a 100644 --- a/preprocessor/validation/season_comparator.py +++ b/preprocessor/modules/validation/season_comparator.py @@ -9,7 +9,7 @@ Optional, ) -from preprocessor.validation.episode_stats import EpisodeStats +from preprocessor.modules.validation.episode_stats import EpisodeStats @dataclass @@ -20,7 +20,6 @@ class MetricComparison: avg_value: Optional[float] difference_percent: Optional[float] - @dataclass class Anomaly: episode: str @@ -30,7 +29,6 @@ class Anomaly: deviation_percent: float severity: str - @dataclass class SeasonComparison: season: str @@ -40,41 +38,35 @@ class SeasonComparison: def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]): metric_keys = [ - "transcription_duration", - "transcription_chars", - "transcription_words", - "exported_frames_count", - "exported_frames_total_size_mb", - "video_size_mb", - "video_duration", - "scenes_count", + 'transcription_duration', + 'transcription_chars', + 'transcription_words', + 'exported_frames_count', + 'exported_frames_total_size_mb', + 'video_size_mb', + 'video_duration', + 'scenes_count', ] - for metric_key in metric_keys: self.__compare_metric(metric_key, episodes_stats) def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeStats]): values = [] episode_values = {} - for episode_id, stats in episodes_stats.items(): value = getattr(stats, metric_key, None) if value is not None: values.append(value) episode_values[episode_id] = value - if not values: return - min_val = min(values) max_val = max(values) avg_val = sum(values) / len(values) - if min_val > 0: - diff_percent = ((max_val - min_val) / min_val) * 100 + diff_percent = (max_val - min_val) / min_val * 100 else: diff_percent = 0.0 - self.metrics[metric_key] = MetricComparison( 
metric_name=metric_key, min_value=round(min_val, 2), @@ -82,15 +74,14 @@ def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeSta avg_value=round(avg_val, 2), difference_percent=round(diff_percent, 2), ) - for episode_id, value in episode_values.items(): if avg_val > 0: deviation_percent = abs((value - avg_val) / avg_val) * 100 else: deviation_percent = 0.0 - if deviation_percent > self.anomaly_threshold: - severity = "ERROR" if deviation_percent > self.anomaly_threshold * 2 else "WARNING" + threshold_doubled = self.anomaly_threshold * 2 + severity = 'ERROR' if deviation_percent > threshold_doubled else 'WARNING' self.anomalies.append( Anomaly( episode=episode_id, @@ -104,23 +95,23 @@ def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeSta def to_dict(self) -> Dict[str, Any]: return { - "metrics": { + 'metrics': { metric_name: { - "min": metric.min_value, - "max": metric.max_value, - "avg": metric.avg_value, - "difference_percent": metric.difference_percent, + 'min': metric.min_value, + 'max': metric.max_value, + 'avg': metric.avg_value, + 'difference_percent': metric.difference_percent, } for metric_name, metric in self.metrics.items() }, - "anomalies": [ + 'anomalies': [ { - "episode": anomaly.episode, - "metric": anomaly.metric, - "value": anomaly.value, - "avg": anomaly.avg, - "deviation_percent": anomaly.deviation_percent, - "severity": anomaly.severity, + 'episode': anomaly.episode, + 'metric': anomaly.metric, + 'value': anomaly.value, + 'avg': anomaly.avg, + 'deviation_percent': anomaly.deviation_percent, + 'severity': anomaly.severity, } for anomaly in self.anomalies ], diff --git a/preprocessor/modules/validation/validator.py b/preprocessor/modules/validation/validator.py new file mode 100644 index 000000000..98d79a978 --- /dev/null +++ b/preprocessor/modules/validation/validator.py @@ -0,0 +1,123 @@ +from datetime import datetime +from pathlib import Path +from typing import ( + Dict, + Optional, +) + +from 
rich.console import Console +from rich.progress import track + +from preprocessor.config.config import settings +from preprocessor.core.path_manager import PathManager +from preprocessor.lib.episodes import EpisodeManager +from preprocessor.lib.io.files import FileOperations +from preprocessor.modules.validation.episode_stats import EpisodeStats +from preprocessor.modules.validation.report_generator import ReportGenerator +from preprocessor.modules.validation.season_comparator import SeasonComparison + +console = Console() + +class Validator: + + def __init__( + self, + season: str, + series_name: str = 'ranczo', + anomaly_threshold: float = 20.0, + base_output_dir: Path = None, + episodes_info_json: Optional[Path] = None, + ): + self.season = season + self.series_name = series_name + self.anomaly_threshold = anomaly_threshold + self.base_output_dir = base_output_dir + self.episode_manager = EpisodeManager(episodes_info_json, series_name) + self.validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports + + def validate(self) -> int: + transcriptions_season_path = self.base_output_dir / 'transcriptions' / self.season + if not transcriptions_season_path.exists(): + console.print(f'[red]Season directory not found: {transcriptions_season_path}[/red]') + return 1 + console.print(f'[bold cyan]Validating season {self.season}...[/bold cyan]') + episodes_stats = self.__collect_episodes_stats(transcriptions_season_path) + if not episodes_stats: + console.print(f'[red]No episodes found in {transcriptions_season_path}[/red]') + return 1 + self.validation_reports_dir.mkdir(parents=True, exist_ok=True) + self.__generate_episode_reports(episodes_stats) + season_comparison = SeasonComparison(season=self.season, anomaly_threshold=self.anomaly_threshold) + season_comparison.compare_episodes(episodes_stats) + report_generator = ReportGenerator(season=self.season, anomaly_threshold=self.anomaly_threshold) + season_report_path = self.validation_reports_dir / 
f'{self.series_name}_{self.season}_season.json' + report_generator.generate_report(episodes_stats, season_comparison, season_report_path) + self.__print_summary(episodes_stats, season_comparison) + console.print(f'\n[green]Validation reports saved to: {self.validation_reports_dir}[/green]') + return 0 + + def __collect_episodes_stats(self, transcriptions_season_path: Path) -> Dict[str, EpisodeStats]: + episode_dirs = sorted([d for d in transcriptions_season_path.iterdir() if d.is_dir() and d.name.startswith('E')]) + episodes_stats = {} + for episode_dir in track(episode_dirs, description='Collecting episode stats'): + episode_num = int(episode_dir.name[1:]) + season_num = int(self.season[1:]) + episode_info = self.episode_manager.get_episode_by_season_and_relative(season_num, episode_num) + if not episode_info: + console.print(f'[yellow]Skipping {episode_dir.name}: could not parse episode info[/yellow]') + continue + episode_id = episode_info.episode_code() + stats = EpisodeStats(episode_info=episode_info, series_name=self.series_name) + stats.collect_stats() + episodes_stats[episode_id] = stats + return episodes_stats + + def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]): + for stats in episodes_stats.values(): + episode_report = { + 'validation_timestamp': datetime.now().isoformat(), + 'episode_id': stats.episode_info.episode_code(), + 'episode_title': stats.episode_info.title, + 'status': stats.status, + 'errors': stats.errors, + 'warnings': stats.warnings, + 'stats': stats.to_dict()['stats'], + } + path_manager = PathManager(self.series_name) + report_filename = path_manager.build_filename(stats.episode_info, extension='json') + report_path = self.validation_reports_dir / report_filename + FileOperations.atomic_write_json(report_path, episode_report) + + def __print_summary(self, episodes_stats: Dict[str, EpisodeStats], season_comparison: SeasonComparison): + console.print(f'\n[bold]Validation Summary for {self.season}[/bold]') + 
console.print(f'Total episodes: {len(episodes_stats)}') + pass_count = sum((1 for stats in episodes_stats.values() if stats.status == 'PASS')) + warning_count = sum((1 for stats in episodes_stats.values() if stats.status == 'WARNING')) + fail_count = sum((1 for stats in episodes_stats.values() if stats.status == 'FAIL')) + console.print(f' [green]PASS:[/green] {pass_count}') + console.print(f' [yellow]WARNING:[/yellow] {warning_count}') + console.print(f' [red]FAIL:[/red] {fail_count}') + if season_comparison.anomalies: + console.print(f'\n[bold yellow]Anomalies detected: {len(season_comparison.anomalies)}[/bold yellow]') + for anomaly in season_comparison.anomalies[:5]: + color = 'red' if anomaly.severity == 'ERROR' else 'yellow' + msg = ( + f'{anomaly.metric} = {anomaly.value} ' + f'(avg: {anomaly.avg}, deviation: {anomaly.deviation_percent:.1f}%)' + ) + console.print(f' [{color}]{anomaly.episode}[/{color}]: {msg}') + if len(season_comparison.anomalies) > 5: + console.print(f' ... and {len(season_comparison.anomalies) - 5} more') + for episode_id, stats in episodes_stats.items(): + if stats.errors: + console.print(f'\n[red]Errors in {episode_id}:[/red]') + for error in stats.errors[:3]: + console.print(f' - {error}') + if len(stats.errors) > 3: + console.print(f' ... and {len(stats.errors) - 3} more') + if stats.warnings: + console.print(f'\n[yellow]Warnings in {episode_id}:[/yellow]') + for warning in stats.warnings[:3]: + console.print(f' - {warning}') + if len(stats.warnings) > 3: + console.print(f' ... 
and {len(stats.warnings) - 3} more') diff --git a/preprocessor/transcription/generators/__init__.py b/preprocessor/modules/video/__init__.py similarity index 100% rename from preprocessor/transcription/generators/__init__.py rename to preprocessor/modules/video/__init__.py diff --git a/preprocessor/modules/video/frame_export.py b/preprocessor/modules/video/frame_export.py new file mode 100644 index 000000000..655f0ec3a --- /dev/null +++ b/preprocessor/modules/video/frame_export.py @@ -0,0 +1,221 @@ +from datetime import datetime +import json +from pathlib import Path +import shutil +import subprocess +from typing import ( + Any, + Dict, + List, +) + +from PIL import Image +import decord + +from preprocessor.config.step_configs import FrameExportConfig +from preprocessor.config.types import FrameRequest +from preprocessor.core.artifacts import ( + FrameCollection, + SceneCollection, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.io.files import atomic_write_json +from preprocessor.modules.video.strategies.strategy_factory import KeyframeStrategyFactory + + +class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExportConfig]): + + def __init__(self, config: FrameExportConfig): + super().__init__(config) + decord.bridge.set_bridge('native') + self.strategy = KeyframeStrategyFactory.create(self.config.keyframe_strategy, self.config.frames_per_scene) + + @property + def name(self) -> str: + return 'frame_export' + + def execute(self, input_data: SceneCollection, context: ExecutionContext) -> FrameCollection: + episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') + metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' + metadata_file = episode_dir / metadata_filename + if metadata_file.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, 
input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached)') + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=metadata['statistics']['total_frames'], + metadata_path=metadata_file, + ) + if episode_dir.exists(): + context.logger.info(f'Cleaning incomplete frames from previous run: {episode_dir}') + shutil.rmtree(episode_dir, ignore_errors=True) + episode_dir.mkdir(parents=True, exist_ok=True) + video_path = input_data.video_path + if not video_path.exists(): + raise FileNotFoundError(f'Video file not found for frame export: {video_path}') + data = {'scene_timestamps': {'scenes': input_data.scenes}} + frame_requests = self.strategy.extract_frame_requests(video_path, data) + if not frame_requests: + context.logger.warning(f'No frames to extract for {input_data.episode_id}') + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=0, + metadata_path=metadata_file, + ) + context.logger.info(f'Extracting {len(frame_requests)} keyframes from {video_path.name}') + context.mark_step_started(self.name, input_data.episode_id) + try: + self._extract_frames(video_path, frame_requests, episode_dir, input_data.episode_info, context) + self._write_metadata(frame_requests, input_data.episode_info, video_path, context, metadata_file) + except Exception as e: + context.logger.error(f'Failed to extract frames from {video_path}: {e}') + shutil.rmtree(episode_dir, ignore_errors=True) + raise + context.mark_step_completed(self.name, input_data.episode_id) + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=len(frame_requests), + metadata_path=metadata_file, + ) + + def _extract_frames( + self, + video_file: Path, + 
frame_requests: List[FrameRequest], + episode_dir: Path, + episode_info, + context: ExecutionContext, + ) -> None: + video_metadata = self._get_video_metadata(video_file) + dar = self._calculate_display_aspect_ratio(video_metadata) + vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) + for req in frame_requests: + frame_num = req['frame_number'] + self._extract_and_save_frame(vr, frame_num, episode_dir, episode_info, dar, context.series_name) + del vr + + def _extract_and_save_frame( + self, + vr, + frame_num: int, + episode_dir: Path, + episode_info, + dar: float, + series_name: str, + ) -> None: + frame_np = vr[frame_num].asnumpy() + frame_pil = Image.fromarray(frame_np) + resized = self._resize_frame(frame_pil, dar) + base_filename = f'{series_name}_{episode_info.episode_code()}' + filename = f'{base_filename}_frame_{frame_num:06d}.jpg' + resized.save(episode_dir / filename, quality=90) + + @staticmethod + def _get_video_metadata(video_path: Path) -> Dict[str, Any]: + cmd = [ + 'ffprobe', '-v', 'error', '-select_streams', 'v:0', + '-show_entries', 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', + '-of', 'json', str(video_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + probe_data: Dict[str, Any] = json.loads(result.stdout) + streams: List[Dict[str, Any]] = probe_data.get('streams', []) + if not streams: + raise ValueError(f'No video streams found in {video_path}') + return streams[0] + + @staticmethod + def _calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: + width = metadata.get('width', 0) + height = metadata.get('height', 0) + if width == 0 or height == 0: + raise ValueError('Invalid video dimensions') + sar_str = metadata.get('sample_aspect_ratio', '1:1') + if sar_str == 'N/A' or not sar_str: + sar_str = '1:1' + try: + sar_num, sar_denom = [int(x) for x in sar_str.split(':')] + sar = sar_num / sar_denom if sar_denom != 0 else 1.0 + except (ValueError, ZeroDivisionError): + sar = 
1.0 + return width / height * sar + + def _resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: + target_width = self.config.resolution.width + target_height = self.config.resolution.height + target_aspect = target_width / target_height + if abs(display_aspect_ratio - target_aspect) < 0.01: + return frame.resize((target_width, target_height), Image.Resampling.LANCZOS) + if display_aspect_ratio > target_aspect: + new_height = target_height + new_width = int(target_height * display_aspect_ratio) + resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) + x_crop = (new_width - target_width) // 2 + cropped = resized.crop((x_crop, 0, x_crop + target_width, target_height)) + return cropped + new_width = target_width + new_height = int(target_width / display_aspect_ratio) + resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) + result = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + y_offset = (target_height - new_height) // 2 + result.paste(resized, (0, y_offset)) + return result + + def _write_metadata( + self, + frame_requests: List[FrameRequest], + episode_info, + source_video: Path, + context: ExecutionContext, + metadata_file: Path, + ) -> None: + frame_types_count = {} + frames_with_paths = [] + base_filename = f'{context.series_name}_{episode_info.episode_code()}' + for frame in frame_requests: + frame_type = frame.get('type', 'unknown') + frame_types_count[frame_type] = frame_types_count.get(frame_type, 0) + 1 + frame_with_path = frame.copy() + frame_num = frame['frame_number'] + frame_with_path['frame_path'] = f'{base_filename}_frame_{frame_num:06d}.jpg' + frames_with_paths.append(frame_with_path) + scene_numbers = { + f.get('scene_number', -1) + for f in frame_requests + if f.get('scene_number', -1) != -1 + } + metadata = { + 'generated_at': datetime.now().isoformat(), + 'episode_info': { + 'season': episode_info.season, + 'episode_number': episode_info.relative_episode, + 
'absolute_episode': episode_info.absolute_episode, + }, + 'source_video': str(source_video), + 'processing_parameters': { + 'frame_width': self.config.resolution.width, + 'frame_height': self.config.resolution.height, + 'keyframe_strategy': self.config.keyframe_strategy.value, + 'frames_per_scene': self.config.frames_per_scene, + }, + 'statistics': { + 'total_frames': len(frame_requests), + 'frame_types': frame_types_count, + 'total_scenes': len(scene_numbers), + 'timestamp_range': { + 'start': min((f.get('timestamp', 0) for f in frame_requests), default=0), + 'end': max((f.get('timestamp', 0) for f in frame_requests), default=0), + }, + }, + 'frames': frames_with_paths, + } + atomic_write_json(metadata_file, metadata, indent=2) diff --git a/preprocessor/modules/video/scene_detection.py b/preprocessor/modules/video/scene_detection.py new file mode 100644 index 000000000..22b691021 --- /dev/null +++ b/preprocessor/modules/video/scene_detection.py @@ -0,0 +1,79 @@ +from preprocessor.config.step_configs import SceneDetectionConfig +from preprocessor.core.artifacts import ( + SceneCollection, + TranscodedVideo, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.lib.io.files import ( + atomic_write_json, + load_json, +) +from preprocessor.lib.media.scene_detection import TransNetWrapper + + +class SceneDetectorStep(PipelineStep[TranscodedVideo, SceneCollection, SceneDetectionConfig]): + + def __init__(self, config: SceneDetectionConfig): + super().__init__(config) + self.transnet = TransNetWrapper() + self._model_loaded = False + + @property + def name(self) -> str: + return 'scene_detection' + + def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> SceneCollection: + output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' + output_path = context.get_output_path(input_data.episode_info, 'scene_timestamps', output_filename) + if 
output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached)') + scenes_data = load_json(output_path) + return SceneCollection( + path=output_path, + video_path=input_data.path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + scenes=scenes_data.get('scenes', []), + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) + if not self._model_loaded: + context.logger.info('Loading TransNetV2 model...') + self.transnet.load_model() + self._model_loaded = True + context.logger.info(f'Detecting scenes in {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + scenes = self.transnet.detect_scenes( + input_data.path, + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) + video_info = self.transnet.get_video_info(input_data.path) + output_data = { + 'total_scenes': len(scenes), + 'video_info': video_info, + 'detection_settings': { + 'threshold': self.config.threshold, + 'min_scene_len': self.config.min_scene_len, + 'method': 'transnetv2', + }, + 'scenes': scenes, + } + atomic_write_json(output_path, output_data) + context.mark_step_completed(self.name, input_data.episode_id) + return SceneCollection( + path=output_path, + video_path=input_data.path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + scenes=scenes, + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) + + def cleanup(self) -> None: + if self._model_loaded: + self.transnet.cleanup() + self._model_loaded = False diff --git a/preprocessor/modules/video/strategies/__init__.py b/preprocessor/modules/video/strategies/__init__.py new file mode 100644 index 000000000..99c7a0e38 --- /dev/null +++ b/preprocessor/modules/video/strategies/__init__.py @@ -0,0 +1,4 @@ +from preprocessor.modules.video.strategies.base_strategy import 
BaseKeyframeStrategy +from preprocessor.modules.video.strategies.scene_changes_strategy import SceneChangesStrategy + +__all__ = ['BaseKeyframeStrategy', 'SceneChangesStrategy'] diff --git a/preprocessor/embeddings/strategies/base_strategy.py b/preprocessor/modules/video/strategies/base_strategy.py similarity index 59% rename from preprocessor/embeddings/strategies/base_strategy.py rename to preprocessor/modules/video/strategies/base_strategy.py index 12c271714..930186bf1 100644 --- a/preprocessor/embeddings/strategies/base_strategy.py +++ b/preprocessor/modules/video/strategies/base_strategy.py @@ -11,10 +11,7 @@ class BaseKeyframeStrategy(ABC): + @abstractmethod - def extract_frame_requests( - self, - video_path: Path, - data: Dict[str, Any], - ) -> List[Dict[str, Any]]: + def extract_frame_requests(self, video_path: Path, data: Dict[str, Any]) -> List[Dict[str, Any]]: pass diff --git a/preprocessor/embeddings/strategies/scene_changes_strategy.py b/preprocessor/modules/video/strategies/scene_changes_strategy.py similarity index 56% rename from preprocessor/embeddings/strategies/scene_changes_strategy.py rename to preprocessor/modules/video/strategies/scene_changes_strategy.py index 9244ad68e..ce248841f 100644 --- a/preprocessor/embeddings/strategies/scene_changes_strategy.py +++ b/preprocessor/modules/video/strategies/scene_changes_strategy.py @@ -5,63 +5,48 @@ List, ) -from preprocessor.core.enums import FrameType -from preprocessor.embeddings.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.utils.console import console +from preprocessor.config.enums import FrameType +from preprocessor.lib.ui.console import console +from preprocessor.modules.video.strategies.base_strategy import BaseKeyframeStrategy class SceneChangesStrategy(BaseKeyframeStrategy): + def __init__(self, frames_per_scene: int): self.frames_per_scene = frames_per_scene - def extract_frame_requests( - self, - video_path: Path, - data: Dict[str, Any], - ) -> List[Dict[str, 
Any]]: - scene_timestamps = data.get("scene_timestamps", {}) - scenes = scene_timestamps.get("scenes", []) - + def extract_frame_requests(self, video_path: Path, data: Dict[str, Any]) -> List[Dict[str, Any]]: + scene_timestamps = data.get('scene_timestamps', {}) + scenes = scene_timestamps.get('scenes', []) if not scenes: - console.print("[yellow]No scene timestamps found[/yellow]") + console.print('[yellow]No scene timestamps found[/yellow]') return [] - - video_info = scene_timestamps.get("video_info", {}) - fps = video_info.get("fps") + video_info = scene_timestamps.get('video_info', {}) + fps = video_info.get('fps') if fps is None: - raise ValueError("FPS not found in scene_timestamps video_info") + raise ValueError('FPS not found in scene_timestamps video_info') frame_requests = [] - for i, scene in enumerate(scenes): - start_frame = scene.get("start", {}).get("frame", 0) - frame_count = scene.get("frame_count", 1) - + start_frame = scene.get('start', {}).get('frame', 0) + frame_count = scene.get('frame_count', 1) if frame_count <= 1: frame_requests.append(self.__create_request(start_frame, fps, FrameType.SCENE_SINGLE, i)) continue - for frame_idx in range(self.frames_per_scene): position = frame_idx / (self.frames_per_scene - 1) if self.frames_per_scene > 1 else 0.0 frame_number = int(start_frame + position * (frame_count - 1)) - if frame_idx == 0: frame_type = FrameType.SCENE_START elif frame_idx == self.frames_per_scene - 1: frame_type = FrameType.SCENE_END else: frame_type = FrameType.scene_mid(frame_idx) - frame_requests.append(self.__create_request(frame_number, fps, frame_type, i)) - return frame_requests @staticmethod - def __create_request(frame: int, fps: float, type_name: str, scene_num: int = None) -> Dict[str, Any]: - req = { - "frame_number": int(frame), - "timestamp": float(frame / fps), - "type": type_name, - } + def __create_request(frame: int, fps: float, type_name: str, scene_num: int=None) -> Dict[str, Any]: + req = {'frame_number': 
class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]):
    """Pipeline step that transcodes a source video with the configured settings.

    FFmpeg writes to a temporary file next to the final output; the file is
    moved into place only after the transcode call returns, and removed if the
    call raises.
    """

    @property
    def name(self) -> str:
        """Step identifier used for progress tracking in the execution context."""
        return 'video_transcode'

    def execute(
        self, input_data: SourceVideo, context: ExecutionContext,
    ) -> TranscodedVideo:
        """Transcode one episode, or reuse an already completed output.

        Args:
            input_data: Source video artifact (path, episode id, episode info).
            context: Execution context providing output paths, logging and
                step-state tracking.

        Returns:
            A ``TranscodedVideo`` pointing at the output file.

        Raises:
            Exception: Whatever ``FFmpegWrapper.transcode`` or the final rename
                raises; the temporary file is deleted before re-raising.
        """
        output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4'
        output_path = context.get_season_output_path(input_data.episode_info, 'transcoded_videos', output_filename)
        if output_path.exists() and (not context.force_rerun):
            if context.is_step_completed(self.name, input_data.episode_id):
                context.logger.info(f'Skipping {input_data.episode_id} (cached)')
                return self._build_result(input_data, output_path)

        probe_data = FFmpegWrapper.probe_video(input_data.path)
        input_fps = FFmpegWrapper.get_framerate(probe_data)
        input_video_bitrate = FFmpegWrapper.get_video_bitrate(probe_data)
        input_audio_bitrate = FFmpegWrapper.get_audio_bitrate(probe_data)

        # Never raise the frame rate; only cap it at 30.
        target_fps = min(input_fps, 30.0)
        if target_fps < input_fps:
            msg = (
                f'Input FPS ({input_fps}) > 30. '
                f'Limiting to {target_fps} FPS for compatibility and smaller file size.'
            )
            context.logger.info(msg)

        video_bitrate, minrate, maxrate, bufsize = self._plan_video_rates(input_video_bitrate, context)
        audio_bitrate = self._plan_audio_bitrate(input_audio_bitrate, context)

        context.logger.info(f'Transcoding {input_data.episode_id}')
        temp_path = output_path.with_suffix('.mp4.tmp')
        # The temp path is handed to the context together with the start marker.
        context.mark_step_started(self.name, input_data.episode_id, [str(temp_path)])
        try:
            FFmpegWrapper.transcode(
                input_path=input_data.path,
                output_path=temp_path,
                codec=self.config.codec,
                preset=self.config.preset,
                resolution=f'{self.config.resolution.width}:{self.config.resolution.height}',
                video_bitrate=f'{video_bitrate}M',
                minrate=f'{minrate}M',
                maxrate=f'{maxrate}M',
                bufsize=f'{bufsize}M',
                audio_bitrate=f'{audio_bitrate}k',
                gop_size=int(target_fps * self.config.gop_size),
                # Only pass an explicit frame rate when the input was capped.
                target_fps=target_fps if target_fps < input_fps else None,
            )
            temp_path.replace(output_path)
        except Exception:
            if temp_path.exists():
                temp_path.unlink()
            raise
        context.mark_step_completed(self.name, input_data.episode_id)
        return self._build_result(input_data, output_path)

    def _build_result(self, input_data: SourceVideo, output_path) -> TranscodedVideo:
        """Create the output artifact describing ``output_path``."""
        resolution_str = f'{self.config.resolution.width}x{self.config.resolution.height}'
        return TranscodedVideo(
            path=output_path,
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            resolution=resolution_str,
            codec=self.config.codec,
        )

    def _plan_video_rates(self, input_video_bitrate, context: ExecutionContext) -> tuple:
        """Return ``(video_bitrate, minrate, maxrate, bufsize)`` for the encode.

        Starts from the configured values. When the probed input bitrate is
        known and below the configured target, the target is lowered to at most
        5% above the input and the other three rates are scaled by the same
        ratio (rounded to two decimals).
        """
        video_bitrate = self.config.video_bitrate_mbps
        minrate = self.config.minrate_mbps
        maxrate = self.config.maxrate_mbps
        bufsize = self.config.bufsize_mbps
        if input_video_bitrate and input_video_bitrate < video_bitrate:
            adjusted_bitrate = min(input_video_bitrate * 1.05, video_bitrate)
            ratio = adjusted_bitrate / video_bitrate
            video_bitrate = adjusted_bitrate
            minrate = round(minrate * ratio, 2)
            maxrate = round(maxrate * ratio, 2)
            bufsize = round(bufsize * ratio, 2)
            msg = (
                f'Input video bitrate ({input_video_bitrate} Mbps) < '
                f'target ({self.config.video_bitrate_mbps} Mbps). '
                f'Adjusted to {video_bitrate} Mbps to avoid quality loss.'
            )
            context.logger.info(msg)
        return video_bitrate, minrate, maxrate, bufsize

    def _plan_audio_bitrate(self, input_audio_bitrate, context: ExecutionContext):
        """Return the audio bitrate, lowered to at most 5% above a smaller input bitrate."""
        audio_bitrate = self.config.audio_bitrate_kbps
        if input_audio_bitrate and input_audio_bitrate < audio_bitrate:
            adjusted_audio_bitrate = min(int(input_audio_bitrate * 1.05), audio_bitrate)
            audio_bitrate = adjusted_audio_bitrate
            msg = (
                f'Input audio bitrate ({input_audio_bitrate} kbps) < '
                f'target ({self.config.audio_bitrate_kbps} kbps). '
                f'Adjusted to {audio_bitrate} kbps to avoid quality loss.'
            )
            context.logger.info(msg)
        return audio_bitrate
class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]):
    """Pipeline step that runs character detection over an episode's extracted frames.

    The face model and the character reference vectors are loaded on first use
    and kept on the instance until ``cleanup`` is called.
    """

    def __init__(self, config: CharacterDetectionConfig) -> None:
        super().__init__(config)
        # Both are populated lazily by _ensure_face_resources.
        self._face_app = None
        self._character_vectors: Dict[str, np.ndarray] = {}

    @property
    def name(self) -> str:
        """Step identifier used for progress tracking in the execution context."""
        return 'character_detection'

    def execute(
        self, input_data: FrameCollection, context: ExecutionContext,
    ) -> DetectionResults:
        """Detect characters in the episode's frames and write a JSON report.

        Args:
            input_data: Frame collection whose ``directory`` holds the frame images.
            context: Execution context providing output paths, logging and
                step-state tracking.

        Returns:
            ``DetectionResults`` describing the report path and the number of
            detection entries. When the frame directory has no matching files,
            the count is 0 and no report is written.
        """
        filename = f'{context.series_name}_{input_data.episode_info.episode_code()}'
        output_filename: str = f'{filename}_character_detections.json'
        output_path: Path = context.get_output_path(
            input_data.episode_info, 'character_detections', output_filename,
        )
        if output_path.exists() and (not context.force_rerun):
            if context.is_step_completed(self.name, input_data.episode_id):
                context.logger.info(f'Skipping {input_data.episode_id} (cached character detections)')
                det_data: Dict[str, Any] = load_json(output_path)
                return self._build_result(input_data, output_path, len(det_data.get('detections', [])))

        # Scan for frames before loading the model or marking the step started:
        # with nothing to process, the step would otherwise stay recorded as
        # started without ever being completed.
        frame_files: List[Path] = sorted([
            f for f in input_data.directory.glob('*.jpg')
            if f.is_file() and 'frame_' in f.name
        ])
        if not frame_files:
            context.logger.warning(f'No frame files found in {input_data.directory}')
            return self._build_result(input_data, output_path, 0)

        self._ensure_face_resources(context)
        context.logger.info(f'Detecting characters in {input_data.episode_id}')
        context.mark_step_started(self.name, input_data.episode_id)
        results: List[Dict[str, Any]] = process_frames_for_detection(
            frame_files, self._face_app, self._character_vectors, self.config.threshold,
        )
        output_data: Dict[str, Any] = {
            'episode_id': input_data.episode_id,
            'series_name': context.series_name,
            'detection_settings': self.config.dict(),
            'statistics': {
                'total_frames_processed': len(frame_files),
                'frames_with_detections': len(results),
                'character_counts': self._count_characters(results),
            },
            'detections': results,
        }
        atomic_write_json(output_path, output_data)
        context.mark_step_completed(self.name, input_data.episode_id)
        return self._build_result(input_data, output_path, len(results))

    def _ensure_face_resources(self, context: ExecutionContext) -> None:
        """Load the face model and character references if not loaded yet."""
        if self._face_app is not None:
            return
        context.logger.info('Initializing face detection model...')
        self._face_app = FaceDetector.init()
        # Prefer references under output_data; fall back to input_data.
        characters_dir: Path = Path('preprocessor/output_data') / context.series_name / 'characters'
        if not characters_dir.exists():
            characters_dir = Path('preprocessor/input_data') / context.series_name / 'characters'
        if characters_dir.exists():
            context.logger.info(f'Loading character references from {characters_dir}')
            self._character_vectors = FaceDetector.load_character_references(
                characters_dir, self._face_app,
            )
        else:
            context.logger.warning(f'Characters directory not found: {characters_dir}')

    @staticmethod
    def _build_result(
        input_data: FrameCollection, output_path: Path, detection_count: int,
    ) -> DetectionResults:
        """Create the ``DetectionResults`` artifact for this episode."""
        return DetectionResults(
            episode_id=input_data.episode_id,
            episode_info=input_data.episode_info,
            path=output_path,
            detection_type='character',
            detection_count=detection_count,
        )

    @staticmethod
    def _count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]:
        """Count faces per ``character_name`` across all results ('unknown' when absent)."""
        counts: Dict[str, int] = {}
        for res in results:
            for face in res.get('faces', []):
                name: str = face.get('character_name', 'unknown')
                counts[name] = counts.get(name, 0) + 1
        return counts

    def cleanup(self) -> None:
        """Drop the model and reference vectors so they are reloaded on next use."""
        self._face_app = None
        self._character_vectors = {}
def execute(  # pylint: disable=too-many-locals
    self, input_data: FrameCollection, context: ExecutionContext,
) -> EmbeddingCollection:
    """Generate image embeddings for every frame listed in the frame metadata.

    Args:
        input_data: Frame collection providing ``directory`` and ``metadata_path``.
        context: Execution context providing output paths, logging and
            step-state tracking.

    Returns:
        An ``EmbeddingCollection`` for the written JSON file. When the metadata
        lists no frames, the collection has a count of 0 and nothing is written.
    """
    filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}'
    output_filename: str = f'{filename_base}_embeddings_video.json'
    output_path: Path = context.get_output_path(input_data.episode_info, 'embeddings', output_filename)
    if output_path.exists() and (not context.force_rerun):
        if context.is_step_completed(self.name, input_data.episode_id):
            context.logger.info(f'Skipping {input_data.episode_id} (cached video embeddings)')
            emb_data: Dict[str, Any] = load_json(output_path)
            return self._create_embedding_collection(
                input_data,
                output_path,
                len(emb_data.get('video_embeddings', [])),
            )
    frame_metadata: Dict[str, Any] = load_json(input_data.metadata_path)
    frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', [])
    if not frame_requests:
        context.logger.warning(f'No frames for embedding in {input_data.episode_id}')
        return self._create_embedding_collection(input_data, output_path, 0)
    image_hashes: Dict[int, str] = self._load_image_hashes(input_data, context)
    if self._model is None:
        # Load the model once and reuse it for later episodes.
        self._model = EmbeddingModelWrapper(self.config.model_name, self.config.device)
        self._model.load_model()  # pylint: disable=no-member
    msg = (
        f'Generating video embeddings for {len(frame_requests)} frames '
        f'in {input_data.episode_id}'
    )
    context.logger.info(msg)
    context.mark_step_started(self.name, input_data.episode_id)
    results: List[Dict[str, Any]] = []
    # Number of embedded frames that actually received a perceptual hash.
    frames_with_hash: int = 0
    batch_size: int = self.config.batch_size
    for i in range(0, len(frame_requests), batch_size):
        batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size]
        image_paths: List[str] = [str(input_data.directory / f['frame_path']) for f in batch]
        batch_embeddings: List[np.ndarray] = self._model.encode_images(image_paths)  # pylint: disable=no-member
        for request, emb in zip(batch, batch_embeddings):
            res: Dict[str, Any] = {**request, 'embedding': emb.tolist()}
            frame_num: int = request.get('frame_number', -1)
            if frame_num in image_hashes:
                res['perceptual_hash'] = image_hashes[frame_num]
                frames_with_hash += 1
            results.append(res)
    statistics = {
        'total_embeddings': len(results),
        'embedding_dimension': len(results[0]['embedding']) if results else 0,
        # Count hashes attached to results, not every hash loaded from disk.
        'frames_with_hash': frames_with_hash,
    }
    output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata(
        episode_info=input_data.episode_info,
        processing_params=self.config.dict(),
        statistics=statistics,
        results_key='video_embeddings',
        results_data=results,
    )
    atomic_write_json(output_path, output_data)
    context.mark_step_completed(self.name, input_data.episode_id)
    return self._create_embedding_collection(input_data, output_path, len(results))
class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]):
    """Pipeline step producing the ``EmotionData`` artifact for an episode."""

    @property
    def name(self) -> str:
        """Step identifier used for progress tracking in the execution context."""
        return 'emotion_detection'

    def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData:
        """Return the emotion artifact for the episode, recording step state.

        NOTE(review): nothing runs between ``mark_step_started`` and
        ``mark_step_completed`` and no file is written here, so this looks
        like a placeholder - confirm.
        """
        episode_info = input_data.episode_info
        episode_id = input_data.episode_id
        target: Path = context.get_output_path(
            episode_info,
            'emotion_detections',
            f'{context.series_name}_{episode_info.episode_code()}_emotions.json',
        )
        # Reuse only when the file exists, no rerun is forced, and the step
        # was recorded as completed (checked in that order).
        reuse_cached = (
            target.exists()
            and not context.force_rerun
            and context.is_step_completed(self.name, episode_id)
        )
        if reuse_cached:
            context.logger.info(f'Skipping {episode_id} (cached emotion detection)')
        else:
            context.logger.info(f'Detecting emotions for {episode_id}')
            context.mark_step_started(self.name, episode_id)
            context.mark_step_completed(self.name, episode_id)
        return EmotionData(episode_id=episode_id, episode_info=episode_info, path=target)
class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]):
    """Pipeline step producing the ``ClusterData`` artifact for an episode."""

    @property
    def name(self) -> str:
        """Step identifier used for progress tracking in the execution context."""
        return 'face_clustering'

    def execute(self, input_data: FrameCollection, context: ExecutionContext) -> ClusterData:
        """Return the face-cluster artifact for the episode, recording step state.

        NOTE(review): the step is marked started and completed back to back
        without any work or file output in this block - presumably a
        placeholder; confirm.
        """
        info = input_data.episode_info
        ep_id = input_data.episode_id
        clusters_name: str = f'{context.series_name}_{info.episode_code()}_clusters.json'
        clusters_path: Path = context.get_output_path(info, 'face_clusters', clusters_name)

        artifact_fields = {'episode_id': ep_id, 'episode_info': info, 'path': clusters_path}

        # Cached result: file present, rerun not forced, step recorded as done.
        if clusters_path.exists() and not context.force_rerun and context.is_step_completed(self.name, ep_id):
            context.logger.info(f'Skipping {ep_id} (cached face clustering)')
            return ClusterData(**artifact_fields)

        context.logger.info(f'Clustering faces for {ep_id}')
        context.mark_step_started(self.name, ep_id)
        context.mark_step_completed(self.name, ep_id)
        return ClusterData(**artifact_fields)
class ImageHashStep(PipelineStep[FrameCollection, ImageHashCollection, ImageHashConfig]):
    """Pipeline step that computes a perceptual hash for each frame of an episode."""

    def __init__(self, config: ImageHashConfig) -> None:
        super().__init__(config)
        # Created on first use in execute().
        self._hasher: Optional[PerceptualHasher] = None

    @property
    def name(self) -> str:
        """Step identifier used for progress tracking in the execution context."""
        return 'image_hashing'

    def execute(  # pylint: disable=too-many-locals
        self, input_data: FrameCollection, context: ExecutionContext,
    ) -> ImageHashCollection:
        """Hash the frames listed in the frame metadata and write them to JSON.

        Returns an ``ImageHashCollection`` whose ``hash_count`` is the number of
        stored hashes: read from the existing file when the step is cached,
        0 when the metadata lists no frames.
        """
        episode_id = input_data.episode_id
        stem = f'{context.series_name}_{input_data.episode_info.episode_code()}'
        output_path: Path = context.get_output_path(
            input_data.episode_info, 'image_hashes', f'{stem}_image_hashes.json',
        )

        def as_collection(count: int) -> ImageHashCollection:
            # Single place where the returned artifact is assembled.
            return ImageHashCollection(
                episode_id=episode_id,
                episode_info=input_data.episode_info,
                path=output_path,
                hash_count=count,
            )

        already_done = (
            output_path.exists()
            and not context.force_rerun
            and context.is_step_completed(self.name, episode_id)
        )
        if already_done:
            context.logger.info(f'Skipping {episode_id} (cached)')
            stored: Dict[str, Any] = load_json(output_path)
            return as_collection(len(stored.get('hashes', [])))

        frame_metadata: Dict[str, Any] = load_json(input_data.metadata_path)
        frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', [])
        if not frame_requests:
            context.logger.warning(f'No frames to hash for {episode_id}')
            return as_collection(0)

        if self._hasher is None:
            context.logger.info(f'Loading image hasher on {self.config.device}...')
            self._hasher = PerceptualHasher()

        context.logger.info(f'Computing hashes for {len(frame_requests)} frames in {episode_id}')
        context.mark_step_started(self.name, episode_id)

        hash_results: List[Dict[str, Any]] = []
        step: int = self.config.batch_size
        # Memory is reclaimed after every fifth batch, starting with the first.
        cleanup_stride = step * 5
        for offset in range(0, len(frame_requests), step):
            chunk: List[Dict[str, Any]] = frame_requests[offset:offset + step]
            images = FrameLoader.load_from_requests(input_data.directory, chunk)
            digests: List[str] = self._hasher.compute_phash_batch(images)  # pylint: disable=no-member
            # Each output entry is the original request plus its hash.
            hash_results.extend(
                {**request, 'perceptual_hash': digest}
                for request, digest in zip(chunk, digests)
            )
            del images
            if offset % cleanup_stride == 0:
                self._cleanup_memory()

        hash_settings = {
            'device': self.config.device,
            'batch_size': self.config.batch_size,
        }
        output_data: Dict[str, Any] = {
            'episode_id': episode_id,
            'series_name': context.series_name,
            'generated_at': frame_metadata.get('generated_at'),
            'hash_settings': hash_settings,
            'hashes': hash_results,
        }
        atomic_write_json(output_path, output_data)
        context.mark_step_completed(self.name, episode_id)
        self._cleanup_memory()
        return as_collection(len(hash_results))

    def cleanup(self) -> None:
        """Release the hasher and reclaim memory."""
        self._hasher = None
        self._cleanup_memory()

    @staticmethod
    def _cleanup_memory() -> None:
        """Run the garbage collector and, when CUDA is available, empty its cache."""
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
class ObjectDetectionStep(PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig]):
    """Pipeline step producing the ``ObjectDetectionData`` artifact for an episode."""

    @property
    def name(self) -> str:
        """Step identifier used for progress tracking in the execution context."""
        return 'object_detection'

    def execute(self, input_data: FrameCollection, context: ExecutionContext) -> ObjectDetectionData:
        """Return the object-detection artifact for the episode, recording step state.

        NOTE(review): no detection is performed and no file is written between
        the started/completed markers in this block - presumably a stub; confirm.
        """
        episode_info = input_data.episode_info
        episode_id = input_data.episode_id
        objects_file: str = f'{context.series_name}_{episode_info.episode_code()}_objects.json'
        objects_path: Path = context.get_output_path(episode_info, 'object_detections', objects_file)

        # Do the (empty) work unless a completed, non-forced result already exists.
        needs_run = True
        if objects_path.exists() and not context.force_rerun:
            if context.is_step_completed(self.name, episode_id):
                needs_run = False

        if needs_run:
            context.logger.info(f'Detecting objects for {episode_id}')
            context.mark_step_started(self.name, episode_id)
            context.mark_step_completed(self.name, episode_id)
        else:
            context.logger.info(f'Skipping {episode_id} (cached object detection)')

        return ObjectDetectionData(
            episode_id=episode_id,
            episode_info=episode_info,
            path=objects_path,
        )
-@register_processor("generate_archives") -class ArchiveGenerator(BaseProcessor): - REQUIRES = ["elastic_documents"] - PRODUCES = ["archives"] - PRIORITY = 90 - DESCRIPTION = "Generate archive files" - - FOLDER_TO_FILE_SUFFIX = { - ELASTIC_SUBDIRS.text_segments: "text_segments", - ELASTIC_SUBDIRS.text_embeddings: "text_embeddings", - ELASTIC_SUBDIRS.video_frames: "video_frames", - ELASTIC_SUBDIRS.episode_names: "episode_name", - ELASTIC_SUBDIRS.text_statistics: "text_statistics", - ELASTIC_SUBDIRS.full_episode_embeddings: "full_episode_embedding", - ELASTIC_SUBDIRS.sound_events: "sound_events", - ELASTIC_SUBDIRS.sound_event_embeddings: "sound_event_embeddings", - } - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=11, - loglevel=logging.DEBUG, - ) - - self.elastic_documents_dir: Path = self._args["elastic_documents_dir"] - self.output_dir: Path = self._args.get("output_dir", get_base_output_dir(self.series_name) / "archives") - self.allow_partial: bool = self._args.get("allow_partial", False) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "elastic_documents_dir" not in args: - raise ValueError("elastic_documents_dir is required") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.archives - - def _get_processing_items(self) -> List[ProcessingItem]: - segments_dir = self.elastic_documents_dir / ELASTIC_SUBDIRS.text_segments - if not segments_dir.exists(): - console.print(f"[yellow]Text segments directory not found: {segments_dir}[/yellow]") - return [] - - all_segment_files = list(segments_dir.glob(f"**/*{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}")) - items = [] - - for segment_file in all_segment_files: - episode_info = self.episode_manager.parse_filename(segment_file) - if not 
episode_info: - self.logger.warning(f"Cannot parse episode info from {segment_file}") - continue - - base_name = segment_file.stem.replace(FILE_SUFFIXES["text_segments"], "") - items.append( - ProcessingItem( - episode_id=episode_info.episode_code(), - input_path=segment_file, - metadata={ - "base_name": base_name, - "episode_info": episode_info, - }, - ), - ) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - base_name = item.metadata["base_name"] - - archive_name = f"{base_name}.zip" - archive_path = ( - self.output_dir - / episode_info.season_code() - / episode_info.episode_num() - / archive_name - ) - - return [OutputSpec(path=archive_path, required=True)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - episode_info = item.metadata["episode_info"] - base_name = item.metadata["base_name"] - - console.print(f"[cyan]Archiving documents for: {item.episode_id}[/cyan]") - - episode_files = self.__collect_episode_files(episode_info, base_name) - - if not episode_files: - self.logger.warning(f"No files found for {item.episode_id}") - return - - expected_count = len(self.FOLDER_TO_FILE_SUFFIX) - found_count = len(episode_files) - - if found_count < expected_count and not self.allow_partial: - console.print( - f"[yellow]Skipping {item.episode_id}: incomplete files " - f"({found_count}/{expected_count}). 
Use --allow-partial to archive anyway.[/yellow]", - ) - return - - for output_spec in missing_outputs: - self.__create_archive(output_spec.path, episode_files) - - console.print(f"[green]Completed archive for: {item.episode_id}[/green]") - - def __collect_episode_files(self, episode_info, base_name: str) -> Dict[str, Path]: - collected_files = {} - - for folder_name, file_suffix in self.FOLDER_TO_FILE_SUFFIX.items(): - file_name = f"{base_name}_{file_suffix}.jsonl" - file_path = ( - self.elastic_documents_dir - / folder_name - / episode_info.season_code() - / episode_info.episode_num() - / file_name - ) - - if file_path.exists(): - collected_files[folder_name] = file_path - else: - self.logger.warning(f"File not found: {file_path}") - - return collected_files - - def __create_archive(self, archive_path: Path, files: Dict[str, Path]) -> None: - archive_path.parent.mkdir(parents=True, exist_ok=True) - - temp_path = archive_path.with_suffix(archive_path.suffix + ".tmp") - - try: - with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as zipf: - for _, file_path in files.items(): - zipf.write(file_path, arcname=file_path.name) - self.logger.debug(f"Added to archive: {file_path.name}") - - temp_path.replace(archive_path) - - archive_size_mb = archive_path.stat().st_size / (1024 * 1024) - console.print( - f"[green]Created archive: {archive_path.name} " - f"({len(files)} files, {archive_size_mb:.2f} MB)[/green]", - ) - - except Exception as e: - if temp_path.exists(): - temp_path.unlink() - raise RuntimeError(f"Failed to create archive {archive_path}: {e}") from e diff --git a/preprocessor/processors/character_detector.py b/preprocessor/processors/character_detector.py deleted file mode 100644 index 64de214be..000000000 --- a/preprocessor/processors/character_detector.py +++ /dev/null @@ -1,118 +0,0 @@ -from __future__ import annotations - -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from insightface.app import 
FaceAnalysis -import numpy as np - -from preprocessor.characters.face_detection import ( - init_face_detection, - load_character_references, -) -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.console import console -from preprocessor.utils.detection_io import ( - process_frames_for_detection, - save_character_detections, -) - -# pylint: disable=duplicate-code - - -@register_processor("detect_characters") -class CharacterDetector(BaseProcessor): - REQUIRES = ["frames"] - PRODUCES = ["character_detections"] - PRIORITY = 60 - DESCRIPTION = "Detect characters in frames" - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=9, - loglevel=logging.DEBUG, - ) - - self.frames_dir: Path = self._args["frames_dir"] - self.characters_dir: Path = self._args.get("characters_dir", settings.character.get_output_dir(self.series_name)) - self.threshold: float = settings.character.frame_detection_threshold - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.face_app: FaceAnalysis = None - self.character_vectors: Dict[str, np.ndarray] = {} - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "frames_dir" not in args: - raise ValueError("frames_dir is required") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.character_detections - - # pylint: disable=duplicate-code - def _get_processing_items(self) -> List[ProcessingItem]: - return self._get_episode_processing_items_from_metadata( - "**/*_frame_metadata.json", - self.frames_dir, - self.episode_manager, - ) - - def 
_get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - path_manager = PathManager(self.series_name) - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_output = self._build_output_path(episode_info, detections_filename) - return [OutputSpec(path=detections_output, required=True)] - # pylint: enable=duplicate-code - - def _load_resources(self) -> bool: - if not self.characters_dir.exists(): - console.print(f"[red]Characters directory not found: {self.characters_dir}[/red]") - return False - - self.face_app = init_face_detection() - self.character_vectors = load_character_references(self.characters_dir, self.face_app) - - if not self.character_vectors: - console.print("[yellow]No character references loaded[/yellow]") - return False - - return True - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - frames_dir = metadata_file.parent - - frame_files = sorted([ - f for f in frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - results = process_frames_for_detection( - frame_files, - self.face_app, - self.character_vectors, - self.threshold, - ) - save_character_detections(episode_info, results, self.path_manager) diff --git a/preprocessor/processors/elastic_document_generator.py b/preprocessor/processors/elastic_document_generator.py deleted file mode 100644 index 16c4b7abe..000000000 --- a/preprocessor/processors/elastic_document_generator.py +++ /dev/null @@ -1,929 +0,0 @@ -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from 
preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.core.processor_registry import register_processor -from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder -from preprocessor.episodes import EpisodeManager -from preprocessor.types import ( - CharacterDetectionInFrame, - EpisodeMetadata, - ObjectDetectionInFrame, - SceneTimestampsData, -) -from preprocessor.utils.console import console -from preprocessor.utils.constants import ( - CharacterDetectionKeys, - DetectionKeys, - ElasticDocKeys, - EmbeddingKeys, - EmotionKeys, - EpisodeMetadataKeys, - ObjectDetectionKeys, - SceneKeys, - SceneTimeKeys, -) - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -@register_processor("generate_elastic_docs") -class ElasticDocumentGenerator(BaseProcessor): - REQUIRES = ["transcriptions", "embeddings"] - PRODUCES = ["elastic_documents"] - PRIORITY = 80 - DESCRIPTION = "Generate Elasticsearch documents" - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=10, - loglevel=logging.DEBUG, - ) - - self.transcription_jsons: Path = self._args["transcription_jsons"] - self.embeddings_dir: Optional[Path] = self._args.get("embeddings_dir") - self.scene_timestamps_dir: Optional[Path] = self._args.get("scene_timestamps_dir") - self.character_detections_dir: Optional[Path] = self._args.get("character_detections_dir") - self.object_detections_dir: Optional[Path] = self._args.get("object_detections_dir") - self.output_dir: Path = self._args.get( - "output_dir", - get_base_output_dir(self.series_name) / "elastic_documents", - ) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "transcription_jsons" not in args: - raise ValueError("transcription_jsons is required") - - def 
get_output_subdir(self) -> str: - return settings.output_subdirs.elastic_documents - - def __build_elastic_path(self, episode_info, subdoc_type: str, filename: str) -> Path: - full_subdir = f"{self.get_output_subdir()}/{subdoc_type}" - return self._build_output_path(episode_info, filename, subdir=full_subdir) - - def _get_processing_items(self) -> List[ProcessingItem]: - all_transcription_files = list(self.transcription_jsons.glob("**/raw/*_segmented.json")) - items = [] - - for trans_file in all_transcription_files: - items.append(self._create_transcription_processing_item(trans_file)) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: # pylint: disable=too-many-locals,too-many-statements - base_name = item.metadata["base_name"] - episode_info = self.episode_manager.parse_filename(item.input_path) - - outputs = [] - - if episode_info: - segments_filename = f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - segments_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.text_segments, - segments_filename, - ) - outputs.append(OutputSpec(path=segments_file, required=True)) - - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - trans_dir = self.path_manager.base_output_dir / settings.output_subdirs.transcriptions / season_code / episode_code - sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events - sound_events_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="sound_events", - ) - sound_events_json = sound_events_dir / sound_events_filename - if sound_events_json.exists(): - sound_events_elastic = f"{base_name}_sound_events.jsonl" - sound_events_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.sound_events, - sound_events_elastic, - ) - outputs.append(OutputSpec(path=sound_events_file, required=False)) - else: - season_dir = 
item.input_path.parent.name - filename = f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - path = self.output_dir / ELASTIC_SUBDIRS.text_segments / season_dir / filename - outputs.append( - OutputSpec( - path=path, - required=True, - ), - ) - - if self.embeddings_dir and episode_info: - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - episode_emb_dir = self.path_manager.base_output_dir / settings.output_subdirs.embeddings / season_code / episode_code - text_emb_files = list(episode_emb_dir.glob("*_embeddings_text.json")) - text_emb_file = text_emb_files[0] if text_emb_files else None - video_emb_files = list(episode_emb_dir.glob("*_embeddings_video.json")) - video_emb_file = video_emb_files[0] if video_emb_files else None - - if text_emb_file and text_emb_file.exists(): - text_embeddings_filename = f"{base_name}_text_embeddings.jsonl" - text_embeddings_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.text_embeddings, - text_embeddings_filename, - ) - outputs.append(OutputSpec(path=text_embeddings_file, required=True)) - - if video_emb_file and video_emb_file.exists(): - video_frames_filename = f"{base_name}_video_frames.jsonl" - video_frames_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.video_frames, - video_frames_filename, - ) - outputs.append(OutputSpec(path=video_frames_file, required=True)) - - episode_name_emb = EpisodeNameEmbedder.load_episode_name_embedding( - episode_info.season, - episode_info.relative_episode, - self.series_name, - output_dir=self.embeddings_dir, - ) - if episode_name_emb: - episode_name_filename = f"{base_name}_episode_name.jsonl" - episode_name_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.episode_names, - episode_name_filename, - ) - outputs.append(OutputSpec(path=episode_name_file, required=True)) - - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - trans_dir = 
self.path_manager.base_output_dir / settings.output_subdirs.transcriptions / season_code / episode_code - clean_dir = trans_dir / settings.output_subdirs.transcription_subdirs.clean - text_stats_filename = f"{base_name}_text_stats.json" - text_stats_file = clean_dir / text_stats_filename - if text_stats_file.exists(): - text_stats_elastic_filename = f"{base_name}_text_statistics.jsonl" - text_stats_elastic_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.text_statistics, - text_stats_elastic_filename, - ) - outputs.append(OutputSpec(path=text_stats_elastic_file, required=True)) - - full_episode_emb_file = episode_emb_dir / f"{base_name}_embeddings_full_episode.json" - if full_episode_emb_file.exists(): - full_episode_elastic_filename = f"{base_name}_full_episode_embedding.jsonl" - full_episode_elastic_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.full_episode_embeddings, - full_episode_elastic_filename, - ) - outputs.append(OutputSpec(path=full_episode_elastic_file, required=True)) - - sound_event_emb_file = episode_emb_dir / f"{base_name}_embeddings_sound_events.json" - if sound_event_emb_file.exists(): - sound_event_elastic_filename = f"{base_name}_sound_event_embeddings.jsonl" - sound_event_elastic_file = self.__build_elastic_path( - episode_info, - ELASTIC_SUBDIRS.sound_event_embeddings, - sound_event_elastic_filename, - ) - outputs.append(OutputSpec(path=sound_event_elastic_file, required=False)) - - return outputs - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals,too-many-statements - trans_file = item.input_path - base_name = item.metadata["base_name"] - season_dir = trans_file.parent.name - - console.print(f"[cyan]Processing: {trans_file.name}[/cyan]") - - episode_dir = trans_file.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name_for_clean = trans_file.stem - suffixes = 
(FILE_SUFFIXES["segmented"], FILE_SUFFIXES["sound_events"], FILE_SUFFIXES["clean"], FILE_SUFFIXES["clean_alt"]) - while True: - removed = False - for suffix in suffixes: - if base_name_for_clean.endswith(suffix): - base_name_for_clean = base_name_for_clean[:-len(suffix)] - removed = True - break - if not removed: - break - - clean_transcription_file = clean_dir / f"{base_name_for_clean}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" - - if not clean_transcription_file.exists(): - self.logger.warning(f"Clean transcription not found: {clean_transcription_file}, skipping") - return - trans_file_for_segments = clean_transcription_file - - with open(trans_file_for_segments, "r", encoding="utf-8") as f: - transcription_data = json.load(f) - - episode_info_dict = transcription_data.get(EpisodeMetadataKeys.EPISODE_INFO, {}) - season = episode_info_dict.get(EpisodeMetadataKeys.SEASON) - episode_number = episode_info_dict.get(EpisodeMetadataKeys.EPISODE_NUMBER) - - if season is None or episode_number is None: - console.print(f"[red]Missing episode info in {trans_file.name}[/red]") - return - - episode_info = self.episode_manager.get_episode_by_season_and_relative(season, episode_number) - if not episode_info: - console.print(f"[red]Cannot find episode info for S{season:02d}E{episode_number:02d}[/red]") - return - - episode_metadata = self.__build_episode_metadata(episode_info) - episode_id = episode_info.episode_code() - filename = f"{self.series_name.lower()}_{episode_info.episode_code()}{FILE_EXTENSIONS['mp4']}" - video_path = str(Path("bot") / f"{self.series_name.upper()}-WIDEO" / episode_info.season_code() / filename) - - scene_timestamps = self.__load_scene_timestamps(episode_info) - character_detections = self.__load_character_detections(episode_info) - object_detections = self.__load_object_detections(episode_info) - - if any(f"{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" in str(o.path) for o in missing_outputs): - self.__generate_segments( - 
transcription_data, - episode_id, - episode_metadata, - video_path, - scene_timestamps, - season_dir, - base_name, - ) - - trans_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) - sound_events_dir = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events - sound_events_json = sound_events_dir / f"{base_name}_sound_events.json" - if sound_events_json.exists() and any("_sound_events.jsonl" in str(o.path) for o in missing_outputs): - with open(sound_events_json, "r", encoding="utf-8") as f: - sound_events_data = json.load(f) - - self.__generate_sound_events( - sound_events_data, - episode_id, - episode_metadata, - video_path, - scene_timestamps, - episode_info, - base_name, - ) - - if self.embeddings_dir: - episode_emb_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.embeddings) - text_emb_files = list(episode_emb_dir.glob("*_embeddings_text.json")) - text_emb_file = text_emb_files[0] if text_emb_files else None - - if text_emb_file and text_emb_file.exists() and any("_text_embeddings.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_text_embeddings( - text_emb_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - video_emb_files = list(episode_emb_dir.glob("*_embeddings_video.json")) - video_emb_file = video_emb_files[0] if video_emb_files else None - - if video_emb_file and video_emb_file.exists() and any("_video_frames.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_video_frames( - video_emb_file, - episode_id, - episode_metadata, - video_path, - scene_timestamps, - character_detections, - object_detections, - episode_info, - base_name, - ) - - episode_name_emb = EpisodeNameEmbedder.load_episode_name_embedding( - season, - episode_number, - self.series_name, - output_dir=self.embeddings_dir, - ) - if episode_name_emb and any("_episode_name.jsonl" in str(o.path) for o in missing_outputs): - 
self.__generate_episode_name_document( - episode_name_emb, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - trans_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.transcriptions) - clean_dir = trans_dir / settings.output_subdirs.transcription_subdirs.clean - text_stats_file = clean_dir / f"{base_name}_text_stats.json" - if text_stats_file.exists() and any("_text_statistics.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_text_statistics_document( - text_stats_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - if self.embeddings_dir: - episode_emb_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.embeddings) - full_episode_emb_file = episode_emb_dir / f"{base_name}_embeddings_full_episode.json" - - if full_episode_emb_file.exists() and any("_full_episode_embedding.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_full_episode_embedding_document( - full_episode_emb_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - sound_event_emb_file = episode_emb_dir / f"{base_name}_embeddings_sound_events.json" - - if sound_event_emb_file.exists() and any("_sound_event_embeddings.jsonl" in str(o.path) for o in missing_outputs): - self.__generate_sound_event_embeddings_document( - sound_event_emb_file, - episode_id, - episode_metadata, - video_path, - episode_info, - base_name, - ) - - console.print(f"[green]Completed: {trans_file.name}[/green]") - - def __build_episode_metadata(self, episode_info) -> EpisodeMetadata: - metadata = self.episode_manager.get_metadata(episode_info) - return { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - "title": metadata.get("title"), - "premiere_date": metadata.get("premiere_date"), - "series_name": self.series_name, - "viewership": metadata.get("viewership"), - } - - def __load_scene_timestamps(self, 
episode_info) -> Optional[SceneTimestampsData]: - return EpisodeManager.load_scene_timestamps(episode_info, self.scene_timestamps_dir, self.logger) - - def __load_character_detections(self, episode_info) -> Dict[int, List[CharacterDetectionInFrame]]: - if not self.character_detections_dir: - return {} - - detection_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.character_detections) - detection_files = list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - return {} - - try: - with open(detection_file, "r", encoding="utf-8") as f: - data = json.load(f) - - detections_dict = {} - for detection in data.get(DetectionKeys.DETECTIONS, []): - frame_number = detection.get(DetectionKeys.FRAME_NUMBER) - if frame_number is not None: - detections_dict[frame_number] = detection.get(DetectionKeys.CHARACTERS, []) - elif DetectionKeys.FRAME in detection: - frame_file = detection[DetectionKeys.FRAME] - detections_dict[frame_file] = detection.get(DetectionKeys.CHARACTERS, []) - - return detections_dict - except Exception as e: - self.logger.error(f"Error loading character detections: {e}") - return {} - - def __load_object_detections(self, episode_info) -> Dict[str, List[ObjectDetectionInFrame]]: - if not self.object_detections_dir: - return {} - - detection_dir = self.path_manager.get_episode_dir(episode_info, settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - return {} - - try: - with open(detection_file, "r", encoding="utf-8") as f: - data = json.load(f) - - detections_dict = {} - for frame_data in data.get(DetectionKeys.DETECTIONS, []): - frame_name = frame_data[DetectionKeys.FRAME_NAME] - detections_dict[frame_name] = 
frame_data.get(DetectionKeys.DETECTIONS, []) - - return detections_dict - except Exception as e: - self.logger.error(f"Error loading object detections: {e}") - return {} - - @staticmethod - def __get_characters_for_frame( - frame_identifier, - character_detections: Dict[int, List[CharacterDetectionInFrame]], - ) -> List[CharacterDetectionInFrame]: - characters = character_detections.get(frame_identifier, []) - - character_list = [] - for char in characters: - char_data = { - CharacterDetectionKeys.NAME: char[CharacterDetectionKeys.NAME], - CharacterDetectionKeys.CONFIDENCE: char.get(CharacterDetectionKeys.CONFIDENCE), - } - - if CharacterDetectionKeys.EMOTION in char: - char_data[CharacterDetectionKeys.EMOTION] = { - EmotionKeys.LABEL: char[CharacterDetectionKeys.EMOTION][EmotionKeys.LABEL], - EmotionKeys.CONFIDENCE: char[CharacterDetectionKeys.EMOTION][EmotionKeys.CONFIDENCE], - } - - character_list.append(char_data) - - return character_list - - @staticmethod - def __get_objects_for_frame(frame_name: str, object_detections: Dict[str, List[ObjectDetectionInFrame]]) -> List[Dict[str, Any]]: - detections = object_detections.get(frame_name, []) - objects_summary = {} - for det in detections: - class_name = det[ObjectDetectionKeys.CLASS_NAME] - if class_name in objects_summary: - objects_summary[class_name] += 1 - else: - objects_summary[class_name] = 1 - - return [{"class": cls, "count": cnt} for cls, cnt in objects_summary.items()] - - @staticmethod - def __find_scene_for_timestamp(timestamp: float, scene_timestamps: Optional[SceneTimestampsData]) -> Optional[Dict[str, Any]]: - if not scene_timestamps or SceneKeys.SCENES not in scene_timestamps: - return None - - scenes = scene_timestamps[SceneKeys.SCENES] - for scene in scenes: - start_time = scene[SceneKeys.START][SceneTimeKeys.SECONDS] - end_time = scene[SceneKeys.END][SceneTimeKeys.SECONDS] - - if start_time is None or end_time is None: - continue - - if start_time <= timestamp < end_time: - return { - 
SceneKeys.SCENE_NUMBER: scene[SceneKeys.SCENE_NUMBER], - SceneKeys.SCENE_START_TIME: start_time, - SceneKeys.SCENE_END_TIME: end_time, - SceneKeys.SCENE_START_FRAME: scene[SceneKeys.START][SceneTimeKeys.FRAME], - SceneKeys.SCENE_END_FRAME: scene[SceneKeys.END][SceneTimeKeys.FRAME], - } - - return None - - def __generate_segments( # pylint: disable=too-many-locals - self, - transcription_data: Dict[str, Any], - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - scene_timestamps: Optional[SceneTimestampsData], - season_dir: str, - base_name: str, - ) -> None: - segments = transcription_data.get("segments", []) - if not segments: - return - - season = episode_metadata.get("season") - episode = episode_metadata.get("episode_number") - episode_info = self.episode_manager.get_episode_by_season_and_relative(season, episode) - - if episode_info: - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_segments}", - f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}", - ) - else: - filename = f"{base_name}{FILE_SUFFIXES['text_segments']}{FILE_EXTENSIONS['jsonl']}" - output_file = self.output_dir / ELASTIC_SUBDIRS.text_segments / season_dir / filename - - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, segment in enumerate(segments): - text = segment.get("text", "").strip() - if not text: - continue - - words = segment.get("words", []) - if words: - start_time = words[0].get("start") or 0.0 - end_time = words[-1].get("end") or 0.0 - speaker = words[0].get("speaker_id", "unknown") - else: - start_time = segment.get("start", 0.0) - end_time = segment.get("end", 0.0) - speaker = segment.get("speaker", "unknown") - - scene_info = self.__find_scene_for_timestamp(start_time, scene_timestamps) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "segment_id": i, - 
"text": text, - "start_time": start_time, - "end_time": end_time, - "speaker": speaker, - "video_path": video_path, - } - - if scene_info: - doc[ElasticDocKeys.SCENE_INFO] = scene_info - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(segments)} segment documents → {output_file.name}[/green]") - - def __generate_sound_events( - self, - sound_events_data: Dict[str, Any], - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - scene_timestamps: Optional[SceneTimestampsData], - episode_info, - base_name: str, - ) -> None: - segments = sound_events_data.get("segments", []) - if not segments: - return - - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.sound_events}", - f"{base_name}_sound_events.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, segment in enumerate(segments): - if "text" not in segment: - continue - - words = segment.get("words", []) - if not words: - start_time = segment.get("start") or 0.0 - end_time = segment.get("end") or 0.0 - else: - start_time = words[0].get("start") or 0.0 - end_time = words[-1].get("end") or 0.0 - - scene_info = self.__find_scene_for_timestamp(start_time, scene_timestamps) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "segment_id": i, - "text": segment.get("text", ""), - "sound_type": segment.get("sound_type", "sound"), - "start_time": start_time, - "end_time": end_time, - "video_path": video_path, - } - - if scene_info: - doc[ElasticDocKeys.SCENE_INFO] = scene_info - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(segments)} sound event documents → {output_file.name}[/green]") - - def __generate_text_embeddings( - self, - text_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - 
episode_info, - base_name: str, - ) -> None: - with open(text_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - text_embeddings = data.get("text_embeddings", []) - if not text_embeddings: - return - - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_embeddings}", - f"{base_name}_text_embeddings.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, emb in enumerate(text_embeddings): - segment_range = emb.get("segment_range", []) - text = emb.get("text", "") - embedding = emb.get("embedding", []) - - if not embedding: - continue - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "embedding_id": i, - "segment_range": segment_range[0] if segment_range else 0, - "text": text, - "text_embedding": embedding, - "video_path": video_path, - } - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(text_embeddings)} text embedding documents → {output_file.name}[/green]") - - def __generate_video_frames( # pylint: disable=too-many-locals - self, - video_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - scene_timestamps: Optional[SceneTimestampsData], - character_detections: Dict[str, List[Dict[str, Any]]], - object_detections: Dict[str, List[Dict[str, Any]]], - episode_info, - base_name: str, - ) -> None: - with open(video_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - video_embeddings = data.get("video_embeddings", []) - if not video_embeddings: - return - - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.video_frames}", - f"{base_name}_video_frames.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for emb in 
video_embeddings: - frame_number = emb.get(EmbeddingKeys.FRAME_NUMBER) - timestamp = emb.get(EmbeddingKeys.TIMESTAMP) - embedding = emb.get(EmbeddingKeys.EMBEDDING) - - if embedding is None or timestamp is None: - continue - - scene_info = self.__find_scene_for_timestamp(timestamp, scene_timestamps) - - perceptual_hash = emb.get(EmbeddingKeys.PERCEPTUAL_HASH) - frame_path = emb.get(EmbeddingKeys.FRAME_PATH, f"frame_{frame_number:06d}.jpg" if frame_number is not None else "") - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "frame_number": frame_number, - "timestamp": timestamp, - "frame_type": emb.get("type", "unknown"), - "video_path": video_path, - "video_embedding": embedding, - } - - if frame_number is not None: - characters = self.__get_characters_for_frame(frame_number, character_detections) - if characters: - doc[ElasticDocKeys.CHARACTER_APPEARANCES] = characters - - if frame_path: - frame_name = Path(frame_path).name if isinstance(frame_path, str) else frame_path - objects = self.__get_objects_for_frame(frame_name, object_detections) - if objects: - doc[ElasticDocKeys.DETECTED_OBJECTS] = objects - - if perceptual_hash: - doc[ElasticDocKeys.PERCEPTUAL_HASH] = perceptual_hash - try: - doc[ElasticDocKeys.PERCEPTUAL_HASH_INT] = int(perceptual_hash, 16) - except (ValueError, TypeError): - pass - - if EmbeddingKeys.SCENE_NUMBER in emb: - doc[EmbeddingKeys.SCENE_NUMBER] = emb[EmbeddingKeys.SCENE_NUMBER] - - if scene_info: - doc[ElasticDocKeys.SCENE_INFO] = scene_info - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(video_embeddings)} video frame documents → {output_file.name}[/green]") - - def __generate_episode_name_document( - self, - episode_name_emb: Dict[str, Any], - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - output_file = self.path_manager.build_path( - episode_info, - 
f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.episode_names}", - f"{base_name}_episode_name.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - title_embedding = episode_name_emb.get(EmbeddingKeys.TITLE_EMBEDDING, []) - if not title_embedding: - return - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - EmbeddingKeys.TITLE: episode_name_emb.get(EmbeddingKeys.TITLE, ""), - EmbeddingKeys.TITLE_EMBEDDING: title_embedding, - "video_path": video_path, - } - - with open(output_file, "w", encoding="utf-8") as f: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated episode name document → {output_file.name}[/green]") - - def __generate_text_statistics_document( - self, - text_stats_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(text_stats_file, "r", encoding="utf-8") as f: - stats_data = json.load(f) - - basic_stats = stats_data.get("basic_statistics", {}) - advanced_stats = stats_data.get("advanced_statistics", {}) - - if not basic_stats: - return - - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.text_statistics}", - f"{base_name}_text_statistics.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "video_path": video_path, - "language": stats_data.get("metadata", {}).get("language", "pl"), - "analyzed_at": stats_data.get("metadata", {}).get("analyzed_at"), - "basic_statistics": basic_stats, - "advanced_statistics": advanced_stats, - "word_frequency": stats_data.get("word_frequency", [])[:20], - "bigrams": stats_data.get("bigrams", [])[:10], - "trigrams": stats_data.get("trigrams", [])[:10], - } - - with open(output_file, "w", encoding="utf-8") as f: - f.write(json.dumps(doc, ensure_ascii=False) + 
"\n") - - console.print(f"[green]Generated text statistics document → {output_file.name}[/green]") - - def __generate_full_episode_embedding_document( - self, - full_episode_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(full_episode_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - full_episode_embedding_data = data.get("full_episode_embedding", {}) - if not full_episode_embedding_data or "embedding" not in full_episode_embedding_data: - return - - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.full_episode_embeddings}", - f"{base_name}_full_episode_embedding.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "full_transcript": full_episode_embedding_data.get("text", ""), - "transcript_length": full_episode_embedding_data.get("transcript_length", 0), - "full_episode_embedding": full_episode_embedding_data.get("embedding", []), - "video_path": video_path, - } - - with open(output_file, "w", encoding="utf-8") as f: - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated full episode embedding document → {output_file.name}[/green]") - - def __generate_sound_event_embeddings_document( # pylint: disable=too-many-locals - self, - sound_event_emb_file: Path, - episode_id: str, - episode_metadata: EpisodeMetadata, - video_path: str, - episode_info, - base_name: str, - ) -> None: - with open(sound_event_emb_file, "r", encoding="utf-8") as f: - data = json.load(f) - - sound_event_embeddings = data.get("sound_event_embeddings", []) - if not sound_event_embeddings: - return - - output_file = self.path_manager.build_path( - episode_info, - f"{settings.output_subdirs.elastic_documents}/{ELASTIC_SUBDIRS.sound_event_embeddings}", - 
f"{base_name}_sound_event_embeddings.jsonl", - ) - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - for i, emb in enumerate(sound_event_embeddings): - segment_range = emb.get("segment_range", []) - text = emb.get("text", "") - embedding = emb.get("embedding", []) - sound_types = emb.get("sound_types", []) - start_time = emb.get("start_time", 0.0) - end_time = emb.get("end_time", 0.0) - - if not embedding: - continue - - if isinstance(segment_range, list) and len(segment_range) == 2: - segment_range = {"gte": segment_range[0], "lte": segment_range[1]} - - doc = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "embedding_id": i, - "segment_range": segment_range, - "text": text, - "sound_types": sound_types, - "start_time": start_time, - "end_time": end_time, - "sound_event_embedding": embedding, - "video_path": video_path, - } - - f.write(json.dumps(doc, ensure_ascii=False) + "\n") - - console.print(f"[green]Generated {len(sound_event_embeddings)} sound event embedding documents → {output_file.name}[/green]") diff --git a/preprocessor/processors/elasticsearch_indexer.py b/preprocessor/processors/elasticsearch_indexer.py deleted file mode 100644 index 1369a6425..000000000 --- a/preprocessor/processors/elasticsearch_indexer.py +++ /dev/null @@ -1,291 +0,0 @@ -import asyncio -import json -import logging -from pathlib import Path -from typing import ( - Any, - Awaitable, - Callable, - Dict, - List, -) - -from elasticsearch import exceptions as es_exceptions -from elasticsearch.helpers import ( - BulkIndexError, - async_bulk, -) - -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.processors.elasticsearch_manager import ElasticSearchManager -from 
preprocessor.utils.console import console - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -@register_processor("index_elasticsearch") -class ElasticSearchIndexer(BaseProcessor): - REQUIRES = ["elastic_documents"] - PRODUCES = ["indexed"] - PRIORITY = 95 - DESCRIPTION = "Index documents in Elasticsearch" - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=1, - loglevel=logging.DEBUG, - ) - - self.dry_run = self._args.get("dry_run", False) - self.name = self._args["name"] - self.elastic_documents_dir = self._args.get( - "elastic_documents_dir", - get_base_output_dir(self.series_name) / "elastic_documents", - ) - self.transcription_jsons = self._args.get("transcription_jsons") - self.append = self._args.get("append", False) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - self.client = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "name" not in args: - raise ValueError("index name is required") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.elastic_documents - - @staticmethod - def __sanitize_error_for_logging(error: Dict[str, Any]) -> Dict[str, Any]: - vector_keys = {"text_embedding", "video_embedding", "title_embedding", "embedding"} - - def _truncate_vectors(obj): - if isinstance(obj, dict): - return { - k: f"[vector dim={len(v)}]" if k in vector_keys and isinstance(v, list) else _truncate_vectors(v) - for k, v in obj.items() - } - if isinstance(obj, list) and len(obj) > 10: - return obj[:3] + ["..."] - return obj - - return _truncate_vectors(error) - - def __call__(self) -> None: - asyncio.run(self.__exec_async()) - - def _execute(self) -> None: - asyncio.run(self.__exec_async()) - - def __check_files_exist(self) -> bool: - if not self.elastic_documents_dir.exists(): - return False - - return any([ - 
any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.text_segments}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.text_embeddings}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.video_frames}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.episode_names}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.full_episode_embeddings}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.sound_events}/**/*.jsonl")), - any(self.elastic_documents_dir.glob(f"{ELASTIC_SUBDIRS.sound_event_embeddings}/**/*.jsonl")), - ]) - - async def __exec_async(self) -> None: - if not self.__check_files_exist(): - self.logger.info("No elastic documents found to index.") - return - - try: - self.client = await ElasticSearchManager.connect_to_elasticsearch( - settings.elasticsearch.host, - settings.elasticsearch.user, - settings.elasticsearch.password, - self.logger, - ) - except es_exceptions.ConnectionError: - console.print("[red]✗ Failed to connect to Elasticsearch[/red]") - console.print(f"[yellow]Make sure Elasticsearch is running at: {settings.elasticsearch.host}[/yellow]") - console.print("[yellow]Run: docker-compose -f docker-compose.test.yml up -d[/yellow]") - return - - try: - indices = { - ELASTIC_SUBDIRS.text_segments: f"{self.name}_segments", - ELASTIC_SUBDIRS.text_embeddings: f"{self.name}_text_embeddings", - ELASTIC_SUBDIRS.video_frames: f"{self.name}_video_frames", - ELASTIC_SUBDIRS.episode_names: f"{self.name}_episode_names", - ELASTIC_SUBDIRS.full_episode_embeddings: f"{self.name}_full_episode_embeddings", - ELASTIC_SUBDIRS.sound_events: f"{self.name}_sound_events", - ELASTIC_SUBDIRS.sound_event_embeddings: f"{self.name}_sound_event_embeddings", - } - - for doc_type, index_name in indices.items(): - console.print(f"[cyan]Processing {doc_type} → {index_name}[/cyan]") - - if not self.append: - await self.__delete_index(index_name) - await 
self.__create_index(index_name, doc_type) - elif not await self.client.indices.exists(index=index_name): - self.logger.info(f"Index '{index_name}' does not exist. Creating it.") - await self.__create_index(index_name, doc_type) - else: - self.logger.info(f"Append mode: not deleting nor recreating index '{index_name}'.") - - await self.__index_documents(doc_type, index_name) - - if not self.dry_run: - for doc_type, index_name in indices.items(): - if await self.client.indices.exists(index=index_name): - await self.__print_sample_document(index_name) - finally: - await self.client.close() - - async def __create_index(self, index_name: str, doc_type: str) -> None: - mappings = { - ELASTIC_SUBDIRS.text_segments: ElasticSearchManager.SEGMENTS_INDEX_MAPPING, - ELASTIC_SUBDIRS.text_embeddings: ElasticSearchManager.TEXT_EMBEDDINGS_INDEX_MAPPING, - ELASTIC_SUBDIRS.video_frames: ElasticSearchManager.VIDEO_EMBEDDINGS_INDEX_MAPPING, - ELASTIC_SUBDIRS.episode_names: ElasticSearchManager.EPISODE_NAMES_INDEX_MAPPING, - ELASTIC_SUBDIRS.full_episode_embeddings: ElasticSearchManager.FULL_EPISODE_EMBEDDINGS_INDEX_MAPPING, - ELASTIC_SUBDIRS.sound_events: ElasticSearchManager.SOUND_EVENTS_INDEX_MAPPING, - ELASTIC_SUBDIRS.sound_event_embeddings: ElasticSearchManager.SOUND_EVENT_EMBEDDINGS_INDEX_MAPPING, - } - - async def operation(): - if await self.client.indices.exists(index=index_name): - self.logger.info(f"Index '{index_name}' already exists.") - else: - await self.client.indices.create( - index=index_name, - body=mappings[doc_type], - ) - self.logger.info(f"Index '{index_name}' created.") - - await self.__do_crud(operation, index_name) - - async def __delete_index(self, index_name: str) -> None: - async def operation(): - if await self.client.indices.exists(index=index_name): - await self.client.indices.delete(index=index_name) - self.logger.info(f"Deleted index: {index_name}") - else: - self.logger.info(f"Index '{index_name}' does not exist. 
No action taken.") - - await self.__do_crud(operation, index_name) - - async def __do_crud(self, operation: Callable[[], Awaitable[None]], index_name: str) -> None: - try: - await operation() - except es_exceptions.RequestError as e: - self.logger.error(f"Failed operation on index '{index_name}': {e}") - raise - except es_exceptions.ConnectionError as e: - self.logger.error(f"Connection error: {e}") - raise - - async def __index_documents(self, doc_type: str, index_name: str) -> None: - jsonl_files = list(self.elastic_documents_dir.glob(f"{doc_type}/**/*.jsonl")) - - if not jsonl_files: - self.logger.info(f"No {doc_type} documents found. Skipping.") - return - - actions = self.__load_jsonl_files(jsonl_files, index_name) - - if not actions: - self.logger.info(f"No {doc_type} documents to index.") - return - - console.print(f"[cyan]Prepared {len(actions)} {doc_type} documents for indexing[/cyan]") - - if self.dry_run: - self.logger.info(f"Dry-run: would index {len(actions)} documents to '{index_name}'") - if actions: - sample = json.dumps(actions[0], indent=2, ensure_ascii=False)[:500] - self.logger.info(f"Sample document:\n{sample}...") - else: - try: - await async_bulk( - self.client, - actions, - chunk_size=50, - max_chunk_bytes=5 * 1024 * 1024, - ) - console.print(f"[green]✓ Indexed {len(actions)} {doc_type} documents → {index_name}[/green]") - except BulkIndexError as e: - self.logger.error(f"Bulk indexing failed: {len(e.errors)} errors.") - for error in e.errors[:3]: - sanitized = self.__sanitize_error_for_logging(error) - self.logger.error(f"Failed document: {json.dumps(sanitized, indent=2)}") - if len(e.errors) > 10: - self.logger.error(f"... 
and {len(e.errors) - 10} more errors") - - def __load_jsonl_files(self, jsonl_files: List[Path], index_name: str) -> List[Dict[str, Any]]: - actions = [] - - for jsonl_file in jsonl_files: - self.logger.info(f"Loading {jsonl_file.name}") - with open(jsonl_file, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - doc = json.loads(line) - actions.append({ - "_index": index_name, - "_source": doc, - }) - - return actions - - def _load_jsonl_documents(self, doc_dir: Path, index_name: str) -> List[Dict[str, Any]]: - actions = [] - - for jsonl_file in doc_dir.rglob("*.jsonl"): - self.logger.info(f"Loading {jsonl_file.name}") - with open(jsonl_file, "r", encoding="utf-8") as f: - for line in f: - if line.strip(): - doc = json.loads(line) - actions.append({ - "_index": index_name, - "_source": doc, - }) - - return actions - - async def __print_sample_document(self, index_name: str) -> None: - try: # pylint: disable=too-many-try-statements - response = await self.client.search(index=index_name, size=1) - if not response["hits"]["hits"]: - self.logger.info(f"No documents found in {index_name}.") - return - - document = response["hits"]["hits"][0]["_source"] - doc_id = response["hits"]["hits"][0]["_id"] - - console.print(f"\n[cyan]Sample document from {index_name}:[/cyan]") - console.print(f" Document ID: {doc_id}") - - if "episode_id" in document: - console.print(f" Episode: {document['episode_id']}") - if "video_path" in document: - console.print(f" Video: {document['video_path']}") - if "text" in document: - text_preview = document['text'][:100] - console.print(f" Text: {text_preview}...") - if "perceptual_hash" in document: - console.print(f" Hash: {document['perceptual_hash']}") - if "timestamp" in document: - console.print(f" Timestamp: {document['timestamp']}") - - except Exception as e: - self.logger.error(f"Failed to retrieve sample document: {e}") diff --git a/preprocessor/processors/elasticsearch_manager.py 
b/preprocessor/processors/elasticsearch_manager.py deleted file mode 100644 index 43d607d4c..000000000 --- a/preprocessor/processors/elasticsearch_manager.py +++ /dev/null @@ -1,379 +0,0 @@ -import json - -from elasticsearch import ( - AsyncElasticsearch, - exceptions as es_exceptions, -) -import urllib3 - -from preprocessor.config.config import settings - -urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - -# pylint: disable=duplicate-code -class ElasticSearchManager: - INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_info": { - "type": "object", - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "viewership": {"type": "keyword"}, - "description": {"type": "text"}, - "summary": {"type": "text"}, - "is_special_feature": {"type": "boolean"}, - "special_feature_type": {"type": "keyword"}, - }, - }, - "text": {"type": "text"}, - "start": {"type": "float"}, - "end": {"type": "float"}, - "video_path": {"type": "keyword"}, - "transcription": { - "type": "object", - "properties": { - "format": {"type": "keyword"}, - "source_file": {"type": "keyword"}, - "language_code": {"type": "keyword"}, - "language_probability": {"type": "float"}, - "segments": { - "type": "nested", - "properties": { - "id": {"type": "integer"}, - "start": {"type": "float"}, - "end": {"type": "float"}, - "text": {"type": "text"}, - "speaker": {"type": "keyword"}, - "words": {"type": "object", "enabled": False}, - }, - }, - }, - }, - "scene_timestamps": { - "type": "object", - "properties": { - "total_scenes": {"type": "integer"}, - "video_info": { - "type": "object", - "properties": { - "fps": {"type": "float"}, - "duration": {"type": "float"}, - "total_frames": {"type": "integer"}, - }, - }, - "detection_settings": {"type": "object", "enabled": False}, - 
"scenes": { - "type": "nested", - "properties": { - "scene_number": {"type": "integer"}, - "start": {"type": "object", "enabled": False}, - "end": {"type": "object", "enabled": False}, - "duration": {"type": "float"}, - "frame_count": {"type": "integer"}, - }, - }, - }, - }, - "text_embeddings": { - "type": "nested", - "properties": { - "segment_range": {"type": "integer"}, - "text": {"type": "text"}, - "embedding": {"type": "float", "index": False}, - }, - }, - "video_embeddings": { - "type": "nested", - "properties": { - "frame_number": {"type": "integer"}, - "timestamp": {"type": "float"}, - "type": {"type": "keyword"}, - "embedding": {"type": "float", "index": False}, - }, - }, - "id": {"type": "integer"}, - "seek": {"type": "integer"}, - "author": {"type": "keyword"}, - "comment": {"type": "text"}, - "tags": {"type": "keyword"}, - "location": {"type": "keyword"}, - "actors": {"type": "keyword"}, - }, - }, - } - - SEGMENTS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - "viewership": {"type": "keyword"}, - }, - }, - "segment_id": {"type": "integer"}, - "text": { - "type": "text", - "analyzer": "standard", - "fields": { - "keyword": {"type": "keyword"}, - }, - }, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "speaker": {"type": "keyword"}, - "video_path": {"type": "keyword"}, - "scene_info": { - "properties": { - "scene_number": {"type": "integer"}, - "scene_start_time": {"type": "float"}, - "scene_end_time": {"type": "float"}, - "scene_start_frame": {"type": "integer"}, - "scene_end_frame": {"type": "integer"}, - }, - }, - }, - }, - } 
- - TEXT_EMBEDDINGS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - }, - }, - "embedding_id": {"type": "integer"}, - "segment_range": {"type": "integer"}, - "text": {"type": "text"}, - "text_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - VIDEO_EMBEDDINGS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - }, - }, - "frame_number": {"type": "integer"}, - "timestamp": {"type": "float"}, - "frame_type": {"type": "keyword"}, - "scene_number": {"type": "integer"}, - "video_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "perceptual_hash": {"type": "keyword"}, - "perceptual_hash_int": {"type": "unsigned_long"}, - "video_path": {"type": "keyword"}, - "character_appearances": { - "type": "nested", - "properties": { - "name": {"type": "keyword"}, - "confidence": {"type": "float"}, - "emotion": { - "properties": { - "label": {"type": "keyword"}, - "confidence": {"type": "float"}, - }, - }, - }, - }, - "detected_objects": { - "type": "nested", - "properties": { - "class": {"type": "keyword"}, - "count": {"type": "integer"}, - 
}, - }, - "scene_info": { - "properties": { - "scene_start_time": {"type": "float"}, - "scene_end_time": {"type": "float"}, - "scene_start_frame": {"type": "integer"}, - "scene_end_frame": {"type": "integer"}, - }, - }, - }, - }, - } - - EPISODE_NAMES_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - "viewership": {"type": "keyword"}, - }, - }, - "title": { - "type": "text", - "analyzer": "standard", - "fields": { - "keyword": {"type": "keyword"}, - }, - }, - "title_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - FULL_EPISODE_EMBEDDINGS_INDEX_MAPPING: json = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text", "fields": {"keyword": {"type": "keyword"}}}, - "premiere_date": {"type": "date", "format": "dd.MM.yyyy||d.MM.yyyy||d.M.yyyy||yyyy-MM-dd||strict_date_optional_time||epoch_millis"}, - "series_name": {"type": "keyword"}, - "viewership": {"type": "keyword"}, - }, - }, - "full_transcript": {"type": "text"}, - "transcript_length": {"type": "integer"}, - "full_episode_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - SOUND_EVENTS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - 
"episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - }, - }, - "segment_id": {"type": "integer"}, - "text": {"type": "text", "analyzer": "standard"}, - "sound_type": {"type": "keyword"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "video_path": {"type": "keyword"}, - "scene_info": { - "properties": { - "scene_id": {"type": "integer"}, - "scene_start": {"type": "float"}, - "scene_end": {"type": "float"}, - }, - }, - }, - }, - } - - SOUND_EVENT_EMBEDDINGS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": { - "properties": { - "season": {"type": "integer"}, - "episode_number": {"type": "integer"}, - "title": {"type": "text"}, - }, - }, - "embedding_id": {"type": "integer"}, - "segment_range": {"type": "integer_range"}, - "text": {"type": "text"}, - "sound_types": {"type": "keyword"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "sound_event_embedding": { - "type": "dense_vector", - "dims": settings.embedding_model.embedding_dim, - "index": True, - "similarity": "cosine", - }, - "video_path": {"type": "keyword"}, - }, - }, - } - - @staticmethod - async def connect_to_elasticsearch( - es_host: str, - es_user: str, - es_pass: str, - logger, - ) -> AsyncElasticsearch: - es_config = { - "hosts": [es_host], - "verify_certs": False, - "request_timeout": 30, - "max_retries": 3, - "retry_on_timeout": True, - } - - if es_user and es_pass: - es_config["basic_auth"] = (es_user, es_pass) - - es = AsyncElasticsearch(**es_config) - try: - if not await es.ping(): - raise es_exceptions.ConnectionError("Failed to connect to Elasticsearch") - logger.info(f"Connected to Elasticsearch at {es_host}") - return es - except (es_exceptions.ConnectionError, Exception) as e: - error_msg = f"Cannot connect to Elasticsearch at {es_host}" - if "Connection refused" in str(e) or "Failed to establish" 
in str(e): - logger.error(f"{error_msg} - is Elasticsearch running?") - else: - logger.error(f"{error_msg}: {str(e)}") - raise es_exceptions.ConnectionError(error_msg) from e -# pylint: enable=duplicate-code diff --git a/preprocessor/processors/embedding_generator.py b/preprocessor/processors/embedding_generator.py deleted file mode 100644 index 389f5a985..000000000 --- a/preprocessor/processors/embedding_generator.py +++ /dev/null @@ -1,827 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, -) - -import numpy as np -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import FILE_SUFFIXES -from preprocessor.core.processor_registry import register_processor -from preprocessor.embeddings.episode_name_embedder import EpisodeNameEmbedder -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.batch_processing_utils import compute_embeddings_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.constants import EpisodeMetadataKeys -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode -from preprocessor.utils.metadata_utils import create_processing_metadata - -# pylint: disable=duplicate-code - - -@register_processor("generate_embeddings") -class EmbeddingGenerator(BaseProcessor): # pylint: disable=too-many-instance-attributes - REQUIRES = ["transcriptions", "frames"] - PRODUCES = ["embeddings"] - PRIORITY = 50 - DESCRIPTION = "Generate multimodal embeddings" - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - 
class_name=self.__class__.__name__, - error_exit_code=9, - loglevel=logging.DEBUG, - ) - - self.transcription_jsons: Path = self._args["transcription_jsons"] - self.frames_dir: Path = self._args.get( - "frames_dir", - settings.frame_export.get_output_dir(self.series_name), - ) - self.output_dir: Path = self._args.get( - "output_dir", - settings.embedding.get_output_dir(self.series_name), - ) - - self.model_name: str = self._args.get("model", settings.embedding_model.model_name) - self.model_revision: str = self._args.get("model_revision", settings.embedding_model.model_revision) - self.batch_size: int = self._args.get("batch_size", settings.embedding.batch_size) - self.device: str = "cuda" - - self.segments_per_embedding: int = self._args.get("segments_per_embedding", settings.text_chunking.segments_per_embedding) - self.text_sentences_per_chunk: int = self._args.get("text_sentences_per_chunk", settings.text_chunking.text_sentences_per_chunk) - self.text_chunk_overlap: int = self._args.get("text_chunk_overlap", settings.text_chunking.text_chunk_overlap) - self.generate_text: bool = self._args.get("generate_text", True) - self.generate_video: bool = self._args.get("generate_video", True) - self.generate_episode_names: bool = self._args.get("generate_episode_names", True) - self.generate_full_episode: bool = self._args.get("generate_full_episode", settings.embedding.generate_full_episode_embedding) - self.generate_sound_events: bool = self._args.get("generate_sound_events", True) - - self.image_hashes_dir: Path = Path(self._args.get("image_hashes_dir", settings.image_hash.get_output_dir(self.series_name))) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.model = None - self.processor = None - self.gpu_processor: Optional[GPUBatchProcessor] = None - self.episode_name_embedder: Optional[EpisodeNameEmbedder] = None - - def _validate_args(self, args: Dict[str, Any]) -> 
None: - if "transcription_jsons" not in args: - raise ValueError("transcription_jsons is required") - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. This application requires GPU.") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.embeddings - - def cleanup(self) -> None: - console.print("[cyan]Unloading embedding model...[/cyan]") - self.model = None - self.processor = None - self._cleanup_memory() - console.print("[green]✓ Model unloaded[/green]") - - def _get_processing_items(self) -> List[ProcessingItem]: - all_transcription_files = list(self.transcription_jsons.glob("**/*.json")) - items = [] - seen_episodes = set() - - for trans_file in all_transcription_files: - if "_simple.json" in trans_file.name or "_text_stats.json" in trans_file.name: - continue - - if trans_file.parent.name in {"clean", "sound_events"}: - continue - - if not trans_file.name.endswith("_segmented.json"): - segmented_version = trans_file.parent / f"{trans_file.stem}_segmented.json" - if segmented_version.exists(): - continue - - episode_info = self.episode_manager.parse_filename(trans_file) - if episode_info: - episode_key = (episode_info.season, episode_info.relative_episode) - if episode_key in seen_episodes: - continue - seen_episodes.add(episode_key) - - items.append(self._create_transcription_processing_item(trans_file)) - - return items - - def _should_skip_item(self, item: ProcessingItem): - trans_file = item.input_path - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_transcription_file = clean_dir / f"{base_name}_clean_transcription.json" - - if clean_transcription_file.exists(): - try: - with open(clean_transcription_file, "r", encoding="utf-8") as f: - 
data = json.load(f) - segments = data.get("segments", []) - if not segments: - episode_id = item.episode_id - self.logger.warning( - f"Empty clean transcription (no text segments) for {episode_id}, " - f"will skip text embeddings but generate other types (sound events, episode names, etc.)", - ) - except Exception as e: - self.logger.error(f"Failed to read {clean_transcription_file}: {e}") - - return super()._should_skip_item(item) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - outputs = [] - episode_info = self.episode_manager.parse_filename(item.input_path) - if not episode_info: - return outputs - - if self.generate_text: - text_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_embeddings_text", - ) - text_output = self._build_output_path(episode_info, text_filename) - outputs.append(OutputSpec(path=text_output, required=True)) - - if self.generate_episode_names: - episode_name_filename = f"{FILE_SUFFIXES['episode_name']}.json" - episode_name_output = self._build_output_path(episode_info, episode_name_filename) - outputs.append(OutputSpec(path=episode_name_output, required=True)) - - if self.generate_video: - video_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_embeddings_video", - ) - video_output = self._build_output_path(episode_info, video_filename) - outputs.append(OutputSpec(path=video_output, required=True)) - - if self.generate_full_episode: - full_episode_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_embeddings_full_episode", - ) - full_episode_output = self._build_output_path(episode_info, full_episode_filename) - outputs.append(OutputSpec(path=full_episode_output, required=True)) - - if self.generate_sound_events: - sound_events_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - 
suffix="_embeddings_sound_events", - ) - sound_events_output = self._build_output_path(episode_info, sound_events_filename) - outputs.append(OutputSpec(path=sound_events_output, required=True)) - - return outputs - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - temp_files = [] - expected_outputs = self._get_expected_outputs(item) - for output in expected_outputs: - temp_path = output.path.with_suffix('.json.tmp') - temp_files.append(str(temp_path)) - return temp_files - - def _get_processing_info(self) -> List[str]: - return [ - f"[cyan]Loading model: {self.model_name}[/cyan]", - f"[cyan]Device: {self.device}[/cyan]", - f"[cyan]Batch size: {self.batch_size}[/cyan]", - ] - - def _load_resources(self) -> bool: - self.__load_model() - self.gpu_processor = GPUBatchProcessor( - self.model, - self.batch_size, - self.logger, - self.device, - progress_sub_batch_size=settings.embedding.progress_sub_batch_size, - ) - self.episode_name_embedder = EpisodeNameEmbedder( - model=self.model, - episode_manager=self.episode_manager, - series_name=self.series_name, - logger=self.logger, - ) - return True - - def __load_model(self) -> None: - try: - self.model = Qwen3VLEmbedder( - model_name_or_path=self.model_name, - torch_dtype=torch.bfloat16, - ) - console.print("[green]Qwen3-VL-Embedding model loaded successfully (vLLM)[/green]") - except Exception as e: - self.logger.error(f"Failed to load model: {e}") - raise - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals,too-many-statements - trans_file = item.input_path - - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_transcription_file = clean_dir / 
f"{base_name}_clean_transcription.json" - - if not clean_transcription_file.exists(): - self.logger.warning(f"Clean transcription not found: {clean_transcription_file}, skipping text embeddings generation") - with open(trans_file, "r", encoding="utf-8") as f: - data = json.load(f) - data["segments"] = [] - else: - with open(clean_transcription_file, "r", encoding="utf-8") as f: - data = json.load(f) - - has_segments = bool(data.get("segments")) - segmented_file = trans_file.parent / f"{trans_file.stem}_segmented.json" - - if not has_segments and segmented_file.exists(): - return - - need_text = any("embeddings_text.json" in str(o.path) for o in missing_outputs) - need_video = any("embeddings_video.json" in str(o.path) for o in missing_outputs) - need_episode_name = any("episode_name_embedding.json" in str(o.path) for o in missing_outputs) - need_full_episode = any("embeddings_full_episode.json" in str(o.path) for o in missing_outputs) - need_sound_events = any("embeddings_sound_events.json" in str(o.path) for o in missing_outputs) - - text_embeddings = [] - if need_text: - text_embeddings = self.__generate_text_embeddings(data) - - sound_event_embeddings = [] - if need_sound_events: - sound_event_embeddings = self.__generate_sound_event_embeddings(trans_file) - - video_embeddings = [] - if need_video: - episode_info = data.get("episode_info", {}) - frame_metadata = self.__load_frame_metadata(episode_info) - if frame_metadata: - video_embeddings = self.__generate_video_embeddings(episode_info, frame_metadata) - - if need_episode_name and self.episode_name_embedder: - self.episode_name_embedder.generate_and_save_for_transcription(data) - - full_episode_embedding = None - if need_full_episode: - full_episode_embedding = self.__generate_full_episode_embedding(trans_file) - - episode_dir = self.__get_episode_output_dir(trans_file) - episode_info_dict = data.get("episode_info", {}) - season = episode_info_dict.get("season", 0) - episode_num = 
episode_info_dict.get("episode_number", 0) - - episode_info_temp = self.episode_manager.get_episode_by_season_and_relative(season, episode_num) - if episode_info_temp: - episode_code = episode_info_temp.episode_code() - else: - episode_code = f"S{season:02d}E{episode_num:02d}" - - text_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_text.json" - video_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_video.json" - full_episode_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_full_episode.json" - sound_events_output = episode_dir / f"{self.episode_manager.series_name}_{episode_code}_embeddings_sound_events.json" - self.__save_embeddings( - data, - text_embeddings, - video_embeddings, - full_episode_embedding, - sound_event_embeddings, - text_output, - video_output, - full_episode_output, - sound_events_output, - ) - self._cleanup_memory() - - def __generate_text_embeddings(self, data: Dict[str, Any]) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals - segments = data.get("segments", []) - if not segments: - return [] - - text_chunks = [] - chunk_metadata = [] - - if True: # Always use sentence-based chunking for text # pylint: disable=using-constant-test - full_text = " ".join([seg.get("text", "") for seg in segments]) - sentences = self.__split_into_sentences(full_text) - - sentences_per_chunk = self.text_sentences_per_chunk - overlap = self.text_chunk_overlap - step = sentences_per_chunk - overlap - - for i in range(0, len(sentences), step): - chunk_sentences = sentences[i:i + sentences_per_chunk] - if not chunk_sentences: - continue - - chunk_text = " ".join(chunk_sentences).strip() - if not chunk_text: - continue - - char_start = sum(len(s) + 1 for s in sentences[:i]) - char_end = char_start + len(chunk_text) - - start_seg_id = self.__find_segment_at_position(segments, char_start) - end_seg_id = self.__find_segment_at_position(segments, 
char_end) - - text_chunks.append(chunk_text) - chunk_metadata.append({ - "segment_range": [start_seg_id, end_seg_id], - "text": chunk_text, - }) - else: - for i in range(0, len(segments), self.segments_per_embedding): - chunk = segments[i: i + self.segments_per_embedding] - combined_text = " ".join([seg.get("text", "") for seg in chunk]) - - if combined_text.strip(): - text_chunks.append(combined_text) - chunk_metadata.append({ - "segment_range": [i, i + len(chunk) - 1], - "text": combined_text, - }) - - if not text_chunks: - return [] - - embeddings = [] - text_batch_size = settings.embedding.text_batch_size - - with self.progress.track_operation( - f"Text embeddings ({len(text_chunks)} chunks)", - (len(text_chunks) + text_batch_size - 1) // text_batch_size, - ) as tracker: - for batch_idx in range(0, len(text_chunks), text_batch_size): - batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] - batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] - - try: - batch_embeddings = self.__encode_text_batch(batch_texts) - for meta, embedding in zip(batch_meta, batch_embeddings): - embeddings.append({ - **meta, - "embedding": embedding.tolist(), - }) - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Failed text embedding batch {batch_idx}: {e}") - - tracker.update((batch_idx // text_batch_size) + 1, interval=5) - - return embeddings - - def __generate_sound_event_embeddings(self, trans_file: Path) -> List[Dict[str, Any]]: # pylint: disable=too-many-locals - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - sound_events_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - base_name = self.__remove_all_suffixes(trans_file.stem) - sound_events_file = sound_events_dir / f"{base_name}_sound_events.json" - - if not sound_events_file.exists(): - self.logger.warning(f"Sound 
events file not found: {sound_events_file}, skipping sound event embeddings generation") - return [] - - try: - with open(sound_events_file, "r", encoding="utf-8") as f: - sound_events_data = json.load(f) - except Exception as e: - self.logger.error(f"Failed to load sound events file {sound_events_file}: {e}") - return [] - - segments = sound_events_data.get("segments", []) - if not segments: - return [] - - text_chunks = [] - chunk_metadata = [] - - for i in range(0, len(segments), self.segments_per_embedding): - chunk = segments[i: i + self.segments_per_embedding] - combined_text = " ".join([seg.get("text", "") for seg in chunk]) - - if combined_text.strip(): - sound_types = set() - for seg in chunk: - sound_type = seg.get("sound_type", "sound") - sound_types.add(sound_type) - - start_time = chunk[0].get("start", 0.0) if chunk else 0.0 - end_time = chunk[-1].get("end", 0.0) if chunk else 0.0 - - text_chunks.append(combined_text) - chunk_metadata.append({ - "segment_range": [i, i + len(chunk) - 1], - "text": combined_text, - "sound_types": list(sound_types), - "start_time": start_time, - "end_time": end_time, - }) - - if not text_chunks: - return [] - - embeddings = [] - text_batch_size = settings.embedding.text_batch_size - - with self.progress.track_operation( - f"Sound event embeddings ({len(text_chunks)} chunks)", - (len(text_chunks) + text_batch_size - 1) // text_batch_size, - ) as tracker: - for batch_idx in range(0, len(text_chunks), text_batch_size): - batch_texts = text_chunks[batch_idx: batch_idx + text_batch_size] - batch_meta = chunk_metadata[batch_idx: batch_idx + text_batch_size] - - try: - batch_embeddings = self.__encode_text_batch(batch_texts) - for meta, embedding in zip(batch_meta, batch_embeddings): - embeddings.append({ - **meta, - "embedding": embedding.tolist(), - }) - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Failed sound event embedding batch {batch_idx}: {e}") - - tracker.update((batch_idx // text_batch_size) 
+ 1, interval=5) - - return embeddings - - @staticmethod - def __remove_all_suffixes(base_name: str) -> str: - suffixes = (FILE_SUFFIXES["segmented"], FILE_SUFFIXES["sound_events"], FILE_SUFFIXES["clean"], FILE_SUFFIXES["clean_alt"]) - while True: - removed = False - for suffix in suffixes: - if base_name.endswith(suffix): - base_name = base_name[:-len(suffix)] - removed = True - break - if not removed: - break - return base_name - - @staticmethod - def __split_into_sentences(text: str) -> List[str]: - normalized_text = re.sub(r'\.{2,}', '.', text) - normalized_text = re.sub(r'!{2,}', '!', normalized_text) - normalized_text = re.sub(r'\?{2,}', '?', normalized_text) - - sentences = re.split(r'([.!?]+(?:\s+|$))', normalized_text) - raw_sentences = [] - for i in range(0, len(sentences) - 1, 2): - sentence = sentences[i] + (sentences[i + 1] if i + 1 < len(sentences) else "") - sentence = sentence.strip() - if sentence: - raw_sentences.append(sentence) - if len(sentences) % 2 == 1 and sentences[-1].strip(): - raw_sentences.append(sentences[-1].strip()) - - result = [] - buffer = "" - min_sentence_length = 30 - - for sentence in raw_sentences: - buffer = (buffer + " " + sentence).strip() if buffer else sentence - - if len(buffer) >= min_sentence_length: - result.append(buffer) - buffer = "" - - if buffer: - if result: - result[-1] = result[-1] + " " + buffer - else: - result.append(buffer) - - return result - - @staticmethod - def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: - cumulative_length = 0 - for idx, seg in enumerate(segments): - seg_text = seg.get("text", "") - seg_length = len(seg_text) + 1 - if cumulative_length <= char_pos < cumulative_length + seg_length: - return idx - cumulative_length += seg_length - return len(segments) - 1 if segments else 0 - - def __encode_text_batch(self, texts: List[str]) -> List[np.ndarray]: - inputs = [{"text": text} for text in texts] - embeddings_tensor = self.model.process(inputs, 
normalize=True) - embeddings = [emb.cpu().numpy() for emb in embeddings_tensor] - del embeddings_tensor - return embeddings - - def __generate_full_episode_embedding(self, trans_file: Path) -> Optional[Dict[str, Any]]: # pylint: disable=too-many-locals,too-many-statements - parent_name = trans_file.parent.name - if parent_name in {"raw", "clean", "sound_events"}: - episode_dir = trans_file.parent.parent - else: - episode_dir = trans_file.parent - - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - base_name = self.__remove_all_suffixes(trans_file.stem) - clean_txt_file = clean_dir / f"{base_name}_clean_transcription.txt" - - if not clean_txt_file.exists(): - self.logger.warning(f"Clean transcript file not found: {clean_txt_file}") - return None - - try: # pylint: disable=too-many-try-statements - with open(clean_txt_file, "r", encoding="utf-8") as f: - full_text = f.read().strip() - - if not full_text: - self.logger.warning(f"Empty clean transcript file: {clean_txt_file}") - return None - - console.print(f"[cyan]Generating full episode embedding ({len(full_text)} chars)...[/cyan]") - - max_chars_per_chunk = 6000 - overlap_chars = 4500 - - if len(full_text) > max_chars_per_chunk: - console.print( - f"[yellow]Text too long ({len(full_text)} chars), " - f"using sliding window (chunk={max_chars_per_chunk}, overlap={overlap_chars})...[/yellow]", - ) - - chunks = [] - step_size = max_chars_per_chunk - overlap_chars - - for i in range(0, len(full_text), step_size): - chunk_end = min(i + max_chars_per_chunk, len(full_text)) - chunk = full_text[i:chunk_end] - - if len(chunk.strip()) < 100: - continue - - chunks.append(chunk) - - if chunk_end >= len(full_text): - break - - console.print(f"[cyan]Processing {len(chunks)} overlapping chunks...[/cyan]") - chunk_embeddings = [] - chunk_weights = [] - - for idx, chunk in enumerate(chunks): - inputs = [{"text": chunk}] - embeddings_tensor = self.model.process(inputs, normalize=True) - chunk_embedding 
= embeddings_tensor[0].cpu().numpy() - chunk_embeddings.append(chunk_embedding) - del embeddings_tensor - - weight = len(chunk) / max_chars_per_chunk - chunk_weights.append(weight) - - if (idx + 1) % 5 == 0 or idx == len(chunks) - 1: - console.print(f"[cyan]Processed chunk {idx + 1}/{len(chunks)}[/cyan]") - - chunk_weights_array = np.array(chunk_weights) - chunk_weights_normalized = chunk_weights_array / chunk_weights_array.sum() - - embedding = np.average(chunk_embeddings, axis=0, weights=chunk_weights_normalized) - embedding = embedding / np.linalg.norm(embedding) - - console.print(f"[green]✓ Weighted-averaged {len(chunks)} overlapping chunks[/green]") - else: - inputs = [{"text": full_text}] - embeddings_tensor = self.model.process(inputs, normalize=True) - embedding = embeddings_tensor[0].cpu().numpy() - del embeddings_tensor - - return { - "text": full_text, - "embedding": embedding.tolist(), - "transcript_length": len(full_text), - } - - except Exception as e: - self.logger.error(f"Failed to generate full episode embedding: {e}") - return None - - def __load_frame_metadata(self, episode_info_dict: Dict[str, Any]) -> Optional[Dict[str, Any]]: - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - if season is None or episode is None: - return None - - episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) - if not episode_info_obj: - return None - - frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) - metadata_file = frames_episode_dir / f"{self.episode_manager.series_name}_{episode_info_obj.episode_code()}_frame_metadata.json" - - if not metadata_file.exists(): - self.logger.warning(f"Frame metadata not found: {metadata_file}") - return None - - with open(metadata_file, "r", encoding="utf-8") as f: - return json.load(f) - - def __load_image_hashes(self, episode_info_dict: Dict[str, Any]) -> Dict[int, str]: - return 
load_image_hashes_for_episode(episode_info_dict, self.series_name, self.logger) - - def __generate_video_embeddings(self, episode_info_dict: Dict[str, Any], frame_metadata: Dict[str, Any]) -> List[Dict[str, Any]]: - frame_requests = frame_metadata.get("frames", []) - if not frame_requests: - return [] - - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - - episode_info_obj = self.episode_manager.get_episode_by_season_and_relative(season, episode) - if not episode_info_obj: - return [] - - frames_episode_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.frames) - episode_output_dir = self.path_manager.get_episode_dir(episode_info_obj, settings.output_subdirs.embeddings) - checkpoint_file = episode_output_dir / "embeddings_video_checkpoint.json" - - image_hashes = self.__load_image_hashes(episode_info_dict) - embeddings = compute_embeddings_in_batches( - frames_episode_dir, - frame_requests, - self.gpu_processor, - self.batch_size, - image_hashes, - checkpoint_file=checkpoint_file, - checkpoint_interval=20, - prefetch_count=settings.embedding.prefetch_chunks, - ) - self._cleanup_memory() - return embeddings - - def __get_episode_output_dir(self, transcription_file: Path) -> Path: - episode_info_from_file = self.episode_manager.parse_filename(transcription_file) - if episode_info_from_file: - return self.path_manager.get_episode_dir(episode_info_from_file, settings.output_subdirs.embeddings) - return self.path_manager.base_output_dir / settings.output_subdirs.embeddings - - def __save_embeddings( - self, - data, - text_embeddings, - video_embeddings, - full_episode_embedding, - sound_event_embeddings, - text_output, - video_output, - full_episode_output, - sound_events_output, - ): - episode_info = data.get(EpisodeMetadataKeys.EPISODE_INFO, {}) - text_output.parent.mkdir(parents=True, exist_ok=True) - - if text_embeddings: - text_data = create_processing_metadata( - episode_info=type( - 
'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "segments_per_embedding": self.segments_per_embedding, - "use_sentence_based_chunking": True, - "text_sentences_per_chunk": self.text_sentences_per_chunk, - "text_chunk_overlap": self.text_chunk_overlap, - "device": self.device, - }, - statistics={ - "total_embeddings": len(text_embeddings), - "embedding_dimension": len(text_embeddings[0]["embedding"]) if text_embeddings else 0, - }, - results_key="text_embeddings", - results_data=text_embeddings, - ) - atomic_write_json(text_output, text_data, indent=2, ensure_ascii=False) - - if video_embeddings: - video_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "batch_size": self.batch_size, - "device": self.device, - }, - statistics={ - "total_embeddings": len(video_embeddings), - "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, - "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), - }, - results_key="video_embeddings", - results_data=video_embeddings, - ) - atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False) - - if full_episode_embedding: - full_episode_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "device": self.device, - }, - 
statistics={ - "transcript_length": full_episode_embedding.get("transcript_length", 0), - "embedding_dimension": len(full_episode_embedding["embedding"]) if "embedding" in full_episode_embedding else 0, - }, - results_key="full_episode_embedding", - results_data=full_episode_embedding, - ) - atomic_write_json(full_episode_output, full_episode_data, indent=2, ensure_ascii=False) - console.print(f"[green]✓ Saved full episode embedding to: {full_episode_output}[/green]") - - if sound_event_embeddings: - sound_events_data = create_processing_metadata( - episode_info=type( - 'obj', (object,), { - 'season': episode_info.get(EpisodeMetadataKeys.SEASON), - 'relative_episode': episode_info.get(EpisodeMetadataKeys.EPISODE_NUMBER), - }, - )(), - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "segments_per_embedding": self.segments_per_embedding, - "use_sentence_based_chunking": True, - "text_sentences_per_chunk": self.text_sentences_per_chunk, - "text_chunk_overlap": self.text_chunk_overlap, - "device": self.device, - }, - statistics={ - "total_embeddings": len(sound_event_embeddings), - "embedding_dimension": len(sound_event_embeddings[0]["embedding"]) if sound_event_embeddings else 0, - }, - results_key="sound_event_embeddings", - results_data=sound_event_embeddings, - ) - atomic_write_json(sound_events_output, sound_events_data, indent=2, ensure_ascii=False) - console.print(f"[green]✓ Saved sound event embeddings to: {sound_events_output}[/green]") - - @staticmethod - def _cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/processors/frame_exporter.py b/preprocessor/processors/frame_exporter.py deleted file mode 100644 index 790eecf4b..000000000 --- a/preprocessor/processors/frame_exporter.py +++ /dev/null @@ -1,285 +0,0 @@ -from datetime import datetime -import json -import logging -from pathlib import Path -import shutil -import subprocess -from 
typing import ( - Any, - Dict, - List, - Optional, -) - -from PIL import Image -import decord - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.enums import KeyframeStrategy -from preprocessor.core.processor_registry import register_processor -from preprocessor.core.video_processor import VideoProcessor -from preprocessor.embeddings.strategies.strategy_factory import KeyframeStrategyFactory -from preprocessor.episodes import EpisodeManager -from preprocessor.types import FrameRequest -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json - - -@register_processor("export_frames") -class FrameExporter(VideoProcessor): - REQUIRES = ["videos", "scene_timestamps"] - PRODUCES = ["frames"] - PRIORITY = 30 - DESCRIPTION = "Export keyframes from videos" - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=10, - loglevel=logging.DEBUG, - ) - decord.bridge.set_bridge('native') - - self.output_frames: Path = Path( - self._args.get("output_frames", settings.frame_export.get_output_dir(self.series_name)), - ) - self.output_frames.mkdir(parents=True, exist_ok=True) - - self.scene_timestamps_dir: Path = Path( - self._args.get("scene_timestamps_dir", settings.scene_detection.get_output_dir(self.series_name)), - ) - - resolution = self._args.get("resolution", settings.frame_export.resolution) - self.resize_width: int = resolution.width - self.resize_height: int = resolution.height - - keyframe_strategy_str = self._args.get("keyframe_strategy", settings.keyframe_extraction.strategy) - self.keyframe_strategy = KeyframeStrategy(keyframe_strategy_str) - self.frames_per_scene: int = self._args.get("frames_per_scene", settings.keyframe_extraction.scene_changes.frames_per_scene) - - self.strategy = KeyframeStrategyFactory.create( - 
self.keyframe_strategy, - self.frames_per_scene, - ) - - def _validate_args(self, args: Dict[str, Any]) -> None: - self._validate_videos_required(args) - - if "scene_timestamps_dir" in args: - scene_path = Path(args["scene_timestamps_dir"]) - if scene_path and not scene_path.exists(): - console.print(f"[yellow]Warning: Scene timestamps directory does not exist: {scene_path}[/yellow]") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.frames - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - - metadata_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_frame_metadata", - ) - metadata_file = self._build_output_path(episode_info, metadata_filename) - return [OutputSpec(path=metadata_file, required=True)] - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return [] - temp_metadata = expected_outputs[0].path.with_suffix('.json.tmp') - return [str(temp_metadata)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - episode_info = item.metadata["episode_info"] - episode_dir = self.__get_episode_dir(episode_info) - - if episode_dir.exists(): - metadata_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_frame_metadata", - ) - metadata_file = episode_dir / metadata_filename - if not metadata_file.exists(): - console.print(f"[yellow]Cleaning incomplete frames from previous run: {episode_dir}[/yellow]") - shutil.rmtree(episode_dir, ignore_errors=True) - - episode_dir.mkdir(parents=True, exist_ok=True) - - data = self.__prepare_data(episode_info) - frame_requests = self.strategy.extract_frame_requests(item.input_path, data) - - if not frame_requests: - console.print(f"[yellow]No frames to extract for 
{item.input_path.name}[/yellow]") - return - - console.print(f"[cyan]Extracting {len(frame_requests)} keyframes from {item.input_path.name}[/cyan]") - - try: - self.__extract_frames(item.input_path, frame_requests, episode_dir, episode_info) - self.__write_metadata(episode_dir, frame_requests, episode_info, item.input_path) - console.print(f"[green]✓ Exported {len(frame_requests)} frames to {episode_dir}[/green]") - except Exception as e: - self.logger.error(f"Failed to extract frames from {item.input_path}: {e}") - console.print(f"[yellow]Cleaning incomplete frames due to error: {episode_dir}[/yellow]") - shutil.rmtree(episode_dir, ignore_errors=True) - raise - - def __get_episode_dir(self, episode_info) -> Path: - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - return self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code - - def __prepare_data(self, episode_info) -> Dict[str, Any]: - data = {} - scene_timestamps = self.__load_scene_timestamps(episode_info) - if scene_timestamps: - data["scene_timestamps"] = scene_timestamps - return data - - def __extract_frames(self, video_file: Path, frame_requests: List[FrameRequest], episode_dir: Path, episode_info) -> None: - metadata = self.__get_video_metadata(video_file) - self.current_video_dar = self.__calculate_display_aspect_ratio(metadata) - - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - frame_numbers = [req["frame_number"] for req in frame_requests] - - with self.progress.track_operation(f"Keyframes ({len(frame_numbers)} frames)", len(frame_numbers)) as tracker: - for idx, frame_num in enumerate(frame_numbers, 1): - self.__extract_and_save_frame(vr, frame_num, episode_dir, episode_info) - tracker.update(idx, interval=50) - - del vr - - def __extract_and_save_frame(self, vr, frame_num: int, episode_dir: Path, episode_info) -> None: - frame_np = vr[frame_num].asnumpy() - frame_pil = Image.fromarray(frame_np) - - resized = 
self.__resize_frame(frame_pil, self.current_video_dar) - base_filename = self.episode_manager.path_manager.build_base_filename(episode_info) - filename = f"{base_filename}_frame_{frame_num:06d}.jpg" - resized.save(episode_dir / filename, quality=90) - - @staticmethod - def __get_video_metadata(video_path: Path) -> Dict[str, Any]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=width,height,sample_aspect_ratio,display_aspect_ratio", - "-of", "json", - str(video_path), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get("streams", []) - if not streams: - raise ValueError(f"No video streams found in {video_path}") - return streams[0] - - @staticmethod - def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: - width = metadata.get("width", 0) - height = metadata.get("height", 0) - if width == 0 or height == 0: - raise ValueError("Invalid video dimensions") - - sar_str = metadata.get("sample_aspect_ratio", "1:1") - if sar_str == "N/A" or not sar_str: - sar_str = "1:1" - - try: - sar_num, sar_denom = [int(x) for x in sar_str.split(":")] - sar = sar_num / sar_denom if sar_denom != 0 else 1.0 - except (ValueError, ZeroDivisionError): - sar = 1.0 - - return (width / height) * sar - - def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: - target_aspect = self.resize_width / self.resize_height - - if abs(display_aspect_ratio - target_aspect) < 0.01: - return frame.resize((self.resize_width, self.resize_height), Image.Resampling.LANCZOS) - - if display_aspect_ratio > target_aspect: - new_height = self.resize_height - new_width = int(self.resize_height * display_aspect_ratio) - resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) - - x_crop = (new_width - self.resize_width) // 2 - cropped = resized.crop((x_crop, 0, x_crop + 
self.resize_width, self.resize_height)) - return cropped - - new_width = self.resize_width - new_height = int(self.resize_width / display_aspect_ratio) - resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) - - result = Image.new('RGB', (self.resize_width, self.resize_height), (0, 0, 0)) - y_offset = (self.resize_height - new_height) // 2 - result.paste(resized, (0, y_offset)) - return result - - @staticmethod - def __calculate_total_scenes(frame_requests: List[FrameRequest]) -> int: - scene_numbers = set(f.get("scene_number", -1) for f in frame_requests) - has_invalid = -1 in scene_numbers - return len(scene_numbers) - (1 if has_invalid else 0) - - def __write_metadata(self, episode_dir: Path, frame_requests: List[FrameRequest], episode_info, source_video: Path) -> None: - frame_types_count = {} - frames_with_paths = [] - - for frame in frame_requests: - frame_type = frame.get("type", "unknown") - frame_types_count[frame_type] = frame_types_count.get(frame_type, 0) + 1 - - frame_with_path = frame.copy() - frame_num = frame["frame_number"] - base_filename = self.episode_manager.path_manager.build_base_filename(episode_info) - frame_with_path["frame_path"] = f"{base_filename}_frame_{frame_num:06d}.jpg" - frames_with_paths.append(frame_with_path) - - metadata = { - "generated_at": datetime.now().isoformat(), - "episode_info": { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - "absolute_episode": episode_info.absolute_episode, - }, - "source_video": str(source_video), - "processing_parameters": { - "frame_width": self.resize_width, - "frame_height": self.resize_height, - "keyframe_strategy": self.keyframe_strategy.value, - "frames_per_scene": self.frames_per_scene, - }, - "statistics": { - "total_frames": len(frame_requests), - "frame_types": frame_types_count, - "total_scenes": self.__calculate_total_scenes(frame_requests), - "timestamp_range": { - "start": min((f.get("timestamp", 0) for f in frame_requests), 
default=0), - "end": max((f.get("timestamp", 0) for f in frame_requests), default=0), - }, - }, - "frames": frames_with_paths, - } - metadata_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_frame_metadata", - ) - metadata_file = episode_dir / metadata_filename - atomic_write_json(metadata_file, metadata, indent=2, ensure_ascii=False) - - def __load_scene_timestamps(self, episode_info) -> Optional[Dict[str, Any]]: - if not self.scene_timestamps_dir or not self.scene_timestamps_dir.exists(): - return None - return EpisodeManager.load_scene_timestamps(episode_info, self.scene_timestamps_dir, self.logger) diff --git a/preprocessor/processors/image_hash_processor.py b/preprocessor/processors/image_hash_processor.py deleted file mode 100644 index bb1da9909..000000000 --- a/preprocessor/processors/image_hash_processor.py +++ /dev/null @@ -1,137 +0,0 @@ -import gc -import json -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.hash_save_utils import save_image_hashes_to_json -from preprocessor.utils.image_hasher import PerceptualHasher - -# pylint: disable=duplicate-code - - -@register_processor("hash_images") -class ImageHashProcessor(BaseProcessor): - REQUIRES = ["frames"] - PRODUCES = ["image_hashes"] - PRIORITY = 55 - DESCRIPTION = "Generate perceptual hashes for frames" - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=11, - 
loglevel=logging.DEBUG, - ) - - self.frames_dir: Path = Path( - self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)), - ) - self.output_dir: Path = Path( - self._args.get("output_dir", settings.image_hash.get_output_dir(self.series_name)), - ) - self.batch_size: int = self._args.get("batch_size", settings.embedding.batch_size) - self.device: str = "cuda" - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.hasher: Optional[PerceptualHasher] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. This application requires GPU.") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.image_hashes - - def cleanup(self) -> None: - console.print("[cyan]Unloading image hasher...[/cyan]") - self.hasher = None - self.__cleanup_memory() - console.print("[green]✓ Hasher unloaded[/green]") - - # pylint: disable=duplicate-code - def _get_processing_items(self) -> List[ProcessingItem]: - return self._get_episode_processing_items_from_metadata( - "**/*_frame_metadata.json", - self.frames_dir, - self.episode_manager, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - hash_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = self._build_output_path(episode_info, hash_filename) - return [OutputSpec(path=hash_output, required=True)] - # pylint: enable=duplicate-code - - def _get_processing_info(self) -> List[str]: - return [ - f"[cyan]Device: {self.device}[/cyan]", - f"[cyan]Batch size: {self.batch_size}[/cyan]", - ] - - def _load_resources(self) -> bool: - self.hasher = PerceptualHasher(device=self.device, hash_size=8) - return True - - def _process_item(self, item: ProcessingItem, 
missing_outputs: List[OutputSpec]) -> None: - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - frame_requests = self.__load_frame_requests(metadata_file) - if frame_requests is None: - return - - frames_dir = metadata_file.parent - hash_results = compute_hashes_in_batches(frames_dir, frame_requests, self.hasher, self.batch_size) - - save_image_hashes_to_json( - episode_info=episode_info, - hash_results=hash_results, - series_name=self.series_name, - device=self.device, - batch_size=self.batch_size, - ) - self.__cleanup_memory() - - @staticmethod - def __load_frame_requests(metadata_file: Path) -> Optional[List[Dict[str, Any]]]: - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return None - - return frame_requests - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/processors/scene_detector.py b/preprocessor/processors/scene_detector.py deleted file mode 100644 index c2d0ab9eb..000000000 --- a/preprocessor/processors/scene_detector.py +++ /dev/null @@ -1,217 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import decord -import numpy as np -import torch -from transnetv2_pytorch import TransNetV2 - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.types import SceneDict -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json - - -@register_processor("detect_scenes") -class 
SceneDetector(BaseProcessor): - REQUIRES = ["videos"] - PRODUCES = ["scene_timestamps"] - PRIORITY = 25 - DESCRIPTION = "Detect scene changes using TransNetV2" - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=8, - loglevel=logging.DEBUG, - ) - - self.videos: Path = self._args["videos"] - self.output_dir: Path = self._args.get( - "output_dir", - settings.scene_detection.get_output_dir(self.series_name), - ) - self.threshold: float = self._args.get("threshold", settings.scene_detection.threshold) - self.min_scene_len: int = self._args.get("min_scene_len", settings.scene_detection.min_scene_len) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.model = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
TransNetV2 requires GPU.") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.scenes - - def cleanup(self) -> None: - console.print("[cyan]Unloading TransNetV2 model and clearing GPU memory...[/cyan]") - if hasattr(self, 'model') and self.model is not None: - del self.model - self.model = None - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print("[green]✓ TransNetV2 model unloaded, GPU memory cleared[/green]") - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._create_video_processing_items( - source_path=self.videos, - extensions=self.get_video_glob_patterns(), - episode_manager=self.episode_manager, - skip_unparseable=False, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata.get("episode_info") - - if episode_info: - output_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="scenes", - ) - output_path = self._build_output_path(episode_info, output_filename) - else: - output_filename = f"{item.input_path.stem}_scenes.json" - output_path = self.path_manager.base_output_dir / self.get_output_subdir() / output_filename - - return [OutputSpec(path=output_path, required=True)] - - def _get_processing_info(self) -> List[str]: - return ["[cyan]Scene detection using TransNetV2 on CUDA[/cyan]"] - - def _load_resources(self) -> bool: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
TransNetV2 requires GPU.") - - console.print("[cyan]Loading TransNetV2 model on CUDA...[/cyan]") - self.model = TransNetV2().cuda() - console.print("[green]✓ TransNetV2 ready on CUDA[/green]") - return True - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - video_file = item.input_path - output_file = missing_outputs[0].path - - console.print(f"[cyan]Processing: {video_file.name}[/cyan]") - - video_info = self.__get_video_info(video_file) - if not video_info: - self.logger.error(f"Failed to get video info for {video_file}") - return - - scene_list = self.__detect_scenes_transnetv2(video_file, video_info) - - if not scene_list: - console.print(f"[yellow]No scenes detected in {video_file.name}[/yellow]") - return - - result = { - "total_scenes": len(scene_list), - "video_info": video_info, - "detection_settings": { - "threshold": self.threshold, - "min_scene_len": self.min_scene_len, - "method": "transnetv2", - }, - "scenes": scene_list, - } - - output_file.parent.mkdir(parents=True, exist_ok=True) - - atomic_write_json(output_file, result, indent=2, ensure_ascii=False) - - console.print(f"[green]{video_file.name}: {len(scene_list)} scenes -> {output_file}[/green]") - - def __detect_scenes_transnetv2( - self, video_file: Path, video_info: Dict[str, Any], - ) -> List[SceneDict]: - try: # pylint: disable=too-many-try-statements - _, single_frame_predictions, _ = self.model.predict_video(str(video_file)) - - scene_changes = np.where(single_frame_predictions > self.threshold)[0] - - scenes = [] - fps = video_info["fps"] - prev_frame = 0 - - for frame_num in scene_changes: - if frame_num - prev_frame < self.min_scene_len: - continue - - scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps) - scenes.append(scene) - prev_frame = frame_num - - total_frames = video_info["total_frames"] - if total_frames - prev_frame > self.min_scene_len: - scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, 
total_frames, fps) - scenes.append(scene) - - return scenes - - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"TransNetV2 detection failed: {e}") - return [] - - def __get_video_info(self, video_file: Path) -> Optional[Dict[str, Any]]: - try: - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - fps = vr.get_avg_fps() - total_frames = len(vr) - duration = total_frames / fps if fps > 0 else 0 - - return { - "fps": fps, - "duration": duration, - "total_frames": total_frames, - } - except (RuntimeError, ValueError, OSError) as e: - self.logger.error(f"Error reading video info: {e}") - return None - - def __create_scene_dict(self, scene_number: int, start_frame: int, end_frame: int, fps: float) -> SceneDict: - return { - "scene_number": scene_number, - "start": { - "frame": int(start_frame), - "seconds": float(start_frame / fps), - "timecode": self.__frame_to_timecode(start_frame, fps), - }, - "end": { - "frame": int(end_frame), - "seconds": float(end_frame / fps), - "timecode": self.__frame_to_timecode(end_frame, fps), - }, - "duration": float((end_frame - start_frame) / fps), - "frame_count": int(end_frame - start_frame), - } - - @staticmethod - def __frame_to_timecode(frame: int, fps: float) -> str: - seconds = frame / fps - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - frames = int((seconds % 1) * fps) - return f"{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}" diff --git a/preprocessor/processors/text_analyzer.py b/preprocessor/processors/text_analyzer.py deleted file mode 100644 index e950db5b1..000000000 --- a/preprocessor/processors/text_analyzer.py +++ /dev/null @@ -1,145 +0,0 @@ -from datetime import datetime -import logging -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from 
preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.text_analysis.text_statistics import TextStatistics -from preprocessor.utils.file_utils import atomic_write_json - - -@register_processor("analyze_text") -class TextAnalyzer(BaseProcessor): - REQUIRES = ["transcriptions"] - PRODUCES = ["text_analysis"] - PRIORITY = 70 - DESCRIPTION = "Analyze transcription text statistics" - - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=40, - loglevel=logging.INFO, - ) - self.transcriptions_base = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions - self.language = args.get("language", "pl") - self.episode_manager = EpisodeManager( - args.get("episodes_info_json"), - args.get("series_name", "ranczo"), - ) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "series_name" not in args: - raise ValueError("series_name is required") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def _get_processing_items(self) -> List[ProcessingItem]: - items = [] - - if not self.transcriptions_base.exists(): - self.logger.error(f"Transcriptions directory not found: {self.transcriptions_base}") - return items - - for season_dir in sorted(self.transcriptions_base.glob("S*")): - if not season_dir.is_dir(): - continue - - for episode_dir in sorted(season_dir.glob("E*")): - if not episode_dir.is_dir(): - continue - - clean_subdir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - clean_txt_files = list(clean_subdir.glob("*_clean_transcription.txt")) - if not clean_txt_files: - continue - txt_file = clean_txt_files[0] - - episode_info = self.episode_manager.parse_filename(txt_file) - if not episode_info: - self.logger.error(f"Cannot parse episode info from {txt_file.name}") - continue - - episode_id = 
EpisodeManager.get_episode_id_for_state(episode_info) - - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=txt_file, - metadata={ - "episode_info": episode_info, - "episode_dir": episode_dir, - }, - ), - ) - - return items - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_dir = item.metadata["episode_dir"] - episode_info = item.metadata["episode_info"] - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - output_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="text_stats", - ) - output_file = clean_dir / output_filename - - return [OutputSpec(path=output_file, required=True)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - txt_file = item.input_path - episode_dir = item.metadata["episode_dir"] - episode_info = item.metadata["episode_info"] - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - - output_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="text_stats", - ) - output_file = clean_dir / output_filename - - try: - stats = TextStatistics.from_file(txt_file, language=self.language) - - result = { - "metadata": { - "episode_id": episode_info.episode_code(), - "language": self.language, - "source_file": txt_file.name, - "analyzed_at": datetime.now().isoformat(), - }, - **stats.to_dict(), - } - - atomic_write_json(output_file, result) - - self.logger.info( - f"Text analysis completed for {item.episode_id}: " - f"{stats.words} words, {stats.sentences} sentences", - ) - - except Exception as e: - self.logger.error(f"Failed to analyze {txt_file.name}: {e}") - raise - - def _get_progress_description(self) -> str: - return f"Analyzing transcription texts ({self.language})" diff --git a/preprocessor/processors/transcription_generator.py b/preprocessor/processors/transcription_generator.py 
deleted file mode 100644 index 13d914d84..000000000 --- a/preprocessor/processors/transcription_generator.py +++ /dev/null @@ -1,241 +0,0 @@ -import logging -from pathlib import Path -import tempfile -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.transcription.generators.multi_format_generator import MultiFormatGenerator -from preprocessor.transcription.processors.audio_normalizer import AudioNormalizer -from preprocessor.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor -from preprocessor.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer - - -@register_processor("transcribe") -class TranscriptionGenerator(BaseProcessor): - REQUIRES = ["videos"] - PRODUCES = ["transcriptions"] - PRIORITY = 20 - DESCRIPTION = "Generate transcriptions using Whisper" - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=2, - loglevel=logging.DEBUG, - ) - - self.input_videos: Path = Path(self._args["videos"]) - self.series_name_lower: str = self._args.get("name", "unknown").lower() - self.episodes_info_json: Path = Path(self._args["episodes_info_json"]) - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name_lower) - - self.temp_dir = None - self.audio_normalizer = None - self.audio_processor = None - self.multi_format_generator = None - self.unicode_fixer = None - self.final_output_dir = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "videos" not in args: - raise ValueError("videos path is required") - if "episodes_info_json" not in args: - raise ValueError("episodes_info_json is required") - - videos_path = 
Path(args["videos"]) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def _get_processing_items(self) -> List[ProcessingItem]: - if self.__check_all_transcriptions_exist(): - return [] - - return [ - ProcessingItem( - episode_id="transcription_batch", - input_path=self.input_videos, - metadata={}, - ), - ] - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - video_files = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - outputs = [] - - for video_file in video_files: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - continue - - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename - expected_file.parent.mkdir(parents=True, exist_ok=True) - - segmented_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_segmented", - ) - segmented_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / segmented_filename - segmented_file.parent.mkdir(parents=True, exist_ok=True) - - if not expected_file.exists() and not segmented_file.exists(): - outputs.append(OutputSpec(path=expected_file, required=True)) - - return outputs - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - ramdisk_path = self._args.get("ramdisk_path") - if ramdisk_path and Path(ramdisk_path).exists(): - self.temp_dir = tempfile.TemporaryDirectory(dir=str(ramdisk_path)) - else: - self.temp_dir = tempfile.TemporaryDirectory() # pylint: 
disable=consider-using-with - - try: - missing_video_files = self.__get_missing_video_files(missing_outputs) - self.__init_workers(self._args, missing_video_files) - - self.logger.info("Step 1/3: Normalizing audio from videos...") - self.audio_normalizer() - - self.logger.info("Step 2/3: Generating transcriptions with Whisper...") - self.audio_processor() - - self.logger.info("Cleaning up Whisper model...") - self.audio_processor.cleanup() - - self.logger.info("Step 3/4: Generating multi-format output...") - self.multi_format_generator() - - self.logger.info("Step 4/4: Fixing unicode escapes in transcriptions...") - self.unicode_fixer() - - except (RuntimeError, OSError, ValueError) as e: - self.logger.error(f"Error generating transcriptions: {e}") - finally: - if self.temp_dir: - self.temp_dir.cleanup() - - def __check_all_transcriptions_exist(self) -> bool: - if not self.episodes_info_json.exists(): - self.logger.debug(f"Episodes info JSON not found: {self.episodes_info_json}") - return False - - video_files = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - if not video_files: - self.logger.debug("No video files found to check") - return False - - missing_files = [] - for video_file in video_files: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - continue - - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename - expected_file.parent.mkdir(parents=True, exist_ok=True) - - segmented_filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="_segmented", - ) - segmented_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / 
"raw" / segmented_filename - segmented_file.parent.mkdir(parents=True, exist_ok=True) - - if not expected_file.exists() and not segmented_file.exists(): - missing_files.append(f"{video_file.name} -> {expected_file}") - - if missing_files: - self.logger.debug(f"Missing {len(missing_files)} transcription(s), first: {missing_files[0]}") - return False - - self.logger.info(f"All transcriptions already exist for {len(video_files)} video(s)") - return True - - def __get_missing_video_files(self, missing_outputs: List[OutputSpec]) -> List[Path]: - video_files = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f"*{ext}")) - - missing_video_files = [] - - for video_file in video_files: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - continue - - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - expected_file = self.path_manager.base_output_dir / self.get_output_subdir() / season_code / episode_code / "raw" / filename - expected_file.parent.mkdir(parents=True, exist_ok=True) - - if any(expected_file == output.path for output in missing_outputs): - missing_video_files.append(video_file) - - return missing_video_files - - def __init_workers(self, args: Dict[str, Any], video_files: List[Path]) -> None: - temp_dir_path: Path = Path(self.temp_dir.name) / "transcription_generator" - normalizer_output: Path = temp_dir_path / "normalizer" - processor_output: Path = temp_dir_path / "processor" - - self.final_output_dir: Path = Path(args["transcription_jsons"]) - - audio_files = [normalizer_output / video.with_suffix(".wav").name for video in video_files] - - self.audio_normalizer: AudioNormalizer = AudioNormalizer( - input_videos=self.input_videos, - output_dir=normalizer_output, - logger=self.logger, - video_files=video_files if video_files else None, - ) - - 
self.audio_processor: NormalizedAudioProcessor = NormalizedAudioProcessor( - input_audios=normalizer_output, - output_dir=processor_output, - logger=self.logger, - language=args["language"], - model=args["model"], - device=args["device"], - audio_files=audio_files if audio_files else None, - ) - - self.multi_format_generator: MultiFormatGenerator = MultiFormatGenerator( - jsons_dir=processor_output, - episodes_info_json=self.episodes_info_json, - output_base_path=self.final_output_dir, - logger=self.logger, - series_name=args["name"], - ) - - self.unicode_fixer: TranscriptionUnicodeFixer = TranscriptionUnicodeFixer({ - "transcription_jsons": self.final_output_dir, - "episodes_info_json": self.episodes_info_json, - "name": args["name"], - }) diff --git a/preprocessor/processors/transcription_importer.py b/preprocessor/processors/transcription_importer.py deleted file mode 100644 index 4eda52515..000000000 --- a/preprocessor/processors/transcription_importer.py +++ /dev/null @@ -1,238 +0,0 @@ -import json -import logging -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.processor_registry import register_processor -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.console import ( - console, - create_progress, -) - - -@register_processor("import_transcriptions") -class TranscriptionImporter(BaseProcessor): - REQUIRES = [] - PRODUCES = ["transcriptions"] - PRIORITY = 15 - DESCRIPTION = "Import external transcriptions" - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "source_dir" not in args: - raise ValueError("source_dir is required") - if "output_dir" not in args: - raise ValueError("output_dir is required") - if "series_name" not in args: - raise ValueError("series_name is required") - - source_dir = Path(args["source_dir"]) - if not 
source_dir.exists(): - raise FileNotFoundError(f"Source directory not found: {source_dir}") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=4, - loglevel=logging.DEBUG, - ) - - self.source_dir: Path = Path(self._args["source_dir"]) - self.output_dir: Path = Path(self._args["output_dir"]) - self.episodes_info_json: Optional[Path] = self._args.get("episodes_info_json") - self.format_type: str = self._args.get("format_type", "11labs_segmented") - - self.output_dir.mkdir(parents=True, exist_ok=True) - - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name) - - def _execute(self) -> None: - json_files = self.__find_transcription_files() - - if not json_files: - self.logger.warning(f"No transcription files found in {self.source_dir}") - return - - console.print(f"[blue]Found {len(json_files)} transcription files to import[/blue]") - - try: - with create_progress() as progress: - task = progress.add_task("Importing transcriptions...", total=len(json_files)) - - for json_file in json_files: - episode_id = self.__extract_episode_id(json_file) - - if self.state_manager and self.state_manager.is_step_completed("import", episode_id): - console.print(f"[yellow]Skipping (already imported): {episode_id}[/yellow]") - progress.advance(task) - continue - - if self.state_manager: - self.state_manager.mark_step_started("import", episode_id) - - try: - self.__import_single_file(json_file) - if self.state_manager: - self.state_manager.mark_step_completed("import", episode_id) - except Exception as e: - self.logger.error(f"Failed to import {json_file.name}: {e}") - - progress.advance(task) - except KeyboardInterrupt: - console.print("\n[yellow]Import interrupted[/yellow]") - raise - - def __find_transcription_files(self) -> List[Path]: - if self.format_type == "11labs_segmented": - 
pattern = "*_segmented.json" - elif self.format_type == "11labs": - pattern = "*.json" - else: - pattern = "*.json" - - files = sorted(self.source_dir.rglob(pattern)) - files = [f for f in files if not f.name.startswith('.')] - - return files - - @staticmethod - def __extract_episode_id(file_path: Path) -> str: - match = re.search(r'S(\d+)E(\d+)', file_path.name, re.IGNORECASE) - if match: - return f"S{match.group(1)}E{match.group(2)}" - - match = re.search(r'E(\d+)', file_path.stem, re.IGNORECASE) - if match: - return f"E{match.group(1)}" - - return file_path.stem - - def __import_single_file(self, json_file: Path) -> None: - with open(json_file, "r", encoding="utf-8") as f: - source_data = json.load(f) - - if self.format_type == "11labs_segmented": - converted_data = self.__convert_11labs_segmented(source_data, json_file) - elif self.format_type == "11labs": - converted_data = self.__convert_11labs_full(source_data, json_file) - else: - self.logger.error(f"Unknown format type: {self.format_type}") - return - - episode_info = self.episode_manager.parse_filename(json_file) - if not episode_info: - season_num, episode_num = self.__extract_season_episode_fallback(json_file) - episode_info = self.episode_manager.get_episode_by_season_and_relative(season_num, episode_num) - - if episode_info: - converted_data["episode_info"] = EpisodeManager.get_metadata(episode_info) - - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") - season_dir = self.output_dir / episode_info.season_code() - output_file = season_dir / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(converted_data, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Imported: {json_file.name} -> {output_file.name}") - - @staticmethod - def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments = [] - - for i, segment in 
enumerate(data.get("segments", [])): - converted_segment = { - "id": i, - "start": segment.get("start"), - "end": segment.get("end"), - "text": segment.get("text", ""), - "speaker": segment.get("speaker", "unknown"), - "words": segment.get("words", []), - } - segments.append(converted_segment) - - return { - "transcription": { - "format": "11labs_segmented", - "source_file": source_file.name, - "segments": segments, - }, - "segments": segments, - } - - @staticmethod - def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments = [] - words = data.get("words", []) - - current_segment = { - "words": [], - "start": None, - "end": None, - "text": "", - "speaker": "unknown", - } - - for word in words: - if current_segment["start"] is None: - current_segment["start"] = word.get("start") - - current_segment["words"].append(word) - current_segment["end"] = word.get("end") - - if word.get("text", "").endswith((".", "!", "?")) or len(current_segment["words"]) >= 20: - current_segment["text"] = " ".join(w.get("text", "") for w in current_segment["words"]) - segments.append(dict(current_segment)) - current_segment = { - "words": [], - "start": None, - "end": None, - "text": "", - "speaker": word.get("speaker_id", "unknown"), - } - - if current_segment["words"]: - current_segment["text"] = " ".join(w.get("text", "") for w in current_segment["words"]) - segments.append(current_segment) - - for i, seg in enumerate(segments): - seg["id"] = i - - return { - "transcription": { - "format": "11labs", - "source_file": source_file.name, - "language_code": data.get("language_code", "pol"), - "language_probability": data.get("language_probability", 1.0), - }, - "segments": segments, - } - - @staticmethod - def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: - match = re.search(r'S(\d+)E(\d+)', file_path.name, re.IGNORECASE) - if match: - return int(match.group(1)), int(match.group(2)) - - parent_match = re.search(r'S(\d+)', 
file_path.parent.name, re.IGNORECASE) - if parent_match: - season = int(parent_match.group(1)) - episode_match = re.search(r'E(\d+)', file_path.name, re.IGNORECASE) - if episode_match: - return season, int(episode_match.group(1)) - - return 1, 1 diff --git a/preprocessor/processors/video_transcoder.py b/preprocessor/processors/video_transcoder.py deleted file mode 100644 index 21070a2e4..000000000 --- a/preprocessor/processors/video_transcoder.py +++ /dev/null @@ -1,268 +0,0 @@ -import json -import logging -import os -from pathlib import Path -import subprocess -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import DEFAULT_VIDEO_EXTENSION -from preprocessor.core.processor_registry import register_processor -from preprocessor.core.video_processor import VideoProcessor -from preprocessor.utils.constants import ( - FfprobeKeys, - FfprobeStreamKeys, -) -from preprocessor.utils.resolution import Resolution - - -@register_processor("transcode") -class VideoTranscoder(VideoProcessor): - REQUIRES = ["videos"] - PRODUCES = ["transcoded_videos"] - PRIORITY = 10 - DESCRIPTION = "Transcode videos to H.264 with consistent format" - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=3, - loglevel=logging.DEBUG, - ) - - self.resolution: Resolution = self._args["resolution"] - self.codec: str = str(self._args["codec"]) - self.preset: str = "p7" - self.video_bitrate_mbps: Optional[float] = self._args.get("video_bitrate_mbps") - self.minrate_mbps: Optional[float] = self._args.get("minrate_mbps") - self.maxrate_mbps: Optional[float] = self._args.get("maxrate_mbps") - self.bufsize_mbps: Optional[float] = self._args.get("bufsize_mbps") - self.audio_bitrate_kbps: int = int(self._args.get("audio_bitrate_kbps", 128)) - 
self.gop_size: float = float(self._args["gop_size"]) - - def _validate_args(self, args: Dict[str, Any]) -> None: - self._validate_videos_required(args) - if "resolution" not in args: - raise ValueError("resolution is required") - if "codec" not in args: - raise ValueError("codec is required") - if "gop_size" not in args: - raise ValueError("gop_size is required") - if "transcoded_videos" not in args: - raise ValueError("transcoded_videos is required") - if "video_bitrate_mbps" not in args or args["video_bitrate_mbps"] is None: - raise ValueError("video_bitrate_mbps is required for VBR mode") - if "minrate_mbps" not in args or args["minrate_mbps"] is None: - raise ValueError("minrate_mbps is required for VBR mode") - if "maxrate_mbps" not in args or args["maxrate_mbps"] is None: - raise ValueError("maxrate_mbps is required for VBR mode") - if "bufsize_mbps" not in args or args["bufsize_mbps"] is None: - raise ValueError("bufsize_mbps is required for VBR mode") - - videos_path = Path(args["videos"]) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.video - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - filename = f"{self.series_name}_{episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}" - output_path = self._build_season_path(episode_info, filename) - return [OutputSpec(path=output_path, required=True)] - - def _get_temp_files(self, item: ProcessingItem) -> List[str]: - expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return [] - temp_path = expected_outputs[0].path.with_suffix('.mp4.tmp') - return [str(temp_path)] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - video_file = item.input_path - output_path = missing_outputs[0].path - temp_path = 
output_path.with_suffix('.mp4.tmp') - - try: - temp_path.parent.mkdir(parents=True, exist_ok=True) - self.__transcode_video(video_file, temp_path) - temp_path.replace(output_path) - self.logger.info(f"Processed: {video_file} -> {output_path}") - except subprocess.CalledProcessError as e: - self.logger.error(f"FFmpeg failed for {video_file}: {e}") - if temp_path.exists(): - temp_path.unlink() - raise - except Exception as e: - self.logger.error(f"Unexpected error during transcoding {video_file}: {e}") - if temp_path.exists(): - temp_path.unlink() - raise - - def __transcode_video(self, input_video: Path, output_video: Path) -> None: - input_fps = self.__get_framerate(input_video) - input_video_bitrate = self.__get_video_bitrate(input_video) - input_audio_bitrate = self.__get_audio_bitrate(input_video) - - target_fps = min(input_fps, 30.0) - if target_fps < input_fps: - self.logger.info( - f"Input FPS ({input_fps}) > 30. Limiting to {target_fps} FPS for compatibility and smaller file size.", - ) - - video_bitrate = self.video_bitrate_mbps - minrate = self.minrate_mbps - maxrate = self.maxrate_mbps - bufsize = self.bufsize_mbps - - if input_video_bitrate and input_video_bitrate < video_bitrate: - adjusted_bitrate = min(input_video_bitrate * 1.05, video_bitrate) - ratio = adjusted_bitrate / video_bitrate - video_bitrate = adjusted_bitrate - minrate = round(minrate * ratio, 2) - maxrate = round(maxrate * ratio, 2) - bufsize = round(bufsize * ratio, 2) - self.logger.info( - f"Input video bitrate ({input_video_bitrate} Mbps) < target ({self.video_bitrate_mbps} Mbps). 
" - f"Adjusted to {video_bitrate} Mbps to avoid quality loss.", - ) - - audio_bitrate = self.audio_bitrate_kbps - if input_audio_bitrate and input_audio_bitrate < audio_bitrate: - adjusted_audio_bitrate = min(int(input_audio_bitrate * 1.05), audio_bitrate) - audio_bitrate = adjusted_audio_bitrate - self.logger.info( - f"Input audio bitrate ({input_audio_bitrate} kbps) < target ({self.audio_bitrate_kbps} kbps). " - f"Adjusted to {audio_bitrate} kbps to avoid quality loss.", - ) - - vf_filter = ( - "scale='iw*sar:ih'," - f"scale={self.resolution.width}:{self.resolution.height}:force_original_aspect_ratio=decrease," - f"pad={self.resolution.width}:{self.resolution.height}:(ow-iw)/2:(oh-ih)/2:black," - "setsar=1" - ) - - command = [ - "ffmpeg", - "-v", "error", - "-stats", - "-hide_banner", - "-y", - "-i", str(input_video), - "-c:v", self.codec, - "-preset", self.preset, - "-profile:v", "main", - "-level", "4.1", - "-pix_fmt", "yuv420p", - ] - - if target_fps < input_fps: - command.extend(["-r", str(target_fps)]) - - command.extend([ - "-rc", "vbr_hq", - "-b:v", f"{video_bitrate}M", - "-minrate", f"{minrate}M", - "-maxrate", f"{maxrate}M", - "-bufsize", f"{bufsize}M", - "-bf", "2", - "-b_adapt", "1", - "-2pass", "1", - "-rc-lookahead", "32", - "-aq-strength", "15", - ]) - - command.extend([ - "-g", str(int(target_fps * self.gop_size)), - "-spatial-aq", "1", - "-temporal-aq", "1", - "-multipass", "fullres", - "-c:a", "aac", - "-b:a", f"{audio_bitrate}k", - "-ac", "2", - "-vf", vf_filter, - "-movflags", "+faststart", - "-f", "mp4", - str(output_video), - ]) - - self.logger.debug(f"Transcoding: {input_video.name} -> {output_video.name}") - self.logger.debug(f"FFmpeg command: {' '.join(command)}") - self.logger.debug(f"LD_LIBRARY_PATH: {os.environ.get('LD_LIBRARY_PATH', 'not set')[:200]}") - - try: - subprocess.run(command, check=True, capture_output=False, text=True) - except subprocess.CalledProcessError as e: - self.logger.error(f"FFmpeg failed with exit code: 
{e.returncode}") - raise - - @staticmethod - def __get_framerate(video: Path) -> float: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=r_frame_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get(FfprobeKeys.STREAMS, []) - if not streams: - raise ValueError(f"No video streams found in {video}") - r_frame_rate: Optional[str] = streams[0].get(FfprobeStreamKeys.R_FRAME_RATE) - if not r_frame_rate: - raise ValueError(f"Frame rate not found in {video}") - num, denom = [int(x) for x in r_frame_rate.split("/")] - - return num / denom - - @staticmethod - def __get_video_bitrate(video: Path) -> Optional[float]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "v:0", - "-show_entries", "stream=bit_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get(FfprobeKeys.STREAMS, []) - if not streams: - return None - bit_rate = streams[0].get(FfprobeStreamKeys.BIT_RATE) - if not bit_rate: - return None - return round(int(bit_rate) / 1_000_000, 2) - - @staticmethod - def __get_audio_bitrate(video: Path) -> Optional[int]: - cmd = [ - "ffprobe", "-v", "error", - "-select_streams", "a:0", - "-show_entries", "stream=bit_rate", - "-of", "json", - str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get(FfprobeKeys.STREAMS, []) - if not streams: - return None - bit_rate = streams[0].get(FfprobeStreamKeys.BIT_RATE) - if not bit_rate: - return None - return int(int(bit_rate) / 1000) diff --git a/preprocessor/prompts/extract_all_seasons_system.py 
b/preprocessor/prompts/extract_all_seasons_system.py deleted file mode 100644 index 7a76f3274..000000000 --- a/preprocessor/prompts/extract_all_seasons_system.py +++ /dev/null @@ -1,54 +0,0 @@ -def get() -> str: - return """You are extracting episode data from TV series wiki pages. -Your task is to find tables or lists containing episode information and extract the EXACT data. - -Look for patterns like: -Nr | Tytuł | Premiera | Oglądalność -1 | _[Episode Title]_ | 05.03.2006 | 4 396 564 - -CRITICAL RULES: -1. Extract EXACT titles from the table - do NOT make up generic titles like "Odcinek 1" -2. Extract EXACT premiere dates as shown - do NOT invent dates -3. If premiere date contains multiple dates separated by "/" (e.g., "31.12.2008"), extract ONLY the FIRST date: "31.12.2008" -4. Extract EXACT viewership numbers - remove spaces: "4 396 564" -> 4396564 -5. If episode number is in format like "E12" or "S01E12", extract just the number: 12 -6. Do NOT hallucinate or make up any data - only extract what you see - -IMPORTANT: Each episode must have TWO numbers: -- episode_in_season: The episode number within its season (resets to 1 for each season) -- overall_episode_number: The absolute episode number across all seasons (continues counting) - -Example extraction from this markdown: -``` -Sezon 1: -Nr | Tytuł | Premiera | Oglądalność -1 | _[Spadek]_ | 05.03.2006 | 4 396 564 -2 | _[Goście z zaświatów]_ | 12.03.2006 | 4 308 423 - -Sezon 2: -Nr | Tytuł | Premiera | Oglądalność -14 | _[Sztuka i władza]_ | 18.03.2007 | 6 993 951 -15 | _[Gmina to ja]_ | 25.03.2007 | 6 754 211 -``` - -Should produce: -{ - "seasons": [ - { - "season_number": 1, - "episodes": [ - {"episode_in_season": 1, "overall_episode_number": 1, "title": "Spadek", "premiere_date": "05.03.2006", "viewership": "4396564"}, - {"episode_in_season": 2, "overall_episode_number": 2, "title": "Goście z zaświatów", "premiere_date": "12.03.2006", "viewership": "4308423"} - ] - }, - { - "season_number": 2, - 
"episodes": [ - {"episode_in_season": 1, "overall_episode_number": 14, "title": "Sztuka i władza", "premiere_date": "18.03.2007", "viewership": "6993951"}, - {"episode_in_season": 2, "overall_episode_number": 15, "title": "Gmina to ja", "premiere_date": "25.03.2007", "viewership": "6754211"} - ] - } - ] -} - -Return ONLY valid JSON. Extract ONLY what you see, do NOT invent data.""" diff --git a/preprocessor/prompts/extract_all_seasons_user.py b/preprocessor/prompts/extract_all_seasons_user.py deleted file mode 100644 index 2c07ac986..000000000 --- a/preprocessor/prompts/extract_all_seasons_user.py +++ /dev/null @@ -1,7 +0,0 @@ -def get() -> str: - return """Extract ALL episodes from ALL {num_sources} sources below. -Return a complete list of ALL seasons found. - -{combined_content} - -Extract ALL seasons and episodes from above sources.""" diff --git a/preprocessor/prompts/extract_characters_system.py b/preprocessor/prompts/extract_characters_system.py deleted file mode 100644 index 5662557f1..000000000 --- a/preprocessor/prompts/extract_characters_system.py +++ /dev/null @@ -1,120 +0,0 @@ -def get() -> str: - return """You are an expert at extracting character information from TV series documentation and wikis. - -Your task is to analyze scraped web pages and extract a COMPLETE list of ALL characters from a TV series. - -For each character, extract ONLY the name (full name if available, otherwise commonly used name). - -### RULES FOR EXTRACTION: - -1. **Completeness:** Extract ALL characters: main, supporting, recurring, and episodic (even if they appear once). -2. **Source:** Extract ONLY what you see in the content. Do NOT invent characters. -3. **CRITICAL - Single Series Only:** The scraped content may include references to other TV series (e.g., in footers, sidebars, "See also" sections, or related links). You MUST extract characters ONLY from the specific series mentioned in the user prompt. IGNORE all characters from any other series. -4. 
**Multi-Source Deduplication:** When processing multiple sources: - - Merge character lists from all sources - - Remove duplicates (same character mentioned in multiple sources) - - If a character has different name variants across sources, use the most complete/formal version - - Combine information to get the most accurate character list -5. **Naming:** Use the Polish name if the series is Polish. If a character has multiple aliases, use the most formal/common one. - -6. **Text Cleaning (CRITICAL):** - - Remove ALL special characters that are not letters (e.g., quotes `"`, brackets `()`, hyphens `-` inside titles, etc.). - - Remove actor names typically found in brackets. - - The final output string must contain **ONLY letters (including Polish diacritics: ą, ć, ę, ł, ń, ó, ś, ź, ż) and spaces**. - - Do not leave trailing periods after expanding titles. - -7. **ABBREVIATION EXPANSION (Mandatory):** - You MUST expand ALL abbreviations to their full Polish forms. - **IMPORTANT:** Process compound abbreviations (2+ words) BEFORE single word abbreviations. - - **Ecclesiastical (Religious):** - - ks. prob. / ks.prob. -> Ksiądz Proboszcz - - ks. wik. / ks.wik. -> Ksiądz Wikariusz - - ks. kan. -> Ksiądz Kanonik - - ks. bp -> Ksiądz Biskup - - ks. kard. -> Ksiądz Kardynał - - ks. -> Ksiądz - - o. -> Ojciec (e.g., Ojciec Mateusz) - - s. -> Siostra - - br. -> Brat - - bp -> Biskup - - abp -> Arcybiskup - - kard. -> Kardynał - - pap. -> Papież - - wik. -> Wikariusz - - prob. -> Proboszcz - - **Academic & Medical:** - - dr hab. -> Doktor habilitowany - - prof. nadzw. -> Profesor nadzwyczajny - - prof. zw. -> Profesor zwyczajny - - prof. -> Profesor - - dr -> Doktor - - mgr -> Magister - - inż. -> Inżynier - - lek. med. / lek. -> Lekarz - - doc. -> Docent - - piel. -> Pielęgniarka / Pielęgniarz - - **Military, Police & Services:** - - nadkom. -> Nadkomisarz - - podkom. -> Podkomisarz - - kom. -> Komisarz - - asp. sztab. -> Aspirant sztabowy - - asp. -> Aspirant - - st. post. 
-> Starszy posterunkowy - - post. -> Posterunkowy - - sierż. -> Sierżant - - gen. -> Generał - - płk -> Pułkownik - - ppłk -> Podpułkownik - - mjr -> Major - - kpt. -> Kapitan - - por. -> Porucznik - - ppor. -> Podporucznik - - **Legal, Political & Administrative:** - - mec. -> Mecenas - - prok. -> Prokurator - - sędz. -> Sędzia - - dyr. -> Dyrektor - - prez. -> Prezydent - - min. -> Minister - - sen. -> Senator - - pos. -> Poseł - - przew. -> Przewodniczący - - z-ca -> Zastępca - - **Other:** - - red. -> Redaktor - - *If you encounter an abbreviation not listed here, expand it to its correct full Polish form based on context.* - -### EXAMPLE EXTRACTION: - -Source 1: -``` -Główni bohaterowie: -- ks. prob. Krzysztof Robert (Artur Żmijewski) -- Lucy Wilska (Ilona Ostrowska) -``` - -Source 2: -``` -Postacie: -- Ksiądz Proboszcz Krzysztof Robert -- dr Cezary Pazura -- kom. Paweł Kozioł -``` - -Should produce (deduplicated and cleaned): -{ - "characters": [ - {"name": "Ksiądz Proboszcz Krzysztof Robert"}, - {"name": "Lucy Wilska"}, - {"name": "Doktor Cezary Pazura"}, - {"name": "Komisarz Paweł Kozioł"} - ] -} - -Return ONLY valid JSON.""" diff --git a/preprocessor/prompts/extract_characters_user.py b/preprocessor/prompts/extract_characters_user.py deleted file mode 100644 index 3b8e738e0..000000000 --- a/preprocessor/prompts/extract_characters_user.py +++ /dev/null @@ -1,14 +0,0 @@ -def get() -> str: - return """Extract ALL characters from the TV series "{series_name}" from ALL {num_sources} source(s) below. - -**CRITICAL:** Multiple sources may have overlapping or complementary character lists. 
-- Merge and deduplicate characters across all sources -- Extract ONLY characters from "{series_name}" (ignore other series mentioned in footers/sidebars) -- Return a single unified list - -Here is the content from all sources combined: - -{combined_content} - ---- -Extract ALL characters from "{series_name}" found in the content above.""" diff --git a/preprocessor/prompts/extract_episode_metadata_system.py b/preprocessor/prompts/extract_episode_metadata_system.py deleted file mode 100644 index 23863e172..000000000 --- a/preprocessor/prompts/extract_episode_metadata_system.py +++ /dev/null @@ -1,21 +0,0 @@ -# pylint: disable=duplicate-code -def get() -> str: - return """Extract episode information from the provided web page content. -Focus on finding: -- Episode title (exact title, not description) -- Episode description (1-2 sentences summarizing the plot) -- Episode summary (detailed summary, 3-5 sentences) -- Season number (if mentioned) -- Episode number (if mentioned) - -If information is missing, use empty string for text fields and null for numbers. -Be precise and extract only factual information from the text. 
- -Return ONLY valid JSON matching this schema: -{ - "title": str, - "description": str, - "summary": str, - "season": int or null, - "episode_number": int or null -}""" diff --git a/preprocessor/prompts/extract_episode_metadata_user.py b/preprocessor/prompts/extract_episode_metadata_user.py deleted file mode 100644 index a12f42fc4..000000000 --- a/preprocessor/prompts/extract_episode_metadata_user.py +++ /dev/null @@ -1,7 +0,0 @@ -def get() -> str: - return """URL: {url} - -Page content: -{page_text} - -Extract the episode metadata from above.""" diff --git a/preprocessor/prompts/extract_season_system.py b/preprocessor/prompts/extract_season_system.py deleted file mode 100644 index 477fe35ba..000000000 --- a/preprocessor/prompts/extract_season_system.py +++ /dev/null @@ -1,26 +0,0 @@ -def get() -> str: - return """You are extracting episode data from a TV series page. -Extract ALL episodes you can find on the page. Look for tables, lists, or any structured data. - -For each episode extract: -- episode_in_season: The episode number within its season (1, 2, 3... resets each season) -- overall_episode_number: The absolute episode number across all seasons (continues counting) -- title: string (clean title without markdown formatting) -- premiere_date: string (date format as found on page; if multiple dates separated by "/" like "31.12.2008", extract ONLY the FIRST date: "31.12.2008") -- viewership: string (remove spaces from numbers like "4 396 564" -> "4396564", use null if not available) - -The season number should be determined from the page content or URL. 
- -Return ONLY valid JSON matching this schema: -{ - "season_number": int, - "episodes": [ - { - "episode_in_season": int, - "overall_episode_number": int, - "title": str, - "premiere_date": str, - "viewership": str - } - ] -}""" diff --git a/preprocessor/prompts/extract_season_user.py b/preprocessor/prompts/extract_season_user.py deleted file mode 100644 index 7f41256f6..000000000 --- a/preprocessor/prompts/extract_season_user.py +++ /dev/null @@ -1,7 +0,0 @@ -def get() -> str: - return """URL: {url} - -Page content (markdown): -{page_text} - -Extract ALL episodes from this page and return as JSON.""" diff --git a/preprocessor/prompts/merge_episode_data_system.py b/preprocessor/prompts/merge_episode_data_system.py deleted file mode 100644 index 12048c1ca..000000000 --- a/preprocessor/prompts/merge_episode_data_system.py +++ /dev/null @@ -1,19 +0,0 @@ -# pylint: disable=duplicate-code -def get() -> str: - return """You are merging episode information from multiple sources. -Create a single, accurate metadata entry by: -- Choosing the most complete and accurate title -- Combining descriptions into a coherent 1-2 sentence description -- Merging summaries into a comprehensive 3-5 sentence summary -- Using the most reliable season/episode numbers - -Prefer longer, more detailed information when merging. 
- -Return ONLY valid JSON matching this schema: -{ - "title": str, - "description": str, - "summary": str, - "season": int or null, - "episode_number": int or null -}""" diff --git a/preprocessor/prompts/merge_episode_data_user.py b/preprocessor/prompts/merge_episode_data_user.py deleted file mode 100644 index 39e5c32af..000000000 --- a/preprocessor/prompts/merge_episode_data_user.py +++ /dev/null @@ -1,6 +0,0 @@ -def get() -> str: - return """Merge the following episode metadata from {num_sources} sources: - -{combined_text} - -Create a single, unified metadata entry.""" diff --git a/preprocessor/scraping/base_scraper.py b/preprocessor/scraping/base_scraper.py deleted file mode 100644 index 6949dd3e2..000000000 --- a/preprocessor/scraping/base_scraper.py +++ /dev/null @@ -1,112 +0,0 @@ -from abc import abstractmethod -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from rich.progress import Progress - -from preprocessor.config.config import settings -from preprocessor.config.llm_provider import LLMProvider -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.core.enums import ( - ParserMode, - ScraperMethod, -) -from preprocessor.scraping.clipboard import ScraperClipboard -from preprocessor.scraping.crawl4ai import ScraperCrawl4AI -from preprocessor.utils.console import ( - console, - create_progress, -) - - -class BaseScraper(BaseProcessor): - def __init__(self, args: Dict[str, Any], error_exit_code: int = 7): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=error_exit_code, - loglevel=logging.DEBUG, - ) - - self.urls: List[str] = self._args["urls"] - self.output_file: Path = self._args["output_file"] - self.headless: bool = self._args.get("headless", True) - - scraper_method_str = self._args.get("scraper_method", "crawl4ai") - self.scraper_method = ScraperMethod(scraper_method_str) - - parser_mode_str = self._args.get("parser_mode", "normal") 
- self.parser_mode = ParserMode(parser_mode_str) - - self.llm: Optional[LLMProvider] = None - - def _validate_args(self, args: Dict[str, Any]) -> None: - if "urls" not in args or not args["urls"]: - raise ValueError("At least one URL is required") - if "output_file" not in args: - raise ValueError("output_file is required") - - def _execute(self) -> None: - self.llm = LLMProvider(parser_mode=self.parser_mode) - - console.print(f"[blue]Scraping {len(self.urls)} URLs...[/blue]") - - scraped_pages = self.__scrape_all_urls() - - if not scraped_pages: - console.print("[yellow]No pages scraped[/yellow]") - return - - console.print(f"[blue]Scraped {len(scraped_pages)} pages, processing with LLM...[/blue]") - - try: - self._process_scraped_pages(scraped_pages) - except Exception as e: - self.logger.error(f"LLM processing failed: {e}") - - def __scrape_all_urls(self) -> List[Dict[str, Any]]: - scraped_pages = [] - try: - with create_progress() as progress: - task = progress.add_task("Fetching pages", total=len(self.urls)) - - for url in self.urls: - try: - page_text = self.__scrape_url(url, progress) - if page_text: - scraped_pages.append({ - "url": url, - "markdown": page_text, - }) - progress.console.print(f"[green]✓[/green] {url}: {len(page_text)} chars") - else: - self.logger.error(f"Failed to scrape {url}") - except Exception as e: - self.logger.error(f"Error scraping {url}: {e}") - finally: - progress.advance(task) - except KeyboardInterrupt: - console.print("\n[yellow]Scraping interrupted[/yellow]") - raise - - return scraped_pages - - def __scrape_url(self, url: str, progress: "Progress") -> Optional[str]: - progress.console.print(f"[cyan]Scraping method: {self.scraper_method.value}[/cyan]") - - if self.scraper_method == ScraperMethod.CLIPBOARD: - return ScraperClipboard.scrape(url, headless=self.headless) - if self.scraper_method == ScraperMethod.CRAWL4AI: - return ScraperCrawl4AI.scrape(url, save_markdown=True, 
output_dir=settings.scraper.get_output_dir(self.series_name)) - self.logger.error(f"Unknown scraper method: {self.scraper_method}") - return None - - @abstractmethod - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - pass diff --git a/preprocessor/scraping/character_scraper.py b/preprocessor/scraping/character_scraper.py deleted file mode 100644 index 6c0f99a3b..000000000 --- a/preprocessor/scraping/character_scraper.py +++ /dev/null @@ -1,36 +0,0 @@ -import json -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.scraping.base_scraper import BaseScraper -from preprocessor.utils.console import console - - -class CharacterScraper(BaseScraper): - def __init__(self, args: Dict[str, Any]): - super().__init__(args) - self.series_name: str = self._args.get("series_name", "") - - def get_output_subdir(self) -> str: - return "scraped_pages" - - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - characters = self.llm.extract_characters(scraped_pages, self.series_name) - if not characters: - self.logger.error("LLM failed to extract any character data") - return - - result = { - "sources": [item["url"] for item in scraped_pages], - "characters": [char.model_dump() for char in characters], - } - - self.output_file.parent.mkdir(parents=True, exist_ok=True) - with open(self.output_file, "w", encoding="utf-8") as f: - json.dump(result, f, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Extracted {len(characters)} characters[/green]") - console.print(f"[green]✓ Saved to: {self.output_file}[/green]") diff --git a/preprocessor/scraping/clipboard.py b/preprocessor/scraping/clipboard.py deleted file mode 100644 index daba5f0ec..000000000 --- a/preprocessor/scraping/clipboard.py +++ /dev/null @@ -1,37 +0,0 @@ -import logging -from typing import Optional - -from patchright.sync_api import sync_playwright - -logger = logging.getLogger(__name__) - - -class ScraperClipboard: - @staticmethod - def 
scrape(url: str, headless: bool = True) -> Optional[str]: - try: - with sync_playwright() as p: - browser = p.chromium.launch( - headless=headless, - args=[ - '--no-sandbox', - '--disable-dev-shm-usage', - '--disable-gpu', - ], - ) - context = browser.new_context() - page = context.new_page() - - page.goto(url, wait_until="networkidle", timeout=30000) - - page.keyboard.press("Control+A") - page.keyboard.press("Control+C") - - clipboard_text = page.evaluate("navigator.clipboard.readText()") - - browser.close() - return clipboard_text - - except Exception as e: - logger.error(f"Clipboard scraping failed: {e}") - return None diff --git a/preprocessor/scraping/crawl4ai.py b/preprocessor/scraping/crawl4ai.py deleted file mode 100644 index 732ddb81d..000000000 --- a/preprocessor/scraping/crawl4ai.py +++ /dev/null @@ -1,64 +0,0 @@ -import asyncio -import logging -from pathlib import Path -from typing import Optional - -from crawl4ai import AsyncWebCrawler -from crawl4ai.async_configs import ( - BrowserConfig, - CrawlerRunConfig, -) -from pathvalidate import sanitize_filename -import ua_generator - -logger = logging.getLogger(__name__) - - -class ScraperCrawl4AI: - @staticmethod - def scrape(url: str, save_markdown: bool = False, output_dir: Optional[Path] = None) -> Optional[str]: - return asyncio.run(ScraperCrawl4AI.__scrape_async(url, save_markdown, output_dir)) - - @staticmethod - def __sanitize_url_to_filename(url: str) -> str: - return sanitize_filename(url.replace("://", "_").replace("/", "_")) - - @staticmethod - def __save_markdown(content: str, url: str, output_dir: Path) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - filename = ScraperCrawl4AI.__sanitize_url_to_filename(url) - md_file = output_dir / f"{filename}.md" - with open(md_file, "w", encoding="utf-8") as f: - f.write(content) - logger.info(f"Saved markdown to: {md_file}") - - @staticmethod - async def __scrape_async(url: str, save_markdown: bool = False, output_dir: Optional[Path] = None) -> 
Optional[str]: - try: - ua = ua_generator.generate() - browser_config = BrowserConfig( - headless=True, - enable_stealth=True, - viewport_width=1920, - viewport_height=1080, - user_agent=str(ua), - ) - run_config = CrawlerRunConfig( - wait_until="networkidle", - page_timeout=60000, - delay_before_return_html=2.0, - ) - - async with AsyncWebCrawler(config=browser_config) as crawler: - result = await crawler.arun(url=url, config=run_config) - - if result.success: - if save_markdown and output_dir: - ScraperCrawl4AI.__save_markdown(result.markdown, url, output_dir) - return result.markdown - logger.error(f"Crawl4AI failed: {result.error_message}") - return None - - except Exception as e: - logger.error(f"Crawl4AI error: {e}") - return None diff --git a/preprocessor/scraping/episode_scraper.py b/preprocessor/scraping/episode_scraper.py deleted file mode 100644 index 4e8ed2e3b..000000000 --- a/preprocessor/scraping/episode_scraper.py +++ /dev/null @@ -1,87 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from patchright.sync_api import sync_playwright # noqa: F401 # pylint: disable=unused-import - -from preprocessor.scraping.base_scraper import BaseScraper -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json - - -class EpisodeScraper(BaseScraper): - def __init__(self, args: Dict[str, Any]): - super().__init__(args) - self.merge_sources: bool = self._args.get("merge_sources", True) - self.expected_episodes_count: Optional[int] = self._args.get("expected_episodes_count") - self.videos_dir: Optional[Path] = self._args.get("videos_dir") - - def get_output_subdir(self) -> str: - return "scraped_pages" - - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - all_seasons = self.llm.extract_all_seasons(scraped_pages) - if not all_seasons: - self.logger.error("LLM failed to extract any season data") - return - - result = { - "sources": [item["url"] 
for item in scraped_pages], - "seasons": [season.model_dump() for season in all_seasons], - } - - self.output_file.parent.mkdir(parents=True, exist_ok=True) - atomic_write_json(self.output_file, result, indent=2, ensure_ascii=False) - - total_episodes = sum(len(season.episodes) for season in all_seasons) - console.print(f"[green]✓ Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]") - console.print(f"[green]✓ Saved to: {self.output_file}[/green]") - - self.__validate_episode_coverage(total_episodes) - - def __validate_episode_coverage(self, scraped_episodes_count: int) -> None: - expected_count = self.__get_expected_episodes_count() - - if expected_count is None: - console.print("\n[yellow]⚠ Coverage validation:[/yellow]") - console.print(f" [cyan]Scraped episodes: {scraped_episodes_count}[/cyan]") - console.print(" [yellow]No video directory provided - unable to validate coverage[/yellow]") - console.print(" [dim]Make sure the scraped episodes cover all your video files[/dim]") - console.print(" [dim]You can add more --scrape-urls if needed[/dim]\n") - return - - coverage_percentage = (scraped_episodes_count / expected_count * 100) if expected_count > 0 else 0 - - console.print("\n[yellow]⚠ Episode coverage validation:[/yellow]") - console.print(f" [cyan]Scraped episodes: {scraped_episodes_count}[/cyan]") - console.print(f" [cyan]Video files found: {expected_count}[/cyan]") - console.print(f" [cyan]Coverage: {coverage_percentage:.1f}%[/cyan]") - - if scraped_episodes_count < expected_count: - console.print(f"\n[red]✗ WARNING: Missing {expected_count - scraped_episodes_count} episodes![/red]") - console.print(" [yellow]Consider adding more URLs to --scrape-urls[/yellow]") - console.print(" [dim]Not all video files will have metadata available[/dim]\n") - elif scraped_episodes_count > expected_count: - console.print(f"\n[yellow]⚠ Note: Scraped {scraped_episodes_count - expected_count} more episodes than video files[/yellow]") - console.print(" 
[dim]This is OK if you plan to add more videos later[/dim]\n") - else: - console.print("\n[green]✓ Perfect coverage - all video files have metadata![/green]\n") - - def __get_expected_episodes_count(self) -> Optional[int]: - if self.expected_episodes_count is not None: - return self.expected_episodes_count - - if self.videos_dir and self.videos_dir.exists(): - return self.__count_video_files(self.videos_dir) - - return None - - def __count_video_files(self, directory: Path) -> int: - count = 0 - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - count += len(list(directory.rglob(f"*{ext}"))) - return count diff --git a/preprocessor/search/__init__.py b/preprocessor/search/__init__.py deleted file mode 100644 index f3bba3a4d..000000000 --- a/preprocessor/search/__init__.py +++ /dev/null @@ -1,11 +0,0 @@ -from preprocessor.search.elasticsearch_queries import ElasticsearchQueries -from preprocessor.search.embedding_service import EmbeddingService -from preprocessor.search.hash_service import HashService -from preprocessor.search.result_formatters import ResultFormatter - -__all__ = [ - "ElasticsearchQueries", - "EmbeddingService", - "HashService", - "ResultFormatter", -] diff --git a/preprocessor/search/elasticsearch_queries.py b/preprocessor/search/elasticsearch_queries.py deleted file mode 100644 index 2acf7e122..000000000 --- a/preprocessor/search/elasticsearch_queries.py +++ /dev/null @@ -1,467 +0,0 @@ -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) - -from elasticsearch import AsyncElasticsearch - -from preprocessor.search.embedding_service import EmbeddingService - - -class ElasticsearchQueries: - def __init__(self, embedding_service: EmbeddingService) -> None: - self._embedding_service = embedding_service - - @staticmethod - def _build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: - filters = [] - if season is not None: - filters.append({"term": {"episode_metadata.season": season}}) - if episode is not 
None: - filters.append({"term": {"episode_metadata.episode_number": episode}}) - return filters - - async def search_text_query( - self, - es_client: AsyncElasticsearch, - query: str, - season: Optional[int] = None, - episode: Optional[int] = None, - limit: int = 20, - ) -> Dict[str, Any]: - must_clauses = [{ - "multi_match": { - "query": query, - "fields": ["text^2", "episode_metadata.title"], - "fuzziness": "AUTO", - }, - }] - - must_clauses.extend(self._build_episode_filters(season, episode)) - - query_body = {"bool": {"must": must_clauses}} - - return await es_client.search( - index="ranczo_segments", - query=query_body, - size=limit, - _source=[ - "episode_id", "segment_id", "text", "start_time", "end_time", - "speaker", "video_path", "episode_metadata", "scene_info", - ], - ) - - async def search_text_semantic( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int] = None, - episode: Optional[int] = None, - limit: int = 10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - - filter_clauses = self._build_episode_filters(season, episode) - - knn_query: Dict[str, Any] = { - "field": "text_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_text_embeddings", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "embedding_id", "text", "segment_range", - "video_path", "episode_metadata", "scene_info", - ], - ) - - async def search_video_semantic( - self, - es_client: AsyncElasticsearch, - image_path: str, - season: Optional[int] = None, - episode: Optional[int] = None, - character: Optional[str] = None, - limit: int = 10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_image_embedding(image_path) - - filter_clauses = self._build_episode_filters(season, episode) - if character: - filter_clauses.append({ - "nested": { - "path": 
"character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }) - - knn_query: Dict[str, Any] = { - "field": "video_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_video_frames", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", - "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", - ], - ) - - async def search_text_to_video( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int] = None, - episode: Optional[int] = None, - character: Optional[str] = None, - limit: int = 10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - - filter_clauses = self._build_episode_filters(season, episode) - if character: - filter_clauses.append({ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }) - - knn_query: Dict[str, Any] = { - "field": "video_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_video_frames", - knn=knn_query, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "frame_type", "scene_number", - "perceptual_hash", "video_path", "episode_metadata", "character_appearances", "scene_info", - ], - ) - - @staticmethod - async def search_by_character( - es_client: AsyncElasticsearch, - character: str, - season: Optional[int] = None, - episode: Optional[int] = None, - limit: int = 20, - ) -> Dict[str, Any]: - must_clauses = [{ - "nested": { - "path": "character_appearances", - "query": {"term": {"character_appearances.name": character}}, - }, - }] - - 
must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) - - return await es_client.search( - index="ranczo_video_frames", - query={"bool": {"must": must_clauses}}, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "video_path", - "episode_metadata", "character_appearances", "scene_info", - ], - ) - - @staticmethod - async def search_by_emotion( - es_client: AsyncElasticsearch, - emotion: str, - season: Optional[int] = None, - episode: Optional[int] = None, - character: Optional[str] = None, - limit: int = 20, - ) -> Dict[str, Any]: - nested_must = [{"term": {"character_appearances.emotion.label": emotion}}] - if character: - nested_must.append({"term": {"character_appearances.name": character}}) - - must_clauses = [{ - "nested": { - "path": "character_appearances", - "query": {"bool": {"must": nested_must}}, - }, - }] - - must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) - - nested_filter: Dict[str, Any] = {"term": {"character_appearances.emotion.label": emotion}} - if character: - nested_filter = { - "bool": { - "must": [ - {"term": {"character_appearances.emotion.label": emotion}}, - {"term": {"character_appearances.name": character}}, - ], - }, - } - - return await es_client.search( - index="ranczo_video_frames", - query={"bool": {"must": must_clauses}}, - sort=[ - { - "character_appearances.emotion.confidence": { - "order": "desc", - "nested": { - "path": "character_appearances", - "filter": nested_filter, - }, - }, - }, - ], - track_scores=True, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "video_path", - "episode_metadata", "character_appearances", "scene_info", - ], - ) - - @staticmethod - async def search_by_object( - es_client: AsyncElasticsearch, - object_query: str, - season: Optional[int] = None, - episode: Optional[int] = None, - limit: int = 20, - ) -> Dict[str, Any]: - filter_clauses = ElasticsearchQueries._build_episode_filters(season, episode) - 
- must_clauses: List[Dict[str, Any]] = [] - - if ":" in object_query: - object_class, count_filter = object_query.split(":", 1) - object_class = object_class.strip() - - if count_filter.endswith("+"): - min_count = int(count_filter[:-1]) - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"range": {"detected_objects.count": {"gte": min_count}}}, - ], - }, - }, - }, - }) - elif "-" in count_filter: - min_c, max_c = count_filter.split("-") - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"range": {"detected_objects.count": {"gte": int(min_c), "lte": int(max_c)}}}, - ], - }, - }, - }, - }) - else: - exact_count = int(count_filter) - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "bool": { - "must": [ - {"term": {"detected_objects.class": object_class}}, - {"term": {"detected_objects.count": exact_count}}, - ], - }, - }, - }, - }) - else: - must_clauses.append({ - "nested": { - "path": "detected_objects", - "query": { - "term": {"detected_objects.class": object_query.strip()}, - }, - }, - }) - - query_body = { - "bool": { - "must": must_clauses, - "filter": filter_clauses, - }, - } - - object_class = object_query.split(":")[0].strip() if ":" in object_query else object_query.strip() - - return await es_client.search( - index="ranczo_video_frames", - query=query_body, - sort=[ - { - "detected_objects.count": { - "order": "desc", - "nested": { - "path": "detected_objects", - "filter": {"term": {"detected_objects.class": object_class}}, - }, - }, - }, - ], - track_scores=True, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "detected_objects", - "character_appearances", "video_path", "episode_metadata", "scene_info", - ], - ) - - @staticmethod - async def search_perceptual_hash( - es_client: 
AsyncElasticsearch, - phash: str, - limit: int = 10, - ) -> Dict[str, Any]: - return await es_client.search( - index="ranczo_video_frames", - query={"term": {"perceptual_hash": phash}}, - size=limit, - _source=[ - "episode_id", "frame_number", "timestamp", "video_path", - "episode_metadata", "perceptual_hash", "scene_info", - ], - ) - - @staticmethod - async def list_characters(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: - result = await es_client.search( - index="ranczo_video_frames", - size=0, - aggs={ - "characters_nested": { - "nested": {"path": "character_appearances"}, - "aggs": { - "character_names": { - "terms": {"field": "character_appearances.name", "size": 1000}, - }, - }, - }, - }, - ) - buckets = result["aggregations"]["characters_nested"]["character_names"]["buckets"] - return [(b["key"], b["doc_count"]) for b in buckets] - - @staticmethod - async def list_objects(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: - result = await es_client.search( - index="ranczo_video_frames", - size=0, - aggs={ - "objects_nested": { - "nested": {"path": "detected_objects"}, - "aggs": { - "object_classes": { - "terms": {"field": "detected_objects.class", "size": 1000}, - }, - }, - }, - }, - ) - buckets = result["aggregations"]["objects_nested"]["object_classes"]["buckets"] - return [(b["key"], b["doc_count"]) for b in buckets] - - @staticmethod - async def search_episode_name( - es_client: AsyncElasticsearch, - query: str, - season: Optional[int] = None, - limit: int = 20, - ) -> Dict[str, Any]: - must_clauses = [{ - "multi_match": { - "query": query, - "fields": ["title^2", "episode_metadata.title"], - "fuzziness": "AUTO", - }, - }] - - if season is not None: - must_clauses.append({"term": {"episode_metadata.season": season}}) - - query_body = {"bool": {"must": must_clauses}} - - return await es_client.search( - index="ranczo_episode_names", - query=query_body, - size=limit, - _source=["episode_id", "title", "video_path", "episode_metadata"], - ) 
- - async def search_episode_name_semantic( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int] = None, - limit: int = 10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - - filter_clauses = [] - if season is not None: - filter_clauses.append({"term": {"episode_metadata.season": season}}) - - knn_query: Dict[str, Any] = { - "field": "title_embedding", - "query_vector": embedding, - "k": limit, - "num_candidates": limit * 10, - } - if filter_clauses: - knn_query["filter"] = filter_clauses - - return await es_client.search( - index="ranczo_episode_names", - knn=knn_query, - size=limit, - _source=["episode_id", "title", "video_path", "episode_metadata"], - ) - - @staticmethod - async def get_stats(es_client: AsyncElasticsearch) -> Dict[str, int]: - return { - "segments": (await es_client.count(index="ranczo_segments"))["count"], - "text_embeddings": (await es_client.count(index="ranczo_text_embeddings"))["count"], - "video_embeddings": (await es_client.count(index="ranczo_video_frames"))["count"], - "episode_names": (await es_client.count(index="ranczo_episode_names"))["count"], - } diff --git a/preprocessor/search/result_formatters.py b/preprocessor/search/result_formatters.py deleted file mode 100644 index 2ba2d7780..000000000 --- a/preprocessor/search/result_formatters.py +++ /dev/null @@ -1,85 +0,0 @@ -from typing import ( - Any, - Dict, - Optional, -) - -import click - -from preprocessor.utils.constants import ( - ElasticsearchAggregationKeys, - ElasticsearchKeys, - EpisodeMetadataKeys, -) - - -class ResultFormatter: - @staticmethod - def format_timestamp(seconds: float) -> str: - minutes = int(seconds // 60) - secs = seconds % 60 - return f"{minutes}m {secs:.1f}s" - - @staticmethod - def _format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: - if not scene_info: - return "" - start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0)) - end = 
ResultFormatter.format_timestamp(scene_info.get('scene_end_time', 0)) - return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" - - @staticmethod - def print_results(result: Dict[str, Any], result_type: str = "text") -> None: # pylint: disable=too-many-locals - total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] - hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] - - click.echo(f"\nZnaleziono: {total} wynikow") - click.echo("=" * 80) - - for i, hit in enumerate(hits, 1): - source = hit[ElasticsearchKeys.SOURCE] - score = hit[ElasticsearchKeys.SCORE] - meta = source[EpisodeMetadataKeys.EPISODE_METADATA] - scene_ctx = ResultFormatter._format_scene_context(source.get("scene_info")) - - click.echo(f"\n[{i}] Score: {score:.2f}") - season_code = "S00" if meta['season'] == 0 else f"S{meta['season']:02d}" - click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") - - if result_type == "text": - click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") - start_time = ResultFormatter.format_timestamp(source['start_time']) - end_time = ResultFormatter.format_timestamp(source['end_time']) - click.echo(f"Time: {start_time} - {end_time}{scene_ctx}") - click.echo(f"Speaker: {source.get('speaker', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == "text_semantic": - click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") - click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == "episode_name": - click.echo(f"Episode Title: {source.get('title', 'N/A')}") - else: - timestamp = ResultFormatter.format_timestamp(source['timestamp']) - click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}") - if "frame_type" in source: - click.echo(f"Type: {source['frame_type']}") - if "scene_number" in source: - click.echo(f"Scene number: 
{source['scene_number']}") - if "perceptual_hash" in source: - click.echo(f"Hash: {source['perceptual_hash']}") - if source.get("character_appearances"): - chars_strs = [] - for char in source['character_appearances']: - char_str = char.get('name', 'Unknown') - if char.get('emotion'): - emotion_label = char['emotion'].get('label', '?') - emotion_conf = char['emotion'].get('confidence', 0) - char_str += f" ({emotion_label} {emotion_conf:.2f})" - chars_strs.append(char_str) - click.echo(f"Characters: {', '.join(chars_strs)}") - if source.get("detected_objects"): - objects_str = ", ".join([f"{obj['class']}:{obj['count']}" for obj in source['detected_objects']]) - click.echo(f"Objects: {objects_str}") - - click.echo(f"Path: {source['video_path']}") diff --git a/preprocessor/text_analysis/__init__.py b/preprocessor/text_analysis/__init__.py deleted file mode 100644 index e8a533c57..000000000 --- a/preprocessor/text_analysis/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from preprocessor.text_analysis.text_statistics import TextStatistics - -__all__ = ["TextStatistics"] diff --git a/preprocessor/text_analysis/text_statistics.py b/preprocessor/text_analysis/text_statistics.py deleted file mode 100644 index 8bf692bd0..000000000 --- a/preprocessor/text_analysis/text_statistics.py +++ /dev/null @@ -1,207 +0,0 @@ -from collections import Counter -from dataclasses import ( - dataclass, - field, -) -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Set, -) - - -@dataclass -class LanguageConfig: - vowels: Set[str] - consonants: Set[str] - punctuation: Set[str] - special_chars: Set[str] - - -POLISH_VOWELS = set("aąeęioóuyAĄEĘIOÓUY") -POLISH_CONSONANTS = set("bcćdfghjklłmnńprsśtwzźżBCĆDFGHJKLŁMNŃPRSŚTWZŹŻ") -ENGLISH_VOWELS = set("aeiouAEIOU") -ENGLISH_CONSONANTS = set("bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ") - -PUNCTUATION = set(".,;:!?…-—–()[]{}\"'«»„""''") # noqa: RUF001, pylint: disable=implicit-str-concat -SPECIAL_CHARS = 
set("@#$%^&*+=<>|\\/_~`") - - -POLISH_CONFIG = LanguageConfig( - vowels=POLISH_VOWELS | ENGLISH_VOWELS, - consonants=POLISH_CONSONANTS | ENGLISH_CONSONANTS, - punctuation=PUNCTUATION, - special_chars=SPECIAL_CHARS, -) - -ENGLISH_CONFIG = LanguageConfig( - vowels=ENGLISH_VOWELS, - consonants=ENGLISH_CONSONANTS, - punctuation=PUNCTUATION, - special_chars=SPECIAL_CHARS, -) - - -@dataclass -class TextStatistics: # pylint: disable=too-many-instance-attributes - text: str - language: str = "pl" - - sentences: int = 0 - lines: int = 0 - paragraphs: int = 0 - empty_lines: int = 0 - words: int = 0 - letters: int = 0 - digits: int = 0 - symbols: int = 0 - punctuation_marks: int = 0 - special_characters: int = 0 - chars_without_spaces: int = 0 - spaces: int = 0 - total_chars: int = 0 - vowels: int = 0 - consonants: int = 0 - - unique_words: int = 0 - avg_word_length: float = 0.0 - avg_sentence_length: float = 0.0 - type_token_ratio: float = 0.0 - - letter_frequency: Dict[str, int] = field(default_factory=dict) - word_frequency: List[Dict[str, Any]] = field(default_factory=list) - bigrams: List[Dict[str, Any]] = field(default_factory=list) - trigrams: List[Dict[str, Any]] = field(default_factory=list) - - @classmethod - def from_file(cls, file_path: Path, language: str = "pl") -> "TextStatistics": - with open(file_path, "r", encoding="utf-8") as f: - text = f.read() - - stats = cls(text=text, language=language) - stats.calculate() - return stats - - @classmethod - def from_text(cls, text: str, language: str = "pl") -> "TextStatistics": - stats = cls(text=text, language=language) - stats.calculate() - return stats - - def calculate(self): - self.__calculate_basic_stats() - self.__calculate_character_stats() - self.__calculate_word_stats() - self.__calculate_advanced_stats() - - def __get_config(self) -> LanguageConfig: - return POLISH_CONFIG if self.language == "pl" else ENGLISH_CONFIG - - def __calculate_basic_stats(self): - lines = self.text.split("\n") - self.lines = 
len(lines) - self.empty_lines = sum(1 for line in lines if not line.strip()) - - paragraphs = self.text.split("\n\n") - self.paragraphs = len([p for p in paragraphs if p.strip()]) - - sentence_pattern = r'[.!?…]+(?:\s|$)' - self.sentences = len(re.findall(sentence_pattern, self.text)) - - self.total_chars = len(self.text) - self.spaces = self.text.count(" ") + self.text.count("\t") + self.text.count("\n") - self.chars_without_spaces = self.total_chars - self.spaces - - def __calculate_character_stats(self): - config = self.__get_config() - letter_counter = Counter() - - for char in self.text: - if char.isalpha(): - self.letters += 1 - letter_counter[char.lower()] += 1 - - if char in config.vowels: - self.vowels += 1 - elif char in config.consonants: - self.consonants += 1 - elif char.isdigit(): - self.digits += 1 - elif char in config.punctuation: - self.punctuation_marks += 1 - elif char in config.special_chars: - self.special_characters += 1 - elif not char.isspace(): - self.symbols += 1 - - self.letter_frequency = dict(sorted(letter_counter.items(), key=lambda x: x[1], reverse=True)) - - def __calculate_word_stats(self): - words = re.findall(r'\b\w+\b', self.text.lower()) - self.words = len(words) - - if self.words > 0: - word_counter = Counter(words) - self.unique_words = len(word_counter) - self.type_token_ratio = round(self.unique_words / self.words, 4) if self.words > 0 else 0.0 - - word_lengths = [len(w) for w in words] - self.avg_word_length = round(sum(word_lengths) / len(word_lengths), 2) if word_lengths else 0.0 - - self.word_frequency = [ - {"word": word, "count": count} - for word, count in word_counter.most_common(50) - ] - - def __calculate_advanced_stats(self): - if self.sentences > 0: - self.avg_sentence_length = round(self.words / self.sentences, 2) - - words = re.findall(r'\b\w+\b', self.text.lower()) - if len(words) >= 2: - bigram_counter = Counter(zip(words[:-1], words[1:])) - self.bigrams = [ - {"bigram": f"{w1} {w2}", "count": count} - for 
(w1, w2), count in bigram_counter.most_common(25) - ] - - if len(words) >= 3: - trigram_counter = Counter(zip(words[:-2], words[1:-1], words[2:])) - self.trigrams = [ - {"trigram": f"{w1} {w2} {w3}", "count": count} - for (w1, w2, w3), count in trigram_counter.most_common(25) - ] - - def to_dict(self) -> Dict[str, Any]: - return { - "basic_statistics": { - "sentences": self.sentences, - "lines": self.lines, - "paragraphs": self.paragraphs, - "empty_lines": self.empty_lines, - "words": self.words, - "letters": self.letters, - "digits": self.digits, - "symbols": self.symbols, - "punctuation_marks": self.punctuation_marks, - "special_characters": self.special_characters, - "chars_without_spaces": self.chars_without_spaces, - "spaces": self.spaces, - "total_chars": self.total_chars, - "vowels": self.vowels, - "consonants": self.consonants, - }, - "advanced_statistics": { - "unique_words": self.unique_words, - "avg_word_length": self.avg_word_length, - "avg_sentence_length": self.avg_sentence_length, - "type_token_ratio": self.type_token_ratio, - }, - "letter_frequency": self.letter_frequency, - "word_frequency": self.word_frequency, - "bigrams": self.bigrams, - "trigrams": self.trigrams, - } diff --git a/preprocessor/transcription/__init__.py b/preprocessor/transcription/__init__.py deleted file mode 100644 index 456e60c25..000000000 --- a/preprocessor/transcription/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from preprocessor.transcription.generators.json_generator import JsonGenerator -from preprocessor.transcription.processors.audio_normalizer import AudioNormalizer -from preprocessor.transcription.processors.episode_info_processor import EpisodeInfoProcessor -from preprocessor.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor diff --git a/preprocessor/transcription/engines/whisper_engine.py b/preprocessor/transcription/engines/whisper_engine.py deleted file mode 100644 index 92586a4f8..000000000 --- 
a/preprocessor/transcription/engines/whisper_engine.py +++ /dev/null @@ -1,73 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from faster_whisper import WhisperModel -import torch - -from preprocessor.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.transcription.whisper_utils import ( - build_transcription_result, - get_language_code, -) -from preprocessor.utils.console import console - - -class WhisperEngine(TranscriptionEngine): - def __init__( - self, - model: str = "large-v3-turbo", - language: str = "Polish", - device: str = "cuda", - ): - self.model_name = model - self.language = language - self.device = device - - self.logger = logging.getLogger(self.__class__.__name__) - - if device != "cuda": - raise ValueError(f"Only GPU (cuda) is supported, got device={device}") - - compute_type = "float16" - console.print(f"[cyan]Loading Whisper model: {model} on {device} with compute_type={compute_type}[/cyan]") - self.model = WhisperModel(model, device=device, compute_type=compute_type) - console.print("[green]✓ Whisper model loaded[/green]") - - def transcribe(self, audio_path: Path) -> Dict[str, Any]: - console.print(f"[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]") - - if not audio_path.exists(): - raise FileNotFoundError(f"Audio file not found: {audio_path}") - - language_code = get_language_code(self.language) - - segments, info = self.model.transcribe( - str(audio_path), - language=language_code, - beam_size=10, - word_timestamps=True, - condition_on_previous_text=False, - ) - - result = build_transcription_result(segments, language=info.language) - - console.print(f"[green]✓ Transcription completed: {audio_path.name}[/green]") - - return result - - def get_name(self) -> str: - return f"Whisper-{self.model_name}" - - def cleanup(self) -> None: - console.print("[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]") - if hasattr(self, 'model'): - del 
self.model - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print("[green]✓ Whisper model unloaded, GPU memory cleared[/green]") diff --git a/preprocessor/transcription/generators/json_generator.py b/preprocessor/transcription/generators/json_generator.py deleted file mode 100644 index c785f5237..000000000 --- a/preprocessor/transcription/generators/json_generator.py +++ /dev/null @@ -1,94 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - Literal, -) - -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator -from preprocessor.utils.transcription_utils import convert_words_list - - -class JsonGenerator(BaseTranscriptionGenerator): - def __init__(self, format_type: Literal["full", "simple", "segmented"], *args, **kwargs): - super().__init__(*args, **kwargs) - self.format_type = format_type - - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - if self.format_type == "full": - return json_file.name - suffix = FILE_SUFFIXES[self.format_type] - return json_file.name.replace(FILE_EXTENSIONS["json"], f"{suffix}{FILE_EXTENSIONS['json']}") - - def convert(self, data: Dict[str, Any]) -> Dict[str, Any]: - if self.format_type == "full": - return self.convert_to_full_format(data) - if self.format_type == "simple": - return self.convert_to_simple_format(data) - if self.format_type == "segmented": - return self.convert_to_segmented_format(data) - raise ValueError(f"Unknown format type: {self.format_type}") - - @staticmethod - def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - full_text = " ".join(seg.get("text", "").strip() for seg in segments) - - language_code = data.get("language", "pol") - if language_code in {"Polish", "polish"}: - language_code = "pol" - - 
words = [] - for seg in segments: - seg_words = seg.get("words", []) - words.extend(convert_words_list(seg_words)) - - return { - "language_code": language_code, - "language_probability": 1.0, - "text": full_text, - "words": words, - } - - @staticmethod - def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - result_segments = [] - - for seg in segments: - text = seg.get("text", "").strip() - seg_words = seg.get("words", []) - - speaker = "speaker_unknown" - if seg_words: - speaker = seg_words[0].get("speaker_id", "speaker_unknown") - - result_segments.append({ - "speaker": speaker, - "text": text, - }) - - return {"segments": result_segments} - - @staticmethod - def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: - segments = data.get("segments", []) - result_segments = [] - - for seg in segments: - text = seg.get("text", "").strip() - seg_words = seg.get("words", []) - - result_segments.append({ - "text": text, - "words": convert_words_list(seg_words), - }) - - return {"segments": result_segments} diff --git a/preprocessor/transcription/generators/multi_format_generator.py b/preprocessor/transcription/generators/multi_format_generator.py deleted file mode 100644 index 6e2283079..000000000 --- a/preprocessor/transcription/generators/multi_format_generator.py +++ /dev/null @@ -1,170 +0,0 @@ -import json -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.episodes import EpisodeManager -from preprocessor.transcription.generators.json_generator import JsonGenerator -from preprocessor.transcription.generators.srt_generator import SrtGenerator -from preprocessor.transcription.generators.txt_generator import TxtGenerator -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger - - -class MultiFormatGenerator: - def __init__( - self, - jsons_dir: Path, - 
episodes_info_json: Path, - output_base_path: Path, - logger: ErrorHandlingLogger, - series_name: str = "", - ): - self.jsons_dir = jsons_dir - self.output_base_path = output_base_path - self.logger = logger - self.series_name = series_name.lower() if series_name else "unknown" - - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def __call__(self) -> None: - self.generate() - - def generate(self) -> None: - for transcription_file in self.jsons_dir.rglob("*.json"): - self.__process_file(transcription_file) - - def __process_file(self, transcription_file: Path) -> None: - try: # pylint: disable=too-many-try-statements - with open(transcription_file, "r", encoding="utf-8") as f: - transcription = json.load(f) - - episode_info = self.episode_manager.parse_filename(transcription_file) - if not episode_info: - self.logger.error(f"Cannot extract episode info from {transcription_file.name}") - return - - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - main_output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename - - if main_output_file.exists(): - self.logger.info(f"Skipping (already exists): {episode_info.episode_code()}") - return - - episode_metadata = EpisodeManager.get_metadata(episode_info) - transcription_with_info = { - "episode_info": episode_metadata, - **transcription, - } - - self.__generate_full_json(transcription_with_info, episode_info) - self.__generate_segmented_json(transcription, episode_info) - self.__generate_simple_json(transcription, episode_info) - self.__generate_srt(transcription, episode_info) - self.__generate_txt(transcription, episode_info) - - except Exception as e: - self.logger.error(f"Error processing file {transcription_file}: {e}") - - def __generate_full_json(self, data: Dict[str, Any], 
episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="json") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = JsonGenerator("full", Path("."), output_file.parent, self.logger) - full_json = generator.convert_to_full_format(data) - full_json["episode_info"] = data.get("episode_info", {}) - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(full_json, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Generated full JSON: {output_file}") - - def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="segmented", - ) - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = JsonGenerator("segmented", Path("."), output_file.parent, self.logger) - segmented_json = generator.convert_to_segmented_format(data) - - segmented_json["episode_info"] = { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - } - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(segmented_json, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Generated segmented JSON: {output_file}") - - def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename( - episode_info, - extension="json", - suffix="simple", - ) - season_code = episode_info.season_code() - episode_code = 
episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = JsonGenerator("simple", Path("."), output_file.parent, self.logger) - simple_json = generator.convert_to_simple_format(data) - - simple_json["episode_info"] = { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - } - - with open(output_file, "w", encoding="utf-8") as f: - json.dump(simple_json, f, indent=2, ensure_ascii=False) - - self.logger.info(f"Generated simple JSON: {output_file}") - - def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="srt") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = SrtGenerator(Path("."), output_file.parent, self.logger) - srt_content = generator.convert_to_srt_format(data) - - with open(output_file, "w", encoding="utf-8") as f: - f.write(srt_content) - - self.logger.info(f"Generated SRT: {output_file}") - - def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension="txt") - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / "raw" / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - - generator = TxtGenerator(Path("."), 
output_file.parent, self.logger) - txt_content = generator.convert_to_txt_format(data) - - with open(output_file, "w", encoding="utf-8") as f: - f.write(txt_content) - - self.logger.info(f"Generated TXT: {output_file}") diff --git a/preprocessor/transcription/generators/srt_generator.py b/preprocessor/transcription/generators/srt_generator.py deleted file mode 100644 index 2c5661020..000000000 --- a/preprocessor/transcription/generators/srt_generator.py +++ /dev/null @@ -1,50 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, -) - -from preprocessor.core.constants import FILE_EXTENSIONS -from preprocessor.transcription.generators.base_generator import BaseTranscriptionGenerator - - -class SrtGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - pass - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS["json"], FILE_EXTENSIONS["srt"]) - - def convert_to_srt_format(self, data: Dict[str, Any]) -> str: - segments = data.get("segments", []) - srt_lines = [] - index = 1 - - for seg in segments: - start = seg.get("start", 0.0) - end = seg.get("end", 0.0) - text = seg.get("text", "").strip() - - if not text: - continue - - start_time = self.__format_timestamp(start) - end_time = self.__format_timestamp(end) - - srt_lines.append(f"{index}") - srt_lines.append(f"{start_time} --> {end_time}") - srt_lines.append(text) - srt_lines.append("") - - index += 1 - - return "\n".join(srt_lines) - - @staticmethod - def __format_timestamp(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - millis = int((seconds % 1) * 1000) - - return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" diff --git a/preprocessor/transcription/processors/__init__.py b/preprocessor/transcription/processors/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git 
a/preprocessor/transcription/processors/sound_separator.py b/preprocessor/transcription/processors/sound_separator.py deleted file mode 100644 index e958b5dca..000000000 --- a/preprocessor/transcription/processors/sound_separator.py +++ /dev/null @@ -1,391 +0,0 @@ -import json -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Tuple, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.core.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.constants import ( - WordKeys, - WordTypeValues, -) - - -class SoundEventSeparator(BaseProcessor): - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=2, - loglevel=args.get("loglevel", 20), - ) - - self.transcription_dir = Path( - self._args.get("transcription_dir", settings.transcription.get_output_dir(self.series_name)), - ) - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - def _validate_args(self, args: Dict[str, Any]) -> None: - pass - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def _get_processing_items(self) -> List[ProcessingItem]: - segmented_files = list(self.transcription_dir.rglob("**/raw/*_segmented.json")) - - items = [] - for trans_file in segmented_files: - episode_info = self.episode_manager.parse_filename(trans_file) - if not episode_info: - self.logger.warning(f"Cannot parse episode info from {trans_file.name}") - continue - - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=trans_file, - metadata={"episode_info": episode_info}, - ), - ) - - return items - - def 
_get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - base_name = item.input_path.stem.replace(FILE_SUFFIXES["segmented"], "") - episode_dir = item.input_path.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - clean_json = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" - sound_json = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}" - clean_segmented_json = clean_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}" - sound_segmented_json = sound_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}" - clean_txt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}" - sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}" - clean_srt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}" - sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}" - - return [ - OutputSpec(path=clean_json, required=True), - OutputSpec(path=sound_json, required=True), - OutputSpec(path=clean_segmented_json, required=True), - OutputSpec(path=sound_segmented_json, required=True), - OutputSpec(path=clean_txt, required=True), - OutputSpec(path=sound_txt, required=True), - OutputSpec(path=clean_srt, required=True), - OutputSpec(path=sound_srt, required=True), - ] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: # pylint: disable=too-many-locals - with open(item.input_path, "r", encoding="utf-8") as f: - data = json.load(f) - - episode_info = data.get("episode_info", {}) - segments = data.get("segments", []) - - dialogue_segments = [] - sound_event_segments = [] - - for segment in segments: - classification = self.__classify_segment(segment) - - if 
classification == "dialogue": - dialogue_segments.append(self.__clean_segment_text(segment)) - elif classification == "sound_event": - sound_event_segments.append(self.__enrich_sound_event(self.__clean_segment_text(segment))) - elif classification == "mixed": - dialogue_parts, sound_parts = self.__split_mixed_segment(segment) - dialogue_segments.extend(dialogue_parts) - sound_event_segments.extend([self.__enrich_sound_event(s) for s in sound_parts]) - - dialogue_segments = self.__renumber_segments(dialogue_segments) - sound_event_segments = self.__renumber_segments(sound_event_segments) - - base_name = item.input_path.stem.replace(FILE_SUFFIXES["segmented"], "") - episode_dir = item.input_path.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - - clean_dir.mkdir(parents=True, exist_ok=True) - sound_dir.mkdir(parents=True, exist_ok=True) - - clean_json = clean_dir / f"{base_name}_clean_transcription.json" - sound_json = sound_dir / f"{base_name}_sound_events.json" - clean_segmented_json = clean_dir / f"{base_name}_segmented_clean.json" - sound_segmented_json = sound_dir / f"{base_name}_segmented_sound_events.json" - clean_txt = clean_dir / f"{base_name}_clean_transcription.txt" - sound_txt = sound_dir / f"{base_name}_sound_events.txt" - clean_srt = clean_dir / f"{base_name}_clean_transcription.srt" - sound_srt = sound_dir / f"{base_name}_sound_events.srt" - - raw_txt = episode_dir / settings.output_subdirs.transcription_subdirs.raw / f"{base_name}.txt" - - dialogue_segments_simple = self.__convert_to_simple_format(dialogue_segments) - sound_event_segments_simple = self.__convert_to_simple_format(sound_event_segments) - - with open(clean_json, "w", encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": dialogue_segments_simple}, - f, - ensure_ascii=False, - indent=4, - ) - - with open(sound_json, "w", 
encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": sound_event_segments_simple}, - f, - ensure_ascii=False, - indent=4, - ) - - with open(clean_segmented_json, "w", encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": dialogue_segments}, - f, - ensure_ascii=False, - indent=4, - ) - - with open(sound_segmented_json, "w", encoding="utf-8") as f: - json.dump( - {"episode_info": episode_info, "segments": sound_event_segments}, - f, - ensure_ascii=False, - indent=4, - ) - - self.__generate_txt_files(raw_txt, clean_txt, sound_txt) - self.__generate_srt_files(dialogue_segments, sound_event_segments, clean_srt, sound_srt) - - self.logger.info( - f"Separated {item.episode_id}: " - f"{len(dialogue_segments)} dialogue, {len(sound_event_segments)} sound events", - ) - - def __classify_segment(self, segment: Dict[str, Any]) -> str: - words = segment.get("words", []) - if not words: - return "dialogue" - - has_sound = False - has_dialogue = False - - for word in words: - if self.__is_sound_event(word): - has_sound = True - elif word.get(WordKeys.TYPE) not in [WordTypeValues.SPACING, ""]: - has_dialogue = True - - if has_sound and has_dialogue: - return "mixed" - if has_sound: - return "sound_event" - return "dialogue" - - @staticmethod - def __is_sound_event(word: Dict[str, Any]) -> bool: - if word.get(WordKeys.TYPE) == WordTypeValues.AUDIO_EVENT: - return True - - text = word.get("text", "").strip() - if re.match(r'^\(.*\)$', text): - return True - - return False - - def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - words = segment.get("words", []) - dialogue_sequences = [] - sound_sequences = [] - - current_type = None - current_words = [] - - for word in words: - if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: - if current_words: - current_words.append(word) - continue - - is_sound = self.__is_sound_event(word) - word_type = "sound" if is_sound else 
"dialogue" - - if word_type != current_type: - if current_words: - self.__finalize_sequence( - current_type, current_words, dialogue_sequences, sound_sequences, segment, - ) - current_type = word_type - current_words = [word] - else: - current_words.append(word) - - if current_words: - self.__finalize_sequence( - current_type, current_words, dialogue_sequences, sound_sequences, segment, - ) - - return dialogue_sequences, sound_sequences - - @staticmethod - def __finalize_sequence( - seq_type: str, - words: List[Dict], - dialogue_sequences: List[Dict], - sound_sequences: List[Dict], - original_segment: Dict[str, Any], - ) -> None: - if not words: - return - - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - return - - text = "".join([w.get("text", "") for w in words]) - text = re.sub(r'\s+', ' ', text).strip() - start_time = min((w.get("start") or 0) for w in words) - end_time = max((w.get("end") or 0) for w in words) - - new_segment = { - "text": text, - "start": start_time, - "end": end_time, - "words": words, - } - - for key in original_segment: - if key not in ["text", "start", "end", "words"]: - new_segment[key] = original_segment[key] - - if seq_type == "dialogue": - dialogue_sequences.append(new_segment) - else: - sound_sequences.append(new_segment) - - @staticmethod - def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: - cleaned = segment.copy() - if "text" in cleaned: - text = cleaned["text"] - text = re.sub(r'\s+', ' ', text).strip() - cleaned["text"] = text - - if cleaned.get("start") is None or cleaned.get("end") is None: - words = cleaned.get("words", []) - if words: - starts = [(w.get("start") or 0) for w in words if w.get("start") is not None] - ends = [(w.get("end") or 0) for w in words if w.get("end") is not None] - if starts: - cleaned["start"] = min(starts) - if ends: - cleaned["end"] = max(ends) - - return cleaned - - @staticmethod - def 
__enrich_sound_event(segment: Dict[str, Any]) -> Dict[str, Any]: - enriched = segment.copy() - enriched["sound_type"] = "sound" - return enriched - - @staticmethod - def __renumber_segments(segments: List[Dict]) -> List[Dict]: - for i, segment in enumerate(segments): - segment["id"] = i - return segments - - @staticmethod - def __convert_to_simple_format(segments: List[Dict]) -> List[Dict]: - simple_segments = [] - for seg in segments: - simple_seg = { - "id": seg.get("id"), - "text": seg.get("text", ""), - "start": seg.get("start") or 0.0, - "end": seg.get("end") or 0.0, - } - if "sound_type" in seg: - simple_seg["sound_type"] = seg["sound_type"] - simple_segments.append(simple_seg) - return simple_segments - - def __generate_txt_files(self, original_txt: Path, clean_txt: Path, sound_txt: Path) -> None: - if not original_txt.exists(): - self.logger.warning(f"Original TXT file not found: {original_txt}") - return - - with open(original_txt, "r", encoding="utf-8") as f: - original_content = f.read() - - clean_content = re.sub(r'\([^)]*\)', '', original_content) - clean_content = re.sub(r'\s+', ' ', clean_content).strip() - - sound_matches = re.findall(r'\([^)]*\)', original_content) - sound_content = ' '.join(sound_matches) - - with open(clean_txt, "w", encoding="utf-8") as f: - f.write(clean_content) - - with open(sound_txt, "w", encoding="utf-8") as f: - f.write(sound_content) - - @staticmethod - def __generate_srt_files( - dialogue_segments: List[Dict], - sound_segments: List[Dict], - clean_srt: Path, - sound_srt: Path, - ) -> None: - def format_timestamp(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - millis = int((seconds % 1) * 1000) - return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}" - - def _write_srt(segments: List[Dict], output_path: Path) -> None: - with open(output_path, "w", encoding="utf-8") as f: - for idx, seg in enumerate(segments, start=1): - words = 
seg.get("words", []) - text = seg.get("text", "").strip() - - if not text or not words: - continue - - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - continue - - start_time = min((w.get("start") or 0.0) for w in non_spacing_words) - end_time = max((w.get("end") or 0.0) for w in non_spacing_words) - - f.write(f"{idx}\n") - f.write(f"{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n") - f.write(f"{text}\n\n") - - _write_srt(dialogue_segments, clean_srt) - _write_srt(sound_segments, sound_srt) - - def _get_progress_description(self) -> str: - return "Separating sound events from dialogues" diff --git a/preprocessor/transcription/whisper_utils.py b/preprocessor/transcription/whisper_utils.py deleted file mode 100644 index 8015068b3..000000000 --- a/preprocessor/transcription/whisper_utils.py +++ /dev/null @@ -1,58 +0,0 @@ -from typing import ( - Any, - Dict, -) - -LANGUAGE_MAP = { - "polish": "pl", - "english": "en", - "german": "de", - "french": "fr", - "spanish": "es", -} - - -def get_language_code(language: str) -> str: - return LANGUAGE_MAP.get(language.lower(), language.lower()) - - -def _process_whisper_segment(segment) -> Dict[str, Any]: - words = [] - if hasattr(segment, 'words') and segment.words: - for word in segment.words: - words.append({ - "word": word.word, - "start": word.start, - "end": word.end, - "probability": word.probability, - }) - - return { - "id": segment.id, - "seek": 0, - "start": segment.start, - "end": segment.end, - "text": segment.text, - "tokens": [], - "avg_logprob": segment.avg_logprob, - "compression_ratio": segment.compression_ratio, - "no_speech_prob": segment.no_speech_prob, - "words": words, - } - - -def build_transcription_result(segments, language: str = None) -> Dict[str, Any]: - result = { - "text": "", - "segments": [], - } - - if language: - result["language"] = language - - for segment in segments: - segment_dict = 
_process_whisper_segment(segment) - result["segments"].append(segment_dict) - result["text"] += segment.text - - return result diff --git a/preprocessor/types/__init__.py b/preprocessor/types/__init__.py deleted file mode 100644 index b8eb211e8..000000000 --- a/preprocessor/types/__init__.py +++ /dev/null @@ -1,69 +0,0 @@ -from .clip import ClipSegment -from .detection import ( - CharacterDetectionInFrame, - Detection, - ObjectDetectionInFrame, -) -from .episode import ( - EpisodeInfo, - EpisodeMetadata, - SeasonInfo, - SeasonInfoDict, -) -from .frame import FrameRequest -from .scene import ( - SceneDict, - SceneTimestamp, - SceneTimestampPoint, - SceneTimestampsData, -) -from .search import ( - ElasticsearchAggregations, - ElasticsearchHit, - ElasticsearchHits, - ElasticsearchResponse, - EpisodeBucket, - SearchSegment, - SeasonBucket, -) -from .transcription import ( - BaseSegment, - ElasticsearchSegment, - SegmentWithScore, - SegmentWithTimes, - TranscriptionContext, -) -from .video import ( - HashResult, - VideoMetadata, -) - -__all__ = [ - "EpisodeInfo", - "EpisodeMetadata", - "SeasonInfo", - "SeasonInfoDict", - "FrameRequest", - "SceneDict", - "SceneTimestamp", - "SceneTimestampPoint", - "SceneTimestampsData", - "ClipSegment", - "CharacterDetectionInFrame", - "Detection", - "ObjectDetectionInFrame", - "HashResult", - "VideoMetadata", - "BaseSegment", - "ElasticsearchSegment", - "SegmentWithScore", - "SegmentWithTimes", - "TranscriptionContext", - "ElasticsearchAggregations", - "ElasticsearchHit", - "ElasticsearchHits", - "ElasticsearchResponse", - "EpisodeBucket", - "SearchSegment", - "SeasonBucket", -] diff --git a/preprocessor/utils/__init__.py b/preprocessor/utils/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/utils/batch_processing_utils.py b/preprocessor/utils/batch_processing_utils.py deleted file mode 100644 index fbc753796..000000000 --- a/preprocessor/utils/batch_processing_utils.py +++ /dev/null @@ -1,221 
+0,0 @@ -from concurrent.futures import ThreadPoolExecutor -import json -from pathlib import Path -import time -from typing import ( - Any, - Dict, - Iterator, - List, - Optional, - Tuple, -) - -from PIL import Image - -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.utils.batch_processor import BatchProcessor -from preprocessor.utils.console import console -from preprocessor.utils.image_hasher import PerceptualHasher -from preprocessor.utils.time_utils import format_time_hms -from preprocessor.video.frame_utils import load_frames_from_requests - - -def _prefetch_batches( - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - batch_size: int, - convert_rgb: bool = False, - prefetch_count: int = 2, -) -> Iterator[Tuple[int, List[Dict[str, Any]], List[Image.Image]]]: - total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - - with ThreadPoolExecutor(max_workers=prefetch_count) as executor: - futures = {} - - for chunk_idx in range(total_chunks): - chunk_start = chunk_idx * batch_size - chunk_end = min(chunk_start + batch_size, len(frame_requests)) - chunk_requests = frame_requests[chunk_start:chunk_end] - - future = executor.submit(load_frames_from_requests, frames_dir, chunk_requests, convert_rgb) - futures[chunk_idx] = (chunk_requests, future) - - if len(futures) >= prefetch_count or chunk_idx == total_chunks - 1: - next_idx = chunk_idx - len(futures) + 1 - chunk_reqs, future = futures.pop(next_idx) - pil_images = future.result() - yield next_idx, chunk_reqs, pil_images - - -def compute_hashes_in_batches( - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - hasher: PerceptualHasher, - batch_size: int, -) -> List[Dict[str, Any]]: - total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - console.print(f"[cyan]Computing hashes for {len(frame_requests)} frames in {total_chunks} batches[/cyan]") - - start_time = time.time() - batch_processor = BatchProcessor(batch_size) - 
processed_batches = 0 - - def _process_hash_batch(batch_requests: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - nonlocal processed_batches - pil_images = load_frames_from_requests(frames_dir, batch_requests) - phashes = hasher.compute_phash_batch(pil_images) - - batch_results = [] - for request, phash in zip(batch_requests, phashes): - result = request.copy() - result["perceptual_hash"] = phash - batch_results.append(result) - - del pil_images - processed_batches += 1 - _report_batch_progress( - processed_batches, - total_chunks, - processed_batches, - total_chunks, - start_time, - ) - return batch_results - - results = batch_processor.process(frame_requests, _process_hash_batch) - console.print(f"[green]✓ Computed {len(results)} hashes[/green]") - return results - - -def compute_embeddings_in_batches( # pylint: disable=too-many-locals - frames_dir: Path, - frame_requests: List[Dict[str, Any]], - gpu_processor: GPUBatchProcessor, - batch_size: int, - image_hashes: Dict[int, str], - checkpoint_file: Optional[Path] = None, - checkpoint_interval: int = 20, - prefetch_count: int = 2, -) -> List[Dict[str, Any]]: - total_chunks = (len(frame_requests) + batch_size - 1) // batch_size - embeddings = [] - start_chunk_idx = 0 - - if checkpoint_file and checkpoint_file.exists(): - console.print("[yellow]Found checkpoint file, resuming from last saved batch[/yellow]") - try: - with open(checkpoint_file, "r", encoding="utf-8") as f: - checkpoint_data = json.load(f) - embeddings = checkpoint_data.get("embeddings", []) - start_chunk_idx = checkpoint_data.get("last_batch_idx", 0) + 1 - console.print(f"[cyan]Resuming from batch {start_chunk_idx}/{total_chunks}[/cyan]") - except (json.JSONDecodeError, KeyError) as e: - console.print(f"[yellow]Failed to load checkpoint: {e}. 
Starting from beginning.[/yellow]") - start_chunk_idx = 0 - embeddings = [] - - console.print(f"[cyan]Computing embeddings for {len(frame_requests)} frames in {total_chunks} batches (with prefetch={prefetch_count})[/cyan]") - - actual_checkpoint_interval = min(checkpoint_interval, max(1, total_chunks // 2)) - if actual_checkpoint_interval != checkpoint_interval: - console.print(f"[dim cyan]Adjusted checkpoint interval: {actual_checkpoint_interval} (every ~50% of batches)[/dim cyan]") - - start_time = time.time() - processed_batches = 0 - batches_to_process = total_chunks - start_chunk_idx - - for chunk_idx, chunk_requests, pil_images in _prefetch_batches( - frames_dir, frame_requests, batch_size, convert_rgb=True, prefetch_count=prefetch_count, - ): - if chunk_idx < start_chunk_idx: - continue - - chunk_embeddings = gpu_processor.process_images_batch(pil_images, chunk_idx) - - for request, embedding in zip(chunk_requests, chunk_embeddings): - result = { - **request, - "embedding": embedding, - } - - frame_num = request.get("frame_number") - if frame_num is not None and frame_num in image_hashes: - result["perceptual_hash"] = image_hashes[frame_num] - - embeddings.append(result) - - del pil_images - del chunk_embeddings - - processed_batches += 1 - _report_batch_progress( - processed_batches, - batches_to_process, - chunk_idx + 1, - total_chunks, - start_time, - ) - - if checkpoint_file and (chunk_idx + 1) % actual_checkpoint_interval == 0: - _save_checkpoint(checkpoint_file, chunk_idx, embeddings) - - if checkpoint_file and checkpoint_file.exists(): - checkpoint_file.unlink() - console.print("[cyan]Checkpoint file removed[/cyan]") - - vram_stats = gpu_processor.get_vram_stats() - if vram_stats: - console.print( - f"[cyan]VRAM usage: max={vram_stats['max_vram_gb']}GB, " - f"avg={vram_stats['avg_vram_gb']}GB[/cyan]", - ) - suggested_batch = gpu_processor.suggest_optimal_batch_size(target_vram_gb=21.0) - if suggested_batch != batch_size: - console.print( - 
f"[yellow]Suggested batch_size for 21GB VRAM target: {suggested_batch} " - f"(current: {batch_size})[/yellow]", - ) - - console.print(f"[green]✓ Computed {len(embeddings)} embeddings[/green]") - return embeddings - - -def _report_batch_progress( - processed: int, - total_to_process: int, - current_batch: int, - total_batches: int, - start_time: float, -) -> None: - elapsed = time.time() - start_time - percent = (processed / total_to_process * 100) if total_to_process > 0 else 0 - - if 0 < processed < total_to_process: - rate = processed / elapsed if elapsed > 0 else 0 - remaining = total_to_process - processed - eta_seconds = remaining / rate if rate > 0 else 0 - eta = format_time_hms(eta_seconds) if eta_seconds > 0 else "0:00:00" - rate_str = f"{rate:.2f} batch/s" - elif processed >= total_to_process: - eta = "0:00:00" - rate_str = f"{processed / elapsed:.2f} batch/s" if elapsed > 0 else "N/A" - else: - eta = "-:--:--" - rate_str = "N/A" - - console.print( - f" [dim cyan]Batch {current_batch}/{total_batches} " - f"({percent:.1f}%) | {rate_str} | ETA: {eta}[/dim cyan]", - ) - - -def _save_checkpoint(checkpoint_file: Path, last_batch_idx: int, embeddings: List[Dict[str, Any]]) -> None: - checkpoint_file.parent.mkdir(parents=True, exist_ok=True) - checkpoint_data = { - "last_batch_idx": last_batch_idx, - "embeddings": embeddings, - } - with open(checkpoint_file, "w", encoding="utf-8") as f: - json.dump(checkpoint_data, f) - console.print(f"[dim cyan]Checkpoint saved at batch {last_batch_idx + 1}[/dim cyan]") diff --git a/preprocessor/utils/batch_processor.py b/preprocessor/utils/batch_processor.py deleted file mode 100644 index 260c29acc..000000000 --- a/preprocessor/utils/batch_processor.py +++ /dev/null @@ -1,24 +0,0 @@ -from typing import ( - Callable, - Generic, - List, - TypeVar, -) - -T = TypeVar('T') -R = TypeVar('R') - -class BatchProcessor(Generic[T, R]): - def __init__(self, batch_size: int): - self.batch_size = batch_size - - def process( - self, - items: 
List[T], - process_fn: Callable[[List[T]], List[R]], - ) -> List[R]: - results = [] - for i in range(0, len(items), self.batch_size): - batch = items[i:i+self.batch_size] - results.extend(process_fn(batch)) - return results diff --git a/preprocessor/utils/constants.py b/preprocessor/utils/constants.py deleted file mode 100644 index e00986f2a..000000000 --- a/preprocessor/utils/constants.py +++ /dev/null @@ -1,200 +0,0 @@ -# pylint: disable=duplicate-code - -class SegmentKeys: - START_TIME = "start_time" - END_TIME = "end_time" - TEXT = "text" - VIDEO_PATH = "video_path" - SEGMENT_ID = "segment_id" - ID = "id" - START = "start" - END = "end" - - -class EpisodeMetadataKeys: - EPISODE_METADATA = "episode_metadata" - EPISODE_INFO = "episode_info" - SEASON = "season" - EPISODE_NUMBER = "episode_number" - SERIES_NAME = "series_name" - TITLE = "title" - PREMIERE_DATE = "premiere_date" - VIEWERSHIP = "viewership" - - -class ElasticsearchKeys: - SOURCE = "_source" - SCORE = "_score" - HITS = "hits" - TOTAL = "total" - AGGREGATIONS = "aggregations" - BUCKETS = "buckets" - KEY = "key" - - -class ElasticsearchAggregationKeys: - UNIQUE_EPISODES = "unique_episodes" - SEASONS = "seasons" - VALUE = "value" - - -class TranscriptionContextKeys: - TARGET = "target" - CONTEXT = "context" - OVERALL_START_TIME = "overall_start_time" - OVERALL_END_TIME = "overall_end_time" - - -class ElasticsearchQueryKeys: - QUERY = "query" - TERM = "term" - MATCH = "match" - BOOL = "bool" - MUST = "must" - FILTER = "filter" - RANGE = "range" - SIZE = "size" - SORT = "sort" - ORDER = "order" - ASC = "asc" - DESC = "desc" - FUZZINESS = "fuzziness" - AUTO = "AUTO" - TERMS = "terms" - FIELD = "field" - AGGS = "aggs" - CARDINALITY = "cardinality" - TOP_HITS = "top_hits" - INCLUDES = "includes" - LT = "lt" - GT = "gt" - SOURCE = "_source" - KEY = "_key" - - -class EpisodesDataKeys: - SEASONS = "seasons" - SEASON_NUMBER = "season_number" - EPISODES = "episodes" - - -class FfprobeKeys: - STREAMS = "streams" - 
FORMAT = "format" - - -class FfprobeStreamKeys: - R_FRAME_RATE = "r_frame_rate" - BIT_RATE = "bit_rate" - CODEC_NAME = "codec_name" - WIDTH = "width" - HEIGHT = "height" - DURATION = "duration" - - -class FfprobeFormatKeys: - DURATION = "duration" - SIZE = "size" - - -class DetectionKeys: - DETECTIONS = "detections" - CHARACTERS = "characters" - FRAME_NUMBER = "frame_number" - FRAME = "frame" - FRAME_NAME = "frame_name" - FRAME_FILE = "frame_file" - - -class CharacterDetectionKeys: - NAME = "name" - CONFIDENCE = "confidence" - EMOTION = "emotion" - BBOX = "bbox" - - -class EmotionKeys: - LABEL = "label" - CONFIDENCE = "confidence" - - -class ObjectDetectionKeys: - CLASS_NAME = "class_name" - CLASS_ID = "class_id" - CONFIDENCE = "confidence" - BBOX = "bbox" - - -class SceneKeys: - SCENES = "scenes" - START = "start" - END = "end" - SCENE_NUMBER = "scene_number" - SCENE_START_FRAME = "scene_start_frame" - SCENE_END_FRAME = "scene_end_frame" - SCENE_START_TIME = "scene_start_time" - SCENE_END_TIME = "scene_end_time" - - -class SceneTimeKeys: - SECONDS = "seconds" - FRAME = "frame" - - -class ElasticDocKeys: - SCENE_INFO = "scene_info" - CHARACTER_APPEARANCES = "character_appearances" - DETECTED_OBJECTS = "detected_objects" - PERCEPTUAL_HASH = "perceptual_hash" - PERCEPTUAL_HASH_INT = "perceptual_hash_int" - - -class EmbeddingKeys: - EPISODE_ID = "episode_id" - TITLE = "title" - TITLE_EMBEDDING = "title_embedding" - EPISODE_METADATA = "episode_metadata" - FRAME_NUMBER = "frame_number" - PERCEPTUAL_HASH = "perceptual_hash" - FRAME_PATH = "frame_path" - TIMESTAMP = "timestamp" - EMBEDDING = "embedding" - SCENE_NUMBER = "scene_number" - - -class ValidationMetadataKeys: - WIDTH = "width" - HEIGHT = "height" - FORMAT = "format" - SIZE_MB = "size_mb" - SIZE_BYTES = "size_bytes" - LINE_COUNT = "line_count" - CODEC = "codec" - DURATION = "duration" - - -class WordKeys: - TYPE = "type" - START = "start" - END = "end" - WORD = "word" - - -class WordTypeValues: - SPACING = 
"spacing" - AUDIO_EVENT = "audio_event" - - -class GoogleSearchKeys: - ENGINE = "engine" - Q = "q" - HL = "hl" - GL = "gl" - API_KEY = "api_key" - IMAGES_RESULTS = "images_results" - - -class ImageResultKeys: - ORIGINAL = "original" - THUMBNAIL = "thumbnail" - IMAGE = "image" diff --git a/preprocessor/utils/detection_io.py b/preprocessor/utils/detection_io.py deleted file mode 100644 index 2ce695797..000000000 --- a/preprocessor/utils/detection_io.py +++ /dev/null @@ -1,92 +0,0 @@ -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.characters.face_detection import detect_characters_in_frame -from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.console import console -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_minimal_episode_info - - -def _parse_frame_number(frame_filename: str) -> Optional[int]: - match = re.search(r'frame_(\d+)', frame_filename) - if match: - return int(match.group(1)) - return None - - -def save_character_detections( - episode_info, - results: List[Dict[str, Any]], - path_manager: Optional[PathManager] = None, - fps: float = 25.0, -) -> Path: - detections_data = { - "episode_info": create_minimal_episode_info(episode_info), - "video_metadata": { - "fps": fps, - }, - "detections": results, - } - - series_name = episode_info.series_name or "unknown" - path_manager = PathManager(series_name) - - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - - if path_manager is None: - path_manager = PathManager(series_name) - - detections_output = path_manager.build_path( - episode_info, - settings.output_subdirs.character_detections, - detections_filename, - ) - atomic_write_json(detections_output, detections_data, indent=2, ensure_ascii=False) - - return detections_output - 
- -def process_frames_for_detection( - frame_files: List[Path], - face_app, - character_vectors: Dict[str, Any], - threshold: float, - fps: float = 25.0, -) -> List[Dict[str, Any]]: - results = [] - for idx, frame_path in enumerate(frame_files): - detected_chars = detect_characters_in_frame( - frame_path, - face_app, - character_vectors, - threshold, - ) - - frame_number = _parse_frame_number(frame_path.name) - timestamp = frame_number / fps if frame_number is not None else None - - frame_result = { - "frame_number": frame_number, - "timestamp": timestamp, - "frame_file": frame_path.name, - "characters": detected_chars, - } - - results.append(frame_result) - - if (idx + 1) % 100 == 0: - console.print(f" Processed {idx + 1}/{len(frame_files)} frames") - - return results diff --git a/preprocessor/utils/file_utils.py b/preprocessor/utils/file_utils.py deleted file mode 100644 index b2747a91e..000000000 --- a/preprocessor/utils/file_utils.py +++ /dev/null @@ -1,18 +0,0 @@ -import json -from pathlib import Path -from typing import Any - - -def atomic_write_json(output_path: Path, data: Any, **kwargs) -> None: - kwargs.setdefault('ensure_ascii', False) - temp_path = output_path.with_suffix(output_path.suffix + '.tmp') - with open(temp_path, 'w', encoding='utf-8') as f: - json.dump(data, f, **kwargs) - temp_path.replace(output_path) - - -def atomic_write_text(output_path: Path, content: str) -> None: - temp_path = output_path.with_suffix(output_path.suffix + '.tmp') - with open(temp_path, 'w', encoding='utf-8') as f: - f.write(content) - temp_path.replace(output_path) diff --git a/preprocessor/utils/hash_save_utils.py b/preprocessor/utils/hash_save_utils.py deleted file mode 100644 index 78d757084..000000000 --- a/preprocessor/utils/hash_save_utils.py +++ /dev/null @@ -1,50 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager -from 
preprocessor.episodes import EpisodeInfo -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_processing_metadata - - -def save_image_hashes_to_json( - episode_info: EpisodeInfo, - hash_results: List[Dict[str, Any]], - series_name: str, - device: str, - batch_size: int, -) -> Path: - path_manager = PathManager(series_name) - episode_dir = path_manager.get_episode_dir(episode_info, settings.output_subdirs.image_hashes) - episode_dir.mkdir(parents=True, exist_ok=True) - - hash_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "device": device, - "batch_size": batch_size, - "hash_size": 8, - }, - statistics={ - "total_hashes": len(hash_results), - "unique_hashes": len(set(h.get("perceptual_hash") for h in hash_results if "perceptual_hash" in h)), - }, - results_key="image_hashes", - results_data=hash_results, - ) - - hash_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - - output_path = episode_dir / hash_filename - atomic_write_json(output_path, hash_data) - - return output_path diff --git a/preprocessor/utils/image_hash_utils.py b/preprocessor/utils/image_hash_utils.py deleted file mode 100644 index 493e3bc8c..000000000 --- a/preprocessor/utils/image_hash_utils.py +++ /dev/null @@ -1,55 +0,0 @@ -import json -from typing import Dict - -from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager -from preprocessor.episodes import EpisodeInfo - - -def load_image_hashes_for_episode( - episode_info_dict: Dict[str, int], - series_name: str, - logger=None, -) -> Dict[int, str]: - season = episode_info_dict.get("season") - episode = episode_info_dict.get("episode_number") - if season is None or episode is None: - return {} - - path_manager = PathManager(series_name) - episode_info = EpisodeInfo.create_minimal(season, episode, series_name) - - hashes_episode_dir = 
path_manager.get_episode_dir( - episode_info, - settings.output_subdirs.image_hashes, - ) - - hash_files = list(hashes_episode_dir.glob("*_image_hashes.json")) - if not hash_files: - if logger: - logger.debug(f"Image hashes not found in: {hashes_episode_dir}") - return {} - - hashes_file = hash_files[0] - - if not hashes_file.exists(): - if logger: - logger.debug(f"Image hashes not found: {hashes_file}") - return {} - - try: - with open(hashes_file, "r", encoding="utf-8") as f: - data = json.load(f) - - hash_map = {} - for item in data.get("image_hashes", []): - frame_num = item.get("frame_number") - phash = item.get("perceptual_hash") - if frame_num is not None and phash: - hash_map[frame_num] = phash - - return hash_map - except Exception as e: - if logger: - logger.error(f"Failed to load image hashes: {e}") - return {} diff --git a/preprocessor/utils/image_hasher.py b/preprocessor/utils/image_hasher.py deleted file mode 100644 index 3981c4703..000000000 --- a/preprocessor/utils/image_hasher.py +++ /dev/null @@ -1,76 +0,0 @@ -import logging -from typing import List - -from PIL import Image -import numpy as np -import torch -import torch.nn.functional as F - - -class PerceptualHasher: - def __init__(self, device: str = "cuda", hash_size: int = 8): - self.device = device - self.hash_size = hash_size - self.resize_size = hash_size * 4 - self.logger = logging.getLogger(__name__) - - def compute_phash_batch(self, pil_images: List[Image.Image]) -> List[str]: - if not pil_images: - return [] - - try: - images_tensor = self.__pil_to_tensor_batch(pil_images) - hashes = self.__compute_phash_tensor(images_tensor) - return hashes - except Exception as e: - self.logger.error(f"Failed to compute pHash: {e}") - return ["0" * 16] * len(pil_images) - - def __pil_to_tensor_batch(self, pil_images: List[Image.Image]) -> torch.Tensor: - tensors = [] - for img in pil_images: - if img.mode != 'L': - img = img.convert('L') - img_resized = img.resize((self.resize_size, self.resize_size), 
Image.Resampling.LANCZOS) - img_array = np.array(img_resized, dtype=np.float32) - tensor = torch.from_numpy(img_array) - tensors.append(tensor) - - batch_tensor = torch.stack(tensors).unsqueeze(1).to(self.device) - return batch_tensor - - def __compute_phash_tensor(self, images: torch.Tensor) -> List[str]: - dct_coeffs = self.__batch_dct2d(images) - - top_left = dct_coeffs[:, :, :self.hash_size, :self.hash_size] - - top_left_flat = top_left.reshape(top_left.size(0), -1) - - median_vals = torch.median(top_left_flat, dim=1, keepdim=True)[0] - - hash_bits = (top_left_flat > median_vals).long() - - hashes = [] - for bits in hash_bits: - hash_int = 0 - for i, bit in enumerate(bits): - if bit: - hash_int |= (1 << i) - hash_hex = f"{hash_int:016x}" - hashes.append(hash_hex) - - return hashes - - # noinspection PyPep8Naming - def __batch_dct2d(self, images: torch.Tensor) -> torch.Tensor: - N, C, H, W = images.shape # pylint: disable=unused-variable - - if H != W or H != self.resize_size: - images = F.interpolate(images, size=(self.resize_size, self.resize_size), mode='bilinear', align_corners=False) - - freq_h = torch.fft.fft(images, dim=2) - freq_hw = torch.fft.fft(freq_h, dim=3) - - dct_coeffs = freq_hw.real - - return dct_coeffs diff --git a/preprocessor/utils/metadata_utils.py b/preprocessor/utils/metadata_utils.py deleted file mode 100644 index b1082a99a..000000000 --- a/preprocessor/utils/metadata_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from datetime import datetime -from typing import ( - Any, - Dict, - List, -) - - -def create_minimal_episode_info(episode_info) -> Dict[str, Any]: - return { - "season": episode_info.season, - "episode_number": episode_info.relative_episode, - } - - -def create_processing_metadata( - episode_info, - processing_params: Dict[str, Any], - statistics: Dict[str, Any], - results_key: str, - results_data: List[Any], -) -> Dict[str, Any]: - return { - "generated_at": datetime.now().isoformat(), - "episode_info": 
create_minimal_episode_info(episode_info), - "processing_parameters": processing_params, - "statistics": statistics, - results_key: results_data, - } diff --git a/preprocessor/utils/resource_scope.py b/preprocessor/utils/resource_scope.py deleted file mode 100644 index 59b04b8c6..000000000 --- a/preprocessor/utils/resource_scope.py +++ /dev/null @@ -1,19 +0,0 @@ -import gc -import sys - - -class ResourceScope: - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - gc.collect() - if "torch" in sys.modules: - import torch # pylint: disable=import-outside-toplevel - - if torch.cuda.is_available() and torch.cuda.is_initialized(): - try: - torch.cuda.synchronize() - torch.cuda.empty_cache() - except Exception: - pass diff --git a/preprocessor/utils/time_utils.py b/preprocessor/utils/time_utils.py deleted file mode 100644 index a9ebf84e2..000000000 --- a/preprocessor/utils/time_utils.py +++ /dev/null @@ -1,17 +0,0 @@ -def format_time_hms(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int((seconds % 3600) // 60) - secs = int(seconds % 60) - return f"{hours}:{minutes:02d}:{secs:02d}" - - -def format_time_human(seconds: float) -> str: - if seconds < 60: - return f"{seconds:.1f}s" - minutes = int(seconds // 60) - secs = int(seconds % 60) - if minutes < 60: - return f"{minutes}m {secs}s" - hours = minutes // 60 - minutes = minutes % 60 - return f"{hours}h {minutes}m {secs}s" diff --git a/preprocessor/utils/transcription_utils.py b/preprocessor/utils/transcription_utils.py deleted file mode 100644 index a8c9c4b13..000000000 --- a/preprocessor/utils/transcription_utils.py +++ /dev/null @@ -1,57 +0,0 @@ -import codecs -import json -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, -) - - -def _convert_word_to_standard_format(word: Dict[str, Any]) -> Dict[str, Any]: - return { - "text": word.get("word", word.get("text", "")).strip(), - "start": word.get("start", 0.0), - "end": word.get("end", 
0.0), - "type": "word", - "speaker_id": word.get("speaker_id", "speaker_unknown"), - "logprob": word.get("probability", 0.0), - } - - -def convert_words_list(seg_words: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - return [_convert_word_to_standard_format(word) for word in seg_words] - - -def _fix_unicode_escapes(text: str) -> str: - def replace_unicode(match): - unicode_str = match.group(0) - try: - return codecs.decode(unicode_str, 'unicode_escape') - except Exception: - return unicode_str - - pattern = r'\\u[0-9a-fA-F]{4}' - return re.sub(pattern, replace_unicode, text) - - -def fix_transcription_file_unicode(file_path: Path) -> bool: - try: - with open(file_path, "r", encoding="utf-8") as f: - content = f.read() - - if '\\u' not in content: - return False - - fixed_content = _fix_unicode_escapes(content) - - if fixed_content != content: - data = json.loads(fixed_content) - with open(file_path, "w", encoding="utf-8") as f: - json.dump(data, f, indent=2, ensure_ascii=False) - return True - - return False - except Exception: - return False diff --git a/preprocessor/utils/video_utils.py b/preprocessor/utils/video_utils.py deleted file mode 100644 index 721fc8ceb..000000000 --- a/preprocessor/utils/video_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from typing import ( - Generator, - Tuple, -) - -import decord -import numpy as np - - -def iterate_frames_with_histogram( - video_path: str, - sample_interval: int = 5, -) -> Generator[Tuple[int, np.ndarray, np.ndarray], None, None]: - vr = decord.VideoReader(video_path, ctx=decord.cpu(0)) - total_frames = len(vr) - - for frame_num in range(0, total_frames, sample_interval): - try: - frame_tensor = vr[frame_num] - frame_np = frame_tensor.numpy() - - gray = np.dot(frame_np[..., :3], [0.299, 0.587, 0.114]).astype(np.uint8) - hist, _ = np.histogram(gray, bins=256, range=(0, 256)) - hist = hist / (hist.sum() + 1e-7) - - yield frame_num, frame_np, hist - - except (RuntimeError, ValueError, OSError): - break diff --git 
a/preprocessor/validation/__init__.py b/preprocessor/validation/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/validation/episode_stats.py b/preprocessor/validation/episode_stats.py deleted file mode 100644 index a683c4b48..000000000 --- a/preprocessor/validation/episode_stats.py +++ /dev/null @@ -1,511 +0,0 @@ -from dataclasses import ( - dataclass, - field, -) -import json -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) - -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.core.constants import ( - DEFAULT_VIDEO_EXTENSION, - OUTPUT_FILE_NAMES, - OUTPUT_FILE_PATTERNS, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.episodes import EpisodeInfo -from preprocessor.validation.base_result import ValidationStatusMixin -from preprocessor.validation.file_validators import ( - validate_image_file, - validate_json_file, - validate_jsonl_file, - validate_video_file, -) - -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs - - -@dataclass -class EpisodeStats(ValidationStatusMixin): # pylint: disable=too-many-instance-attributes - episode_info: EpisodeInfo - series_name: str - errors: List[str] = field(default_factory=list) - warnings: List[str] = field(default_factory=list) - - transcription_chars: Optional[int] = None - transcription_duration: Optional[float] = None - transcription_words: Optional[int] = None - - exported_frames_count: Optional[int] = None - exported_frames_total_size_mb: Optional[float] = None - exported_frames_avg_resolution: Optional[Tuple[int, int]] = None - - video_size_mb: Optional[float] = None - video_duration: Optional[float] = None - video_codec: Optional[str] = None - video_resolution: Optional[Tuple[int, int]] = None - - scenes_count: Optional[int] = None - scenes_avg_duration: Optional[float] = None - - image_hashes_count: Optional[int] = None - object_detections_count: Optional[int] = None 
- object_visualizations_count: Optional[int] = None - character_visualizations_count: Optional[int] = None - face_clusters_count: Optional[int] = None - face_clusters_total_faces: Optional[int] = None - - def collect_stats(self): - self.__validate_transcription() - self.__validate_exported_frames() - self.__validate_video() - self.__validate_scenes() - self.__validate_image_hashes() - self.__validate_character_visualizations() - self.__validate_face_clusters() - self.__validate_object_detections() - self.__validate_object_visualizations() - self.__validate_other_files() - - def __validate_transcription(self): - transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.transcriptions) - base_name = f"{self.series_name}_{self.episode_info.episode_code()}" - - raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events - - transcription_files = { - "main": raw_dir / f"{base_name}.json", - "segmented": raw_dir / f"{base_name}_segmented.json", - "simple": raw_dir / f"{base_name}_simple.json", - "clean": clean_dir / f"{base_name}_clean_transcription.json", - "clean_txt": clean_dir / f"{base_name}_clean_transcription.txt", - "sound_events": sound_events_dir / f"{base_name}_sound_events.json", - } - - if not any(f.exists() for f in transcription_files.values()): - self.errors.append("No transcription files found in any format") - return - - self.__validate_raw_transcription(transcription_files) - self.__validate_clean_transcription(transcription_files["clean"]) - self.__validate_clean_txt(transcription_files["clean_txt"]) - self.__validate_sound_events(transcription_files["sound_events"]) - - def __validate_raw_transcription(self, transcription_files: Dict[str, Any]): - raw_transcription = None - for key in ("main", 
"segmented", "simple"): - if transcription_files[key].exists(): - raw_transcription = transcription_files[key] - break - - if not raw_transcription: - self.warnings.append("Missing raw transcription file (checked: .json, _segmented.json, _simple.json)") - return - - result = validate_json_file(raw_transcription) - if not result.is_valid: - self.errors.append(f"Invalid transcription JSON: {result.error_message}") - return - - self.__extract_transcription_stats(raw_transcription) - - def __extract_transcription_stats(self, raw_transcription): - try: - with open(raw_transcription, "r", encoding="utf-8") as f: - data = json.load(f) - - text = data.get("text", "") - if not text: - segments = data.get("segments", []) - if segments: - text = " ".join(seg.get("text", "") for seg in segments) - - self.transcription_chars = len(text) - self.transcription_words = len(text.split()) - - words = data.get("words", []) - if words: - self.transcription_duration = words[-1].get("end", 0.0) - else: - segments = data.get("segments", []) - if segments and segments[-1].get("end"): - self.transcription_duration = segments[-1].get("end", 0.0) - except Exception as e: - self.errors.append(f"Error reading transcription: {e}") - - def __validate_clean_transcription(self, clean_transcription_file): - if not clean_transcription_file.exists(): - self.warnings.append(f"Missing clean transcription file: {clean_transcription_file.name}") - return - - result = validate_json_file(clean_transcription_file) - if not result.is_valid: - self.warnings.append(f"Invalid clean transcription JSON: {result.error_message}") - - def __validate_clean_txt(self, clean_txt_file): - if not clean_txt_file.exists(): - self.warnings.append(f"Missing clean transcription txt: {clean_txt_file.name}") - - def __validate_sound_events(self, sound_events_file): - if not sound_events_file.exists(): - self.warnings.append(f"Missing sound events file: {sound_events_file.name}") - return - - result = 
validate_json_file(sound_events_file) - if not result.is_valid: - self.warnings.append(f"Invalid sound events JSON: {result.error_message}") - - def __validate_exported_frames(self): - frames_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.frames) - if not frames_dir.exists(): - self.warnings.append(f"Missing {settings.output_subdirs.frames} directory: {frames_dir}") - return - - frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS["frame"])) - if not frame_files: - self.warnings.append(f"No frames found in {settings.output_subdirs.frames}/") - return - - self.exported_frames_count = len(frame_files) - - total_size = 0 - resolutions = [] - invalid_count = 0 - - for frame_file in frame_files: - result = validate_image_file(frame_file) - if result.is_valid: - total_size += result.metadata["size_mb"] - resolutions.append((result.metadata["width"], result.metadata["height"])) - else: - invalid_count += 1 - self.errors.append(f"Invalid frame {frame_file.name}: {result.error_message}") - - if invalid_count > 0: - self.warnings.append(f"{invalid_count} invalid frames found") - - self.exported_frames_total_size_mb = round(total_size, 2) - - if resolutions: - most_common_res = max(set(resolutions), key=resolutions.count) - self.exported_frames_avg_resolution = most_common_res - - def __validate_video(self): - filename = f"{self.series_name.lower()}_{self.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}" - season_dir = get_base_output_dir(self.series_name) / settings.output_subdirs.video / self.episode_info.season_code() - video_file = season_dir / filename - - if not video_file.exists(): - self.warnings.append(f"Missing video file: {video_file}") - return - - result = validate_video_file(video_file) - if not result.is_valid: - self.errors.append(f"Invalid video: {result.error_message}") - return - - self.video_size_mb = result.metadata["size_mb"] - self.video_duration = result.metadata["duration"] - self.video_codec = 
result.metadata["codec"] - self.video_resolution = (result.metadata["width"], result.metadata["height"]) - - def __validate_scenes(self): - scenes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.scenes) - scenes_file = scenes_dir / f"{self.series_name}_{self.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" - if not scenes_file.exists(): - self.errors.append(f"Missing scenes file: {scenes_file}") - return - - result = validate_json_file(scenes_file) - if not result.is_valid: - self.errors.append(f"Invalid scenes JSON: {result.error_message}") - return - - try: - with open(scenes_file, "r", encoding="utf-8") as f: - data = json.load(f) - - self.scenes_count = data.get("total_scenes", 0) - scenes = data.get("scenes", []) - if scenes: - durations = [scene.get("duration", 0) for scene in scenes] - self.scenes_avg_duration = round(sum(durations) / len(durations), 2) - except Exception as e: - self.errors.append(f"Error reading scenes: {e}") - - def __validate_image_hashes(self): - hashes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.image_hashes) - if not hashes_dir.exists(): - self.warnings.append(f"Missing {settings.output_subdirs.image_hashes} directory") - return - - json_files = list(hashes_dir.glob("*.json")) - if not json_files: - self.warnings.append(f"No JSON files in {settings.output_subdirs.image_hashes}/") - return - - self.image_hashes_count = len(json_files) - sizes = [] - - for json_file in json_files: - result = validate_json_file(json_file) - if not result.is_valid: - self.errors.append(f"Invalid image hash JSON {json_file.name}: {result.error_message}") - else: - sizes.append(json_file.stat().st_size) - - self.__check_size_anomalies(sizes, "image_hashes") - - def __validate_character_visualizations(self): - viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.character_visualizations) - if not 
viz_dir.exists(): - return - - image_files = list(viz_dir.glob("*.jpg")) + list(viz_dir.glob("*.png")) - if not image_files: - self.warnings.append(f"No visualization images in {settings.output_subdirs.character_visualizations}/") - return - - self.character_visualizations_count = len(image_files) - invalid_count = 0 - - for img_file in image_files: - result = validate_image_file(img_file) - if not result.is_valid: - invalid_count += 1 - self.errors.append(f"Invalid character visualization {img_file.name}: {result.error_message}") - - if invalid_count > 0: - self.warnings.append(f"{invalid_count} invalid character visualization images found") - - def __validate_face_clusters(self): - clusters_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.face_clusters) - if not clusters_dir.exists(): - return - - metadata_files = list(clusters_dir.glob("*_face_clusters.json")) - metadata_file = metadata_files[0] if metadata_files else None - - if not metadata_file or not metadata_file.exists(): - self.warnings.append("Missing face clustering metadata file") - return - - result = validate_json_file(metadata_file) - if not result.is_valid: - self.errors.append(f"Invalid face clustering metadata: {result.error_message}") - return - - try: - with open(metadata_file, "r", encoding="utf-8") as f: - data = json.load(f) - - clusters = data.get("clusters", {}) - - if isinstance(clusters, dict): - self.face_clusters_count = len(clusters) - total_faces = 0 - for _, cluster_info in clusters.items(): - total_faces += cluster_info.get("face_count", 0) - elif isinstance(clusters, list): - self.face_clusters_count = len(clusters) - total_faces = 0 - for cluster_info in clusters: - total_faces += cluster_info.get("face_count", 0) - else: - self.warnings.append("Unexpected clusters format in face clustering metadata") - return - - noise_info = data.get("noise", {}) - if noise_info: - total_faces += noise_info.get("face_count", 0) - - 
self.face_clusters_total_faces = total_faces - - except Exception as e: - self.errors.append(f"Error reading face clustering metadata: {e}") - - def __validate_object_detections(self): - detections_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.object_detections) - if not detections_dir.exists(): - self.warnings.append(f"Missing {settings.output_subdirs.object_detections} directory") - return - - json_files = [f for f in detections_dir.glob("*.json") if "visualizations" not in str(f)] - if not json_files: - self.warnings.append(f"No JSON files in {settings.output_subdirs.object_detections}/") - return - - self.object_detections_count = len(json_files) - sizes = [] - - for json_file in json_files: - result = validate_json_file(json_file) - if not result.is_valid: - self.errors.append(f"Invalid object detection JSON {json_file.name}: {result.error_message}") - else: - sizes.append(json_file.stat().st_size) - - self.__check_size_anomalies(sizes, "object_detections") - - def __validate_object_visualizations(self): - viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.object_visualizations) - if not viz_dir.exists(): - return - - image_files = list(viz_dir.glob("*.jpg")) + list(viz_dir.glob("*.png")) - if not image_files: - self.warnings.append(f"No visualization images in {settings.output_subdirs.object_visualizations}/") - return - - self.object_visualizations_count = len(image_files) - invalid_count = 0 - - for img_file in image_files: - result = validate_image_file(img_file) - if not result.is_valid: - invalid_count += 1 - self.errors.append(f"Invalid visualization {img_file.name}: {result.error_message}") - - if invalid_count > 0: - self.warnings.append(f"{invalid_count} invalid visualization images found") - - def __validate_embedding_dimensions(self, jsonl_file, subdir: str): - embedding_fields = { - ELASTIC_SUBDIRS.text_embeddings: "text_embedding", - 
ELASTIC_SUBDIRS.video_frames: "video_embedding", - ELASTIC_SUBDIRS.episode_names: "title_embedding", - ELASTIC_SUBDIRS.full_episode_embeddings: "full_episode_embedding", - ELASTIC_SUBDIRS.sound_event_embeddings: "sound_event_embedding", - } - - if subdir not in embedding_fields: - return - - embedding_field = embedding_fields[subdir] - expected_dim = settings.embedding_model.embedding_dim - - try: - with open(jsonl_file, "r", encoding="utf-8") as f: - for line_num, line in enumerate(f, 1): - if not line.strip(): - continue - doc = json.loads(line) - if embedding_field in doc: - embedding = doc[embedding_field] - if isinstance(embedding, list): - actual_dim = len(embedding) - if actual_dim != expected_dim: - self.errors.append( - f"{jsonl_file.name} line {line_num}: " - f"{embedding_field} has {actual_dim} dimensions, expected {expected_dim}", - ) - return - except Exception as e: - self.errors.append(f"Error validating embeddings in {jsonl_file.name}: {e}") - - def __check_size_anomalies(self, sizes: List[int], folder_name: str, threshold: float = 0.2): - if len(sizes) < 2: - return - - avg_size = sum(sizes) / len(sizes) - if avg_size == 0: - return - - for i, size in enumerate(sizes): - deviation = abs(size - avg_size) / avg_size - if deviation > threshold: - self.warnings.append( - f"{folder_name} file #{i+1} size deviation: {deviation*100:.1f}% from average", - ) - - def __validate_other_files(self): - char_detections_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.character_detections) - detections_file = char_detections_dir / OUTPUT_FILE_NAMES["detections"] - if detections_file.exists(): - result = validate_json_file(detections_file) - if not result.is_valid: - self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") - - embeddings_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.embeddings) - if embeddings_dir.exists(): - embeddings_file 
= embeddings_dir / OUTPUT_FILE_NAMES["embeddings_text"] - if embeddings_file.exists(): - result = validate_json_file(embeddings_file) - if not result.is_valid: - self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}: {result.error_message}") - - elastic_subdirs = [ - ELASTIC_SUBDIRS.text_segments, - ELASTIC_SUBDIRS.text_embeddings, - ELASTIC_SUBDIRS.video_frames, - ELASTIC_SUBDIRS.episode_names, - ELASTIC_SUBDIRS.text_statistics, - ELASTIC_SUBDIRS.full_episode_embeddings, - ELASTIC_SUBDIRS.sound_events, - ELASTIC_SUBDIRS.sound_event_embeddings, - ] - found_elastic_docs = False - for subdir in elastic_subdirs: - elastic_docs_dir = PathManager(self.series_name).get_episode_dir( - self.episode_info, - f"{settings.output_subdirs.elastic_documents}/{subdir}", - ) - if elastic_docs_dir.exists(): - found_elastic_docs = True - for jsonl_file in elastic_docs_dir.glob("*.jsonl"): - result = validate_jsonl_file(jsonl_file) - if not result.is_valid: - self.errors.append(f"Invalid JSONL {jsonl_file.name}: {result.error_message}") - else: - self.__validate_embedding_dimensions(jsonl_file, subdir) - - if not found_elastic_docs: - self.warnings.append(f"Missing {settings.output_subdirs.elastic_documents} directory") - - transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info,settings.output_subdirs.transcriptions) - if transcriptions_dir.exists(): - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - text_stats_file = clean_dir / f"{self.series_name}_{self.episode_info.episode_code()}_text_stats.json" - if text_stats_file.exists(): - result = validate_json_file(text_stats_file) - if not result.is_valid: - self.errors.append(f"Invalid text_stats JSON: {result.error_message}") - else: - self.warnings.append(f"Missing text statistics file: {text_stats_file.name}") - - def to_dict(self) -> Dict[str, Any]: - return { - "status": self.status, - "errors": self.errors, - "warnings": self.warnings, - "stats": { - 
"transcription_chars": self.transcription_chars, - "transcription_duration": self.transcription_duration, - "transcription_words": self.transcription_words, - "exported_frames_count": self.exported_frames_count, - "exported_frames_total_size_mb": self.exported_frames_total_size_mb, - "exported_frames_avg_resolution": self.exported_frames_avg_resolution, - "video_size_mb": self.video_size_mb, - "video_duration": self.video_duration, - "video_codec": self.video_codec, - "video_resolution": self.video_resolution, - "scenes_count": self.scenes_count, - "scenes_avg_duration": self.scenes_avg_duration, - "image_hashes_count": self.image_hashes_count, - "character_visualizations_count": self.character_visualizations_count, - "face_clusters_count": self.face_clusters_count, - "face_clusters_total_faces": self.face_clusters_total_faces, - "object_detections_count": self.object_detections_count, - "object_visualizations_count": self.object_visualizations_count, - }, - } diff --git a/preprocessor/validation/file_validators.py b/preprocessor/validation/file_validators.py deleted file mode 100644 index bb3c7bff4..000000000 --- a/preprocessor/validation/file_validators.py +++ /dev/null @@ -1,178 +0,0 @@ -from dataclasses import dataclass -import json -from pathlib import Path -import subprocess -from typing import ( - Any, - Dict, - Optional, -) -import zipfile - -from PIL import Image - -from preprocessor.utils.constants import ( - FfprobeFormatKeys, - FfprobeKeys, - FfprobeStreamKeys, - ValidationMetadataKeys, -) - - -@dataclass -class ValidationResult: - is_valid: bool - error_message: Optional[str] = None - metadata: Optional[Dict[str, Any]] = None - - -def validate_json_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - with open(path, "r", encoding="utf-8") as f: - json.load(f) - return ValidationResult(is_valid=True, metadata={ValidationMetadataKeys.SIZE_BYTES: 
path.stat().st_size}) - except json.JSONDecodeError as e: - return ValidationResult(is_valid=False, error_message=f"Invalid JSON: {e}") - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error reading file: {e}") - - -def validate_jsonl_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - line_count = 0 - with open(path, "r", encoding="utf-8") as f: - for line_num, line in enumerate(f, 1): - line = line.strip() - if not line: - continue - try: - json.loads(line) - line_count += 1 - except json.JSONDecodeError as e: - return ValidationResult( - is_valid=False, - error_message=f"Invalid JSON at line {line_num}: {e}", - ) - return ValidationResult( - is_valid=True, - metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size, ValidationMetadataKeys.LINE_COUNT: line_count}, - ) - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error reading file: {e}") - - -def validate_image_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - with Image.open(path) as img: - img.verify() - with Image.open(path) as img: - width, height = img.size - format_type = img.format - size_mb = path.stat().st_size / (1024 * 1024) - - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.WIDTH: width, - ValidationMetadataKeys.HEIGHT: height, - ValidationMetadataKeys.FORMAT: format_type, - ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), - }, - ) - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Invalid image: {e}") - - -def validate_video_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - result = subprocess.run( - [ - "ffprobe", - "-v", - 
"error", - "-select_streams", - "v:0", - "-show_entries", - "stream=codec_name,width,height,duration", - "-show_entries", - "format=duration,size", - "-of", - "json", - str(path), - ], - capture_output=True, - text=True, - check=True, - ) - - probe_data = json.loads(result.stdout) - stream = probe_data.get(FfprobeKeys.STREAMS, [{}])[0] - format_info = probe_data.get(FfprobeKeys.FORMAT, {}) - - duration = float(stream.get(FfprobeStreamKeys.DURATION, format_info.get(FfprobeFormatKeys.DURATION, 0))) - size_bytes = int(format_info.get(FfprobeFormatKeys.SIZE, 0)) - size_mb = size_bytes / (1024 * 1024) - - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.CODEC: stream.get(FfprobeStreamKeys.CODEC_NAME), - ValidationMetadataKeys.WIDTH: stream.get(FfprobeStreamKeys.WIDTH), - ValidationMetadataKeys.HEIGHT: stream.get(FfprobeStreamKeys.HEIGHT), - ValidationMetadataKeys.DURATION: round(duration, 2), - ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), - }, - ) - except subprocess.CalledProcessError as e: - return ValidationResult(is_valid=False, error_message=f"ffprobe error: {e.stderr}") - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error validating video: {e}") - - -def validate_archive_file(path: Path) -> ValidationResult: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f"File does not exist: {path}") - - try: - with zipfile.ZipFile(path, "r") as zip_ref: - bad_file = zip_ref.testzip() - if bad_file: - return ValidationResult( - is_valid=False, - error_message=f"Corrupt file in archive: {bad_file}", - ) - - file_count = len(zip_ref.namelist()) - compressed_size = sum(info.compress_size for info in zip_ref.infolist()) - uncompressed_size = sum(info.file_size for info in zip_ref.infolist()) - - compression_ratio = 0 - if uncompressed_size > 0: - compression_ratio = (1 - compressed_size / uncompressed_size) * 100 - - return ValidationResult( - is_valid=True, - metadata={ - 
ValidationMetadataKeys.SIZE_MB: round(path.stat().st_size / (1024 * 1024), 2), - "file_count": file_count, - "compressed_size_mb": round(compressed_size / (1024 * 1024), 2), - "uncompressed_size_mb": round(uncompressed_size / (1024 * 1024), 2), - "compression_ratio": round(compression_ratio, 2), - }, - ) - except zipfile.BadZipFile as e: - return ValidationResult(is_valid=False, error_message=f"Invalid ZIP file: {e}") - except Exception as e: - return ValidationResult(is_valid=False, error_message=f"Error validating archive: {e}") diff --git a/preprocessor/validation/global_validator.py b/preprocessor/validation/global_validator.py deleted file mode 100644 index e9ed67bb8..000000000 --- a/preprocessor/validation/global_validator.py +++ /dev/null @@ -1,117 +0,0 @@ -from pathlib import Path -from typing import List - -from preprocessor.validation.base_result import BaseValidationResult -from preprocessor.validation.file_validators import ( - validate_image_file, - validate_json_file, -) - - -class GlobalValidationResult(BaseValidationResult): - pass - - -class GlobalValidator: - def __init__( - self, - series_name: str, - base_output_dir: Path, - ): - self.series_name = series_name - self.base_output_dir = base_output_dir - self.result = GlobalValidationResult() - - def validate(self) -> GlobalValidationResult: - self.__validate_main_json_files() - self.__validate_characters_folder() - self.__validate_processing_metadata() - return self.result - - def __validate_main_json_files(self): - episodes_file = self.base_output_dir / f"{self.series_name}_episodes.json" - if episodes_file.exists(): - result = validate_json_file(episodes_file) - if not result.is_valid: - self.result.errors.append(f"Invalid {episodes_file.name}: {result.error_message}") - else: - self.result.stats["episodes_json_valid"] = True - else: - self.result.warnings.append(f"Missing {episodes_file.name}") - - characters_file = self.base_output_dir / f"{self.series_name}_characters.json" - if 
characters_file.exists(): - result = validate_json_file(characters_file) - if not result.is_valid: - self.result.errors.append(f"Invalid {characters_file.name}: {result.error_message}") - else: - self.result.stats["characters_json_valid"] = True - else: - self.result.warnings.append(f"Missing {characters_file.name}") - - def __validate_characters_folder(self): - characters_dir = self.base_output_dir / "characters" - if not characters_dir.exists(): - self.result.warnings.append("Missing characters/ directory") - return - - character_folders = [d for d in characters_dir.iterdir() if d.is_dir()] - if not character_folders: - self.result.warnings.append("No character folders in characters/") - return - - self.result.stats["character_folders_count"] = len(character_folders) - - total_images = 0 - invalid_images = 0 - characters_without_images: List[str] = [] - - for char_folder in character_folders: - image_files = ( - list(char_folder.glob("*.jpg")) + - list(char_folder.glob("*.jpeg")) + - list(char_folder.glob("*.png")) + - list(char_folder.glob("*.webp")) - ) - - if not image_files: - characters_without_images.append(char_folder.name) - continue - - total_images += len(image_files) - - for img_file in image_files: - result = validate_image_file(img_file) - if not result.is_valid: - invalid_images += 1 - self.result.errors.append( - f"Invalid character image {char_folder.name}/{img_file.name}: {result.error_message}", - ) - - self.result.stats["character_images_count"] = total_images - self.result.stats["invalid_character_images"] = invalid_images - - if characters_without_images: - self.result.warnings.append( - f"{len(characters_without_images)} characters without reference images", - ) - - def __validate_processing_metadata(self): - metadata_dir = self.base_output_dir / "processing_metadata" - if not metadata_dir.exists(): - self.result.warnings.append("Missing processing_metadata/ directory") - return - - json_files = list(metadata_dir.glob("*.json")) - if not 
json_files: - self.result.warnings.append("No JSON files in processing_metadata/") - return - - self.result.stats["processing_metadata_files"] = len(json_files) - - for json_file in json_files: - result = validate_json_file(json_file) - if not result.is_valid: - self.result.errors.append( - f"Invalid processing metadata {json_file.name}: {result.error_message}", - ) diff --git a/preprocessor/validation/validator.py b/preprocessor/validation/validator.py deleted file mode 100644 index 6f5c04cef..000000000 --- a/preprocessor/validation/validator.py +++ /dev/null @@ -1,152 +0,0 @@ -from datetime import datetime -from pathlib import Path -from typing import ( - Dict, - Optional, -) - -from rich.console import Console -from rich.progress import track - -from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.validation.episode_stats import EpisodeStats -from preprocessor.validation.report_generator import ReportGenerator -from preprocessor.validation.season_comparator import SeasonComparison - -console = Console() - - -class Validator: - def __init__( - self, - season: str, - series_name: str = "ranczo", - anomaly_threshold: float = 20.0, - base_output_dir: Path = None, - episodes_info_json: Optional[Path] = None, - ): - self.season = season - self.series_name = series_name - self.anomaly_threshold = anomaly_threshold - self.base_output_dir = base_output_dir - self.episode_manager = EpisodeManager(episodes_info_json, series_name) - self.validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports - - def validate(self) -> int: - transcriptions_season_path = self.base_output_dir / "transcriptions" / self.season - if not transcriptions_season_path.exists(): - console.print(f"[red]Season directory not found: {transcriptions_season_path}[/red]") - return 1 - - 
console.print(f"[bold cyan]Validating season {self.season}...[/bold cyan]") - - episodes_stats = self.__collect_episodes_stats(transcriptions_season_path) - - if not episodes_stats: - console.print(f"[red]No episodes found in {transcriptions_season_path}[/red]") - return 1 - - self.validation_reports_dir.mkdir(parents=True, exist_ok=True) - - self.__generate_episode_reports(episodes_stats) - - season_comparison = SeasonComparison( - season=self.season, - anomaly_threshold=self.anomaly_threshold, - ) - season_comparison.compare_episodes(episodes_stats) - - report_generator = ReportGenerator( - season=self.season, - anomaly_threshold=self.anomaly_threshold, - ) - season_report_path = self.validation_reports_dir / f"{self.series_name}_{self.season}_season.json" - report_generator.generate_report(episodes_stats, season_comparison, season_report_path) - - self.__print_summary(episodes_stats, season_comparison) - - console.print(f"\n[green]Validation reports saved to: {self.validation_reports_dir}[/green]") - - return 0 - - def __collect_episodes_stats(self, transcriptions_season_path: Path) -> Dict[str, EpisodeStats]: - episode_dirs = sorted([d for d in transcriptions_season_path.iterdir() if d.is_dir() and d.name.startswith("E")]) - - episodes_stats = {} - for episode_dir in track(episode_dirs, description="Collecting episode stats"): - episode_num = int(episode_dir.name[1:]) - season_num = int(self.season[1:]) - - episode_info = self.episode_manager.get_episode_by_season_and_relative(season_num, episode_num) - if not episode_info: - console.print(f"[yellow]Skipping {episode_dir.name}: could not parse episode info[/yellow]") - continue - - episode_id = episode_info.episode_code() - stats = EpisodeStats( - episode_info=episode_info, - series_name=self.series_name, - ) - stats.collect_stats() - episodes_stats[episode_id] = stats - - return episodes_stats - - def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]): - for stats in 
episodes_stats.values(): - episode_report = { - "validation_timestamp": datetime.now().isoformat(), - "episode_id": stats.episode_info.episode_code(), - "episode_title": stats.episode_info.title, - "status": stats.status, - "errors": stats.errors, - "warnings": stats.warnings, - "stats": stats.to_dict()["stats"], - } - - path_manager = PathManager(self.series_name) - report_filename = path_manager.build_filename(stats.episode_info, extension="json") - report_path = self.validation_reports_dir / report_filename - atomic_write_json(report_path, episode_report) - - def __print_summary(self, episodes_stats: Dict[str, EpisodeStats], season_comparison: SeasonComparison): - console.print(f"\n[bold]Validation Summary for {self.season}[/bold]") - console.print(f"Total episodes: {len(episodes_stats)}") - - pass_count = sum(1 for stats in episodes_stats.values() if stats.status == "PASS") - warning_count = sum(1 for stats in episodes_stats.values() if stats.status == "WARNING") - fail_count = sum(1 for stats in episodes_stats.values() if stats.status == "FAIL") - - console.print(f" [green]PASS:[/green] {pass_count}") - console.print(f" [yellow]WARNING:[/yellow] {warning_count}") - console.print(f" [red]FAIL:[/red] {fail_count}") - - if season_comparison.anomalies: - console.print(f"\n[bold yellow]Anomalies detected: {len(season_comparison.anomalies)}[/bold yellow]") - for anomaly in season_comparison.anomalies[:5]: - color = "red" if anomaly.severity == "ERROR" else "yellow" - console.print( - f" [{color}]{anomaly.episode}[/{color}]: " - f"{anomaly.metric} = {anomaly.value} " - f"(avg: {anomaly.avg}, deviation: {anomaly.deviation_percent:.1f}%)", - ) - if len(season_comparison.anomalies) > 5: - console.print(f" ... 
and {len(season_comparison.anomalies) - 5} more") - - for episode_id, stats in episodes_stats.items(): - if stats.errors: - console.print(f"\n[red]Errors in {episode_id}:[/red]") - for error in stats.errors[:3]: - console.print(f" - {error}") - if len(stats.errors) > 3: - console.print(f" ... and {len(stats.errors) - 3} more") - - if stats.warnings: - console.print(f"\n[yellow]Warnings in {episode_id}:[/yellow]") - for warning in stats.warnings[:3]: - console.print(f" - {warning}") - if len(stats.warnings) > 3: - console.print(f" ... and {len(stats.warnings) - 3} more") diff --git a/preprocessor/video/__init__.py b/preprocessor/video/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/video/emotion_utils.py b/preprocessor/video/emotion_utils.py deleted file mode 100644 index 3e09bf26d..000000000 --- a/preprocessor/video/emotion_utils.py +++ /dev/null @@ -1,127 +0,0 @@ -from typing import ( - Dict, - List, - Optional, - Tuple, -) - -from hsemotion_onnx.facial_emotions import HSEmotionRecognizer -import numpy as np - -from preprocessor.config.config import settings -from preprocessor.utils.console import console - -EMOTION_LABELS = [ - 'anger', - 'contempt', - 'disgust', - 'fear', - 'happiness', - 'neutral', - 'sadness', - 'surprise', -] - - -def init_emotion_model() -> HSEmotionRecognizer: - model_name = settings.emotion_detection.model_name - - console.print(f"[cyan]Loading HSEmotion model: {model_name}...[/cyan]") - - try: - fer = HSEmotionRecognizer(model_name=model_name) - console.print(f"[green]✓ HSEmotion model loaded: {model_name}[/green]") - return fer - except Exception as e: - raise RuntimeError(f"Failed to load HSEmotion model {model_name}: {e}") from e - - -def detect_emotion( - face_image: np.ndarray, - model: HSEmotionRecognizer, -) -> Tuple[str, float, Dict[str, float]]: - try: - emotion, scores = model.predict_emotions(face_image, logits=False) - - emotion_scores = { - EMOTION_LABELS[i]: float(scores[i]) - for i 
in range(len(EMOTION_LABELS)) - } - - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - - return dominant_emotion, confidence, emotion_scores - - except Exception as e: - raise RuntimeError(f"Emotion detection failed: {e}") from e - - -def crop_face_from_frame(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: - try: - x1, y1 = bbox['x1'], bbox['y1'] - x2, y2 = bbox['x2'], bbox['y2'] - - if x1 < 0 or y1 < 0 or x2 > frame.shape[1] or y2 > frame.shape[0]: - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(frame.shape[1], x2) - y2 = min(frame.shape[0], y2) - - if x2 <= x1 or y2 <= y1: - return None - - face_crop = frame[y1:y2, x1:x2] - - if face_crop.size == 0: - return None - - return face_crop - - except Exception: - return None - - -def detect_emotions_batch( - face_images: List[np.ndarray], - model: HSEmotionRecognizer, - batch_size: int = 32, -) -> List[Tuple[str, float, Dict[str, float]]]: - results = [] - total = len(face_images) - - for batch_start in range(0, total, batch_size): - batch_end = min(batch_start + batch_size, total) - batch = face_images[batch_start:batch_end] - - progress_pct = int((batch_end / total) * 100) - console.print(f"[cyan] Processing batch {batch_start}-{batch_end}/{total} ({progress_pct}%)[/cyan]") - - try: - batch_results = model.predict_multi_emotions(batch, logits=False) - - for emotion, scores in batch_results: - emotion_scores = { - EMOTION_LABELS[i]: float(scores[i]) - for i in range(len(EMOTION_LABELS)) - } - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - - results.append((dominant_emotion, confidence, emotion_scores)) - - except Exception: - for face_img in batch: - try: - emotion, scores = model.predict_emotions(face_img, logits=False) - emotion_scores = { - EMOTION_LABELS[i]: float(scores[i]) - for i in range(len(EMOTION_LABELS)) - } - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - results.append((dominant_emotion, confidence, emotion_scores)) - 
except Exception: - results.append(None) - - return results diff --git a/preprocessor/video/frame_processor.py b/preprocessor/video/frame_processor.py deleted file mode 100644 index 24f6b561d..000000000 --- a/preprocessor/video/frame_processor.py +++ /dev/null @@ -1,216 +0,0 @@ -import json -import logging -from pathlib import Path -import shutil -from typing import ( - Any, - Dict, - List, - Optional, - Tuple, -) - -import cv2 - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.episodes import EpisodeManager -from preprocessor.utils.console import console - -# pylint: disable=duplicate-code - - -class FrameProcessor(BaseProcessor): - def __init__(self, args: Dict[str, Any]): - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=15, - loglevel=logging.DEBUG, - ) - - self.frames_dir: Path = Path( - self._args.get("frames_dir", settings.frame_export.get_output_dir(self.series_name)), - ) - self.ramdisk_path: Path = Path(self._args.get("ramdisk_path", "/dev/shm")) - - episodes_info_json = self._args.get("episodes_info_json") - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name) - - self.sub_processors: List['FrameSubProcessor'] = [] - - def _validate_args(self, args: Dict[str, Any]) -> None: - pass - - def get_output_subdir(self) -> str: - return settings.output_subdirs.frames - - def add_sub_processor(self, processor: 'FrameSubProcessor') -> None: - self.sub_processors.append(processor) - - def _get_processing_items(self) -> List[ProcessingItem]: - return self._get_episode_processing_items_from_metadata( - "**/*_frame_metadata.json", - self.frames_dir, - self.episode_manager, - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - outputs = [] - for sub_processor in self.sub_processors: - outputs.extend(sub_processor.get_expected_outputs(item)) - return outputs - - 
def cleanup(self) -> None: - for sub_processor in self.sub_processors: - sub_processor.finalize() - console.print("[green]✓ All sub-processors finalized[/green]") - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - frames_episode_dir = metadata_file.parent - season = episode_info.season - episode = episode_info.relative_episode - - any_sub_processor_will_run = any( - sub_processor.should_run(item, missing_outputs) - for sub_processor in self.sub_processors - ) - - if not any_sub_processor_will_run: - for sub_processor in self.sub_processors: - console.print(f"[yellow]Skipping: {sub_processor.name} (output exists)[/yellow]") - return - - any_sub_processor_needs_ramdisk = any( - sub_processor.should_run(item, missing_outputs) and sub_processor.needs_ramdisk() - for sub_processor in self.sub_processors - ) - - if any_sub_processor_needs_ramdisk: - ramdisk_episode_dir = self.ramdisk_path / "frames" / f"S{season:02d}" / f"E{episode:02d}" - try: - self.__copy_frames_to_ramdisk(frames_episode_dir, ramdisk_episode_dir) - - for sub_processor in self.sub_processors: - if sub_processor.should_run(item, missing_outputs): - console.print(f"[cyan]Running: {sub_processor.name}[/cyan]") - sub_processor.process(item, ramdisk_episode_dir) - else: - console.print(f"[yellow]Skipping: {sub_processor.name} (output exists)[/yellow]") - - finally: - self.__cleanup_ramdisk(ramdisk_episode_dir) - else: - for sub_processor in self.sub_processors: - if sub_processor.should_run(item, missing_outputs): - console.print(f"[cyan]Running: {sub_processor.name}[/cyan]") - sub_processor.process(item, frames_episode_dir) - else: - console.print(f"[yellow]Skipping: {sub_processor.name} (output exists)[/yellow]") - - @staticmethod - def __copy_frames_to_ramdisk(source_dir: Path, dest_dir: Path) -> None: - dest_dir.mkdir(parents=True, exist_ok=True) - - frame_files = 
list(source_dir.glob("*frame_*.jpg")) - console.print(f"[cyan]Copying {len(frame_files)} frames to RAMdisk: {dest_dir}[/cyan]") - - for frame_file in frame_files: - shutil.copy2(frame_file, dest_dir / frame_file.name) - - console.print("[green]✓ Frames copied to RAMdisk[/green]") - - @staticmethod - def __cleanup_ramdisk(ramdisk_dir: Path) -> None: - if ramdisk_dir.exists(): - shutil.rmtree(ramdisk_dir) - console.print(f"[green]✓ RAMdisk cleaned: {ramdisk_dir}[/green]") - - -class FrameSubProcessor: - def __init__(self, name: str): - self.name = name - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - raise NotImplementedError - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def needs_ramdisk(self) -> bool: - return True - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - raise NotImplementedError - - @staticmethod - def _load_frame_files_from_ramdisk(ramdisk_frames_dir: Path) -> List[Path]: - return sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - def _load_frames_with_warning(self, ramdisk_frames_dir: Path) -> Optional[List[Path]]: - frame_files = self._load_frame_files_from_ramdisk(ramdisk_frames_dir) - if not frame_files: - console.print(f"[yellow]No frames found in {ramdisk_frames_dir}[/yellow]") - return None - return frame_files - - @staticmethod - def _load_detection_file( - detection_dir: Path, - ramdisk_frames_dir: Path, - glob_pattern: str, - ) -> Optional[Dict[str, Any]]: - detection_files = list(detection_dir.glob(glob_pattern)) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No detections JSON found in {detection_dir}[/yellow]") - return None - - if not 
ramdisk_frames_dir.exists(): - console.print(f"[yellow]No frames directory found: {ramdisk_frames_dir}[/yellow]") - return None - - with open(detection_file, 'r', encoding='utf-8') as f: - return json.load(f) - - @staticmethod - def _load_frame_requests_from_metadata(metadata_file: Path) -> Optional[List[Dict[str, Any]]]: - with open(metadata_file, "r", encoding="utf-8") as f: - metadata = json.load(f) - - frame_requests = metadata.get("frames", []) - if not frame_requests: - console.print(f"[yellow]No frames in metadata for {metadata_file}[/yellow]") - return None - - return frame_requests - - @staticmethod - def _draw_label_on_bbox( - img, - label: str, - x1: int, - y1: int, - color: Tuple[int, int, int], - ) -> None: - label_size, _ = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1) - label_y1 = max(y1 - 10, label_size[1]) - - cv2.rectangle(img, (x1, label_y1 - label_size[1] - 5), (x1 + label_size[0], label_y1), color, -1) - cv2.putText(img, label, (x1, label_y1 - 2), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 1) - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() diff --git a/preprocessor/video/frame_utils.py b/preprocessor/video/frame_utils.py deleted file mode 100644 index d1f2fe32f..000000000 --- a/preprocessor/video/frame_utils.py +++ /dev/null @@ -1,40 +0,0 @@ -from concurrent.futures import ThreadPoolExecutor -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from PIL import Image - - -def _load_single_frame(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: - if "frame_path" in request: - frame_path = frames_dir / request["frame_path"] - else: - frame_num = request["frame_number"] - frame_path = frames_dir / f"frame_{frame_num:06d}.jpg" - - if frame_path.exists(): - img = Image.open(frame_path) - if convert_rgb and img.mode != 'RGB': - img = img.convert('RGB') - return img - return Image.new('RGB', (1, 1)) - - -def load_frames_from_requests( - frames_dir: 
Path, - frame_requests: List[Dict[str, Any]], - convert_rgb: bool = False, - num_workers: int = 4, -) -> List[Image.Image]: - with ThreadPoolExecutor(max_workers=num_workers) as executor: - images = list( - executor.map( - lambda req: _load_single_frame(frames_dir, req, convert_rgb), - frame_requests, - ), - ) - return images diff --git a/preprocessor/video/subprocessors/__init__.py b/preprocessor/video/subprocessors/__init__.py deleted file mode 100644 index baa834a1c..000000000 --- a/preprocessor/video/subprocessors/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -from .character_detection_subprocessor import CharacterDetectionSubProcessor -from .character_detection_visualization_subprocessor import CharacterDetectionVisualizationSubProcessor -from .emotion_detection_subprocessor import EmotionDetectionSubProcessor -from .face_clustering_subprocessor import FaceClusteringSubProcessor -from .image_hash_subprocessor import ImageHashSubProcessor -from .object_detection_subprocessor import ObjectDetectionSubProcessor -from .object_detection_visualization_subprocessor import ObjectDetectionVisualizationSubProcessor -from .video_embedding_subprocessor import VideoEmbeddingSubProcessor - -__all__ = [ - "ImageHashSubProcessor", - "VideoEmbeddingSubProcessor", - "CharacterDetectionSubProcessor", - "ObjectDetectionSubProcessor", - "ObjectDetectionVisualizationSubProcessor", - "CharacterDetectionVisualizationSubProcessor", - "EmotionDetectionSubProcessor", - "FaceClusteringSubProcessor", -] diff --git a/preprocessor/video/subprocessors/character_detection_subprocessor.py b/preprocessor/video/subprocessors/character_detection_subprocessor.py deleted file mode 100644 index fdd508f47..000000000 --- a/preprocessor/video/subprocessors/character_detection_subprocessor.py +++ /dev/null @@ -1,99 +0,0 @@ -import logging -from pathlib import Path -from typing import ( - Dict, - List, - Optional, -) - -from insightface.app import FaceAnalysis -import numpy as np - -from 
preprocessor.characters.face_detection import ( - init_face_detection, - load_character_references, -) -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.console import console -from preprocessor.utils.detection_io import ( - process_frames_for_detection, - save_character_detections, -) -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.video.frame_processor import FrameSubProcessor - - -class CharacterDetectionSubProcessor(FrameSubProcessor): - def __init__(self, characters_dir: Path, use_gpu: bool, threshold: float): - super().__init__("Character Detection") - self.characters_dir = characters_dir - self.use_gpu = use_gpu - self.threshold = threshold - self.face_app: Optional[FaceAnalysis] = None - self.character_vectors: Dict[str, np.ndarray] = {} - self.logger = ErrorHandlingLogger("CharacterDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.face_app is None: - console.print("[cyan]Initializing face detection...[/cyan]") - self.face_app = init_face_detection() - self.character_vectors = load_character_references(self.characters_dir, self.face_app) - console.print("[green]✓ Face detection initialized[/green]") - - def cleanup(self) -> None: - self.face_app = None - self.character_vectors = {} - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - series_name = item.metadata["series_name"] - path_manager = PathManager(series_name) - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - 
suffix="character_detections", - ) - detections_output = episode_dir / detections_filename - return [OutputSpec(path=detections_output, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - if not self.characters_dir.exists(): - console.print(f"[yellow]Characters directory not found: {self.characters_dir}, skipping[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - if not self.character_vectors: - console.print("[yellow]No character references loaded, skipping detection[/yellow]") - return - - episode_info = item.metadata["episode_info"] - - frame_files = sorted([ - f for f in ramdisk_frames_dir.glob("*.jpg") - if f.is_file() and "frame_" in f.name - ]) - - console.print(f"[cyan]Detecting characters in {len(frame_files)} frames[/cyan]") - - results = process_frames_for_detection( - frame_files, - self.face_app, - self.character_vectors, - self.threshold, - ) - save_character_detections(episode_info, results) diff --git a/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py b/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py deleted file mode 100644 index 3f0d0046a..000000000 --- a/preprocessor/video/subprocessors/character_detection_visualization_subprocessor.py +++ /dev/null @@ -1,138 +0,0 @@ -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Set, - Tuple, -) - -import cv2 -import numpy as np - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import 
ErrorHandlingLogger -from preprocessor.video.frame_processor import FrameSubProcessor - - -class CharacterDetectionVisualizationSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Character Detection Visualization") - self.logger = ErrorHandlingLogger("CharacterDetectionVisualizationSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - pass - - def cleanup(self) -> None: - pass - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) - marker_file = episode_dir / ".visualization_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - detection_files = list(detection_dir.glob("*_character_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No character detections found for {episode_info.episode_code()}, skipping visualization[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - - detection_data = self._load_detection_file( - detection_dir, - ramdisk_frames_dir, - 
"*_character_detections.json", - ) - if detection_data is None: - return - - frames_with_detections = [f for f in detection_data.get("detections", []) if f.get('characters')] - if not frames_with_detections: - console.print(f"[yellow]No frames with character detections for {episode_info.episode_code()}[/yellow]") - return - - output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_visualizations) - output_dir.mkdir(parents=True, exist_ok=True) - - all_character_names = set() - for frame_data in frames_with_detections: - for char in frame_data.get('characters', []): - all_character_names.add(char['name']) - colors = self.__generate_character_colors(all_character_names) - - console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames with characters for {episode_info.episode_code()}[/cyan]") - - for frame_data in frames_with_detections: - frame_name = frame_data.get('frame_file') or frame_data.get('frame') - if not frame_name: - continue - - output_path = output_dir / frame_name - if output_path.exists(): - continue - - frame_path = ramdisk_frames_dir / frame_name - if not frame_path.exists(): - continue - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - self.__draw_characters_on_frame(img, frame_data['characters'], colors) - cv2.imwrite(str(output_path), img) - - marker_file = output_dir / ".visualization_complete" - marker_file.write_text(f"completed: {len(frames_with_detections)} frames") - console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") - - @staticmethod - def __draw_characters_on_frame(img, characters: List[Dict[str, Any]], colors: Dict[str, Tuple[int, int, int]]) -> None: - for character in characters: - name = character['name'] - confidence = character['confidence'] - bbox = character['bbox'] - - x1, y1 = bbox['x1'], bbox['y1'] - x2, y2 = bbox['x2'], bbox['y2'] - color = colors.get(name, (0, 255, 0)) - - 
cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - - label = f"{name} {confidence:.2f}" - if "emotion" in character: - emotion_label = character["emotion"]["label"] - emotion_conf = character["emotion"]["confidence"] - label += f" | {emotion_label} {emotion_conf:.2f}" - - FrameSubProcessor._draw_label_on_bbox(img, label, x1, y1, color) - - @staticmethod - def __generate_character_colors(character_names: Set[str]) -> Dict[str, Tuple[int, int, int]]: - np.random.seed(42) - colors = {} - sorted_names = sorted(character_names) - for _, name in enumerate(sorted_names): - colors[name] = tuple(int(x) for x in np.random.randint(50, 255, 3)) - return colors diff --git a/preprocessor/video/subprocessors/emotion_detection_subprocessor.py b/preprocessor/video/subprocessors/emotion_detection_subprocessor.py deleted file mode 100644 index 3076a7042..000000000 --- a/preprocessor/video/subprocessors/emotion_detection_subprocessor.py +++ /dev/null @@ -1,167 +0,0 @@ -import json -import logging -from pathlib import Path -from typing import ( - List, - Optional, -) - -import cv2 -from hsemotion_onnx.facial_emotions import HSEmotionRecognizer - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.video.emotion_utils import ( - crop_face_from_frame, - detect_emotions_batch, - init_emotion_model, -) -from preprocessor.video.frame_processor import FrameSubProcessor - - -class EmotionDetectionSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Emotion Detection") - self.model: Optional[HSEmotionRecognizer] = None - self.logger = ErrorHandlingLogger("EmotionDetectionSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - 
if self.model is None: - self.model = init_emotion_model() - - def cleanup(self) -> None: - self.model = None - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - marker_file = episode_dir / ".emotion_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - series_name = item.metadata["series_name"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - - path_manager = PathManager(series_name) - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_file = episode_dir / detections_filename - - if not detections_file.exists(): - console.print( - f"[yellow]No character detections found for emotion analysis: {detections_file}[/yellow]", - ) - return False - - marker_file = episode_dir / ".emotion_complete" - return any(output.path == marker_file for output in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: # pylint: disable=too-many-locals,too-many-statements - self.initialize() - - episode_info = item.metadata["episode_info"] - series_name = item.metadata["series_name"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.character_detections) - - path_manager = PathManager(series_name) - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="character_detections", - ) - detections_file = episode_dir / 
detections_filename - - if not detections_file.exists(): - console.print(f"[yellow]No detections file: {detections_file}[/yellow]") - return - - with open(detections_file, "r", encoding="utf-8") as f: - detections_data = json.load(f) - - detections = detections_data.get("detections", []) - - total_characters = sum(len(d.get("characters", [])) for d in detections) - console.print(f"[cyan]Collecting {total_characters} faces for batch emotion analysis[/cyan]") - - face_crops = [] - face_metadata = [] - - for detection_idx, detection in enumerate(detections): - frame_file = detection.get("frame_file") - if not frame_file: - continue - - frame_path = ramdisk_frames_dir / frame_file - - if not frame_path.exists(): - continue - - frame = cv2.imread(str(frame_path)) - if frame is None: - continue - - characters = detection.get("characters", []) - - for char_idx, char in enumerate(characters): - bbox = char.get("bbox") - if not bbox: - continue - - face_crop = crop_face_from_frame(frame, bbox) - if face_crop is None: - continue - - face_crops.append(face_crop) - face_metadata.append({ - "detection_idx": detection_idx, - "char_idx": char_idx, - }) - - if not face_crops: - console.print("[yellow]No valid face crops found[/yellow]") - return - - console.print(f"[cyan]Processing {len(face_crops)} faces with HSEmotion model[/cyan]") - - emotion_results = detect_emotions_batch(face_crops, self.model) - - processed = 0 - for result, metadata in zip(emotion_results, face_metadata): - if result is None: - continue - - dominant_emotion, confidence, emotion_scores = result - detection_idx = metadata["detection_idx"] - char_idx = metadata["char_idx"] - - char = detections[detection_idx]["characters"][char_idx] - char["emotion"] = { - "label": dominant_emotion, - "confidence": confidence, - "scores": emotion_scores, - } - processed += 1 - - atomic_write_json(detections_file, detections_data, indent=2, ensure_ascii=False) - - marker_file = detections_file.parent / ".emotion_complete" - 
marker_file.write_text("completed", encoding="utf-8") - - console.print( - f"[green]✓ Emotion analysis complete: {processed}/{total_characters} characters processed[/green]", - ) diff --git a/preprocessor/video/subprocessors/face_clustering_subprocessor.py b/preprocessor/video/subprocessors/face_clustering_subprocessor.py deleted file mode 100644 index 8b3b61299..000000000 --- a/preprocessor/video/subprocessors/face_clustering_subprocessor.py +++ /dev/null @@ -1,277 +0,0 @@ -from collections import defaultdict -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from cuml.cluster import HDBSCAN as cuHDBSCAN -import cupy as cp -import cv2 -from insightface.app import FaceAnalysis -import numpy as np -import torch - -from preprocessor.characters.face_detection import init_face_detection -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor - - -class FaceClusteringSubProcessor(FrameSubProcessor): - def __init__( - self, - min_cluster_size: int, - min_samples: int, - save_noise: bool, - save_full_frames: bool, - ): - super().__init__("Face Clustering") - self.min_cluster_size = min_cluster_size - self.min_samples = min_samples - self.save_noise = save_noise - self.save_full_frames = save_full_frames - self.face_app: Optional[FaceAnalysis] = None - self.logger = ErrorHandlingLogger("FaceClusteringSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.face_app is None: - console.print("[cyan]Initializing face detection for 
clustering...[/cyan]") - self.face_app = init_face_detection() - console.print("[green]✓ Face detection initialized[/green]") - - def cleanup(self) -> None: - self.face_app = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) - series_name = item.metadata["series_name"] - path_manager = PathManager(series_name) - metadata_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="_face_clusters", - ) - metadata_output = episode_dir / metadata_filename - return [OutputSpec(path=metadata_output, required=True)] - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - episode_info = item.metadata["episode_info"] - - frame_files = self._load_frames_with_warning(ramdisk_frames_dir) - if frame_files is None: - return - - console.print(f"[cyan]Extracting faces and vectors from {len(frame_files)} frames[/cyan]") - - face_data = self.__extract_faces_with_vectors(frame_files) - - if len(face_data) == 0: - console.print("[yellow]No faces detected, skipping clustering[/yellow]") - return - - console.print(f"[cyan]Clustering {len(face_data)} faces[/cyan]") - labels = self.__cluster_faces(face_data) - - console.print("[cyan]Saving clusters[/cyan]") - series_name = item.metadata["series_name"] - self.__save_clusters(episode_info, face_data, labels, frame_files, series_name) - - def __extract_faces_with_vectors(self, frame_files: List[Path]) -> List[Dict[str, Any]]: - face_data = [] - - for idx, frame_path in enumerate(frame_files): - if idx % 50 == 0: - console.print(f"[cyan]Processing frame {idx}/{len(frame_files)}[/cyan]") - - img = 
cv2.imread(str(frame_path)) - if img is None: - continue - - faces = self.face_app.get(img) - - for face_idx, face in enumerate(faces): - bbox = face.bbox.astype(int) - x1, y1, x2, y2 = bbox - - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(img.shape[1], x2) - y2 = min(img.shape[0], y2) - - face_img = img[y1:y2, x1:x2] - - if face_img.size == 0: - continue - - face_data.append({ - 'vector': face.normed_embedding, - 'frame_path': frame_path, - 'bbox': bbox, - 'face_img': face_img, - 'face_idx': face_idx, - }) - - console.print(f"[green]✓ Found {len(face_data)} faces in {len(frame_files)} frames[/green]") - return face_data - - def __cluster_faces(self, face_data: List[Dict[str, Any]]) -> np.ndarray: - vectors = np.array([fd['vector'] for fd in face_data]) - - console.print(f"[cyan]Clustering with GPU HDBSCAN (min_cluster_size={self.min_cluster_size}, min_samples={self.min_samples})[/cyan]") - vectors_gpu = cp.asarray(vectors) - - clusterer = cuHDBSCAN( - min_cluster_size=self.min_cluster_size, - min_samples=self.min_samples, - metric='euclidean', - cluster_selection_method='eom', - ) - labels = clusterer.fit_predict(vectors_gpu) - labels = cp.asnumpy(labels) - - n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - n_noise = list(labels).count(-1) - - console.print(f"[green]✓ Found {n_clusters} clusters[/green]") - console.print(f"[green]✓ {n_noise} faces marked as noise[/green]") - - return labels - - def __save_clusters( # pylint: disable=too-many-locals - self, - episode_info, - face_data: List[Dict[str, Any]], - labels: np.ndarray, - all_frame_files: List[Path], - series_name: str, - ) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) - episode_dir.mkdir(parents=True, exist_ok=True) - - clusters = defaultdict(list) - for face_info, label in zip(face_data, labels): - clusters[label].append(face_info) - - cluster_stats = [] - - for cluster_id, faces in 
sorted(clusters.items()): - if cluster_id == -1: - if not self.save_noise: - continue - cluster_dir = episode_dir / "noise" - else: - cluster_dir = episode_dir / f"cluster_{cluster_id}" - - faces_dir = cluster_dir / "faces" - faces_dir.mkdir(parents=True, exist_ok=True) - - if self.save_full_frames: - frames_dir = cluster_dir / "frames" - frames_dir.mkdir(parents=True, exist_ok=True) - - saved_frames = set() - cluster_frames = [] - - for face_info in faces: - frame_name = face_info['frame_path'].stem - face_idx = face_info['face_idx'] - face_output_path = faces_dir / f"{frame_name}_face{face_idx}.jpg" - - if face_info['face_img'].size > 0: - cv2.imwrite(str(face_output_path), face_info['face_img']) - - if self.save_full_frames and frame_name not in saved_frames: - frame_output_path = frames_dir / f"{frame_name}.jpg" - img = cv2.imread(str(face_info['frame_path'])) - if img is not None: - cv2.imwrite(str(frame_output_path), img) - saved_frames.add(frame_name) - cluster_frames.append(f"{frame_name}.jpg") - - cluster_label = "noise" if cluster_id == -1 else f"cluster_{cluster_id}" - console.print(f"[green]✓ Saved {len(faces)} faces to {cluster_label}[/green]") - - cluster_stats.append({ - "cluster_id": cluster_label, - "face_count": len(faces), - "frame_count": len(saved_frames), - "frames": sorted(cluster_frames), - "character_name": None, - }) - - self.__save_metadata(episode_info, face_data, labels, cluster_stats, all_frame_files, series_name) - - def __save_metadata( - self, - episode_info, - face_data: List[Dict[str, Any]], - labels: np.ndarray, - cluster_stats: List[Dict[str, Any]], - all_frame_files: List[Path], - series_name: str, - ) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.face_clusters) - - n_clusters = len(set(labels)) - (1 if -1 in labels else 0) - n_noise = list(labels).count(-1) - frames_with_faces = len(set(fd['frame_path'] for fd in face_data)) - - metadata = 
create_processing_metadata( - episode_info=episode_info, - processing_params={ - "min_cluster_size": self.min_cluster_size, - "min_samples": self.min_samples, - "metric": "euclidean", - "algorithm": "hdbscan", - "model": settings.face_recognition.model_name, - }, - statistics={ - "total_faces_detected": len(face_data), - "total_clusters": n_clusters, - "noise_faces": n_noise, - "frames_processed": len(all_frame_files), - "frames_with_faces": frames_with_faces, - }, - results_key="clusters", - results_data=cluster_stats, - ) - path_manager = PathManager(series_name) - metadata_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="_face_clusters", - ) - metadata_output = episode_dir / metadata_filename - atomic_write_json(metadata_output, metadata, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved cluster metadata to: {metadata_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/video/subprocessors/image_hash_subprocessor.py b/preprocessor/video/subprocessors/image_hash_subprocessor.py deleted file mode 100644 index a0f4ae577..000000000 --- a/preprocessor/video/subprocessors/image_hash_subprocessor.py +++ /dev/null @@ -1,82 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - List, - Optional, -) - -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.batch_processing_utils import compute_hashes_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.hash_save_utils import save_image_hashes_to_json -from preprocessor.utils.image_hasher import PerceptualHasher -from preprocessor.video.frame_processor 
import FrameSubProcessor - -# pylint: disable=duplicate-code - - -class ImageHashSubProcessor(FrameSubProcessor): - def __init__(self, device: str, batch_size: int): - super().__init__("Image Hashing") - self.device = device - self.batch_size = batch_size - self.hasher: Optional[PerceptualHasher] = None - self.logger = ErrorHandlingLogger("ImageHashSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.hasher is None: - self.hasher = PerceptualHasher(device=self.device, hash_size=8) - - def cleanup(self) -> None: - self.hasher = None - self.__cleanup_memory() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.image_hashes) - series_name = item.metadata["series_name"] - path_manager = PathManager(series_name) - hash_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="image_hashes", - ) - hash_output = episode_dir / hash_filename - return [OutputSpec(path=hash_output, required=True)] - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - frame_requests = self._load_frame_requests_from_metadata(metadata_file) - if frame_requests is None: - return - - hash_results = compute_hashes_in_batches(ramdisk_frames_dir, frame_requests, self.hasher, self.batch_size) - series_name = item.metadata["series_name"] - - output_path = save_image_hashes_to_json( - episode_info=episode_info, - hash_results=hash_results, - series_name=series_name, - device=self.device, - batch_size=self.batch_size, - ) - console.print(f"[green]✓ Saved hashes to: {output_path}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git 
a/preprocessor/video/subprocessors/object_detection_subprocessor.py b/preprocessor/video/subprocessors/object_detection_subprocessor.py deleted file mode 100644 index d31a87793..000000000 --- a/preprocessor/video/subprocessors/object_detection_subprocessor.py +++ /dev/null @@ -1,196 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from PIL import Image -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.utils.batch_processor import BatchProcessor -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor - - -class ObjectDetectionSubProcessor(FrameSubProcessor): - def __init__(self, model_name: str = "ustc-community/dfine-xlarge-obj2coco", conf_threshold: float = 0.25): - super().__init__("Object Detection") - self.model_name = model_name - self.conf_threshold = conf_threshold - self.model: Optional[Any] = None - self.image_processor: Optional[Any] = None - self.logger = ErrorHandlingLogger("ObjectDetectionSubProcessor", logging.DEBUG, 15) - self.batch_processor = BatchProcessor(8) - - def initialize(self) -> None: - if self.model is None: - if not torch.cuda.is_available(): - raise RuntimeError("CUDA is not available. 
Object detection requires GPU.") - - from transformers import ( # pylint: disable=import-outside-toplevel - AutoImageProcessor, - DFineForObjectDetection, - ) - - console.print(f"[cyan]Loading D-FINE model: {self.model_name}[/cyan]") - self.image_processor = AutoImageProcessor.from_pretrained(self.model_name) - self.model = DFineForObjectDetection.from_pretrained(self.model_name) - self.model.to("cuda") - console.print("[green]✓ D-FINE model loaded on GPU[/green]") - - def cleanup(self) -> None: - self.model = None - self.image_processor = None - self.__cleanup_memory() - - def finalize(self) -> None: - if hasattr(self, 'logger'): - self.logger.finalize() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - series_name = item.metadata["series_name"] - path_manager = PathManager(series_name) - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="_object_detections", - ) - detections_output = episode_dir / detections_filename - return [OutputSpec(path=detections_output, required=True)] - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - episode_info = item.metadata["episode_info"] - - frame_files = self._load_frames_with_warning(ramdisk_frames_dir) - if frame_files is None: - return - - console.print(f"[cyan]Detecting objects in {len(frame_files)} frames[/cyan]") - - def _process_batch(batch_paths: List[Path]) -> List[Dict[str, Any]]: - batch_images = [Image.open(fp) for fp in batch_paths] - target_sizes = [(img.height, img.width) for img in batch_images] - - inputs = self.image_processor(images=batch_images, return_tensors="pt") - inputs = {k: v.to("cuda") for k, v in inputs.items()} - - with torch.no_grad(): - outputs = self.model(**inputs) - - results = 
self.image_processor.post_process_object_detection( - outputs, - target_sizes=target_sizes, - threshold=self.conf_threshold, - ) - - batch_results = [] - for frame_path, result in zip(batch_paths, results): - frame_result = { - "frame_name": frame_path.name, - "detections": [], - } - - for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]): - score_value = score.item() - label = label_id.item() - box_coords = [float(i) for i in box.tolist()] - - detection = { - "class_id": label, - "class_name": self.model.config.id2label[label], - "confidence": score_value, - "bbox": { - "x1": box_coords[0], - "y1": box_coords[1], - "x2": box_coords[2], - "y2": box_coords[3], - }, - } - frame_result["detections"].append(detection) - - frame_result["detection_count"] = len(frame_result["detections"]) - batch_results.append(frame_result) - - for img in batch_images: - img.close() - return batch_results - - all_results = self.batch_processor.process(frame_files, _process_batch) - - detections_data = { - "episode_code": episode_info.episode_code(), - "model": self.model_name, - "confidence_threshold": self.conf_threshold, - "frames": all_results, - } - - total_detections = sum(f['detection_count'] for f in detections_data['frames']) - frames_with_detections = len([f for f in detections_data['frames'] if f['detection_count'] > 0]) - - console.print(f"[green]✓ Total detections: {total_detections}[/green]") - console.print(f"[green]✓ Frames with detections: {frames_with_detections}/{len(frame_files)}[/green]") - - class_counts = {} - for frame in detections_data["frames"]: - for det in frame["detections"]: - class_name = det["class_name"] - class_counts[class_name] = class_counts.get(class_name, 0) + 1 - - if class_counts: - top_classes = sorted(class_counts.items(), key=lambda x: x[1], reverse=True)[:5] - console.print(f"[cyan]Top 5 classes: {', '.join(f'{cls}:{cnt}' for cls, cnt in top_classes)}[/cyan]") - - series_name = item.metadata["series_name"] - 
self.__save_detections(episode_info, detections_data, series_name) - - def __save_detections(self, episode_info, detections_data: Dict[str, Any], series_name: str) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - episode_dir.mkdir(parents=True, exist_ok=True) - - output_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "model": self.model_name, - "confidence_threshold": self.conf_threshold, - }, - statistics={ - "total_frames": len(detections_data["frames"]), - "total_detections": sum(f['detection_count'] for f in detections_data['frames']), - "frames_with_detections": len([f for f in detections_data['frames'] if f['detection_count'] > 0]), - }, - results_key="detections", - results_data=detections_data["frames"], - ) - path_manager = PathManager(series_name) - detections_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="_object_detections", - ) - detections_output = episode_dir / detections_filename - atomic_write_json(detections_output, output_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved object detections to: {detections_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() diff --git a/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py b/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py deleted file mode 100644 index 60c1b0ebd..000000000 --- a/preprocessor/video/subprocessors/object_detection_visualization_subprocessor.py +++ /dev/null @@ -1,124 +0,0 @@ -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Tuple, -) - -import cv2 -import numpy as np - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) 
-from preprocessor.core.path_manager import PathManager -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.video.frame_processor import FrameSubProcessor - - -class ObjectDetectionVisualizationSubProcessor(FrameSubProcessor): - def __init__(self): - super().__init__("Object Detection Visualization") - self.logger = ErrorHandlingLogger("ObjectDetectionVisualizationSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - pass - - def cleanup(self) -> None: - pass - - def needs_ramdisk(self) -> bool: - return False - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_visualizations) - marker_file = episode_dir / ".visualization_complete" - return [OutputSpec(path=marker_file, required=True)] - - def should_run(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> bool: - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - detection_files = list(detection_dir.glob("*_object_detections.json")) - detection_file = detection_files[0] if detection_files else None - - if not detection_file or not detection_file.exists(): - console.print(f"[yellow]No object detections found for {episode_info.episode_code()}, skipping visualization[/yellow]") - return False - - expected = self.get_expected_outputs(item) - return any(str(exp.path) in str(miss.path) for exp in expected for miss in missing_outputs) - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - episode_info = item.metadata["episode_info"] - detection_dir = PathManager(episode_info.series_name or 
"unknown").get_episode_dir(episode_info,settings.output_subdirs.object_detections) - - detection_data = self._load_detection_file( - detection_dir, - ramdisk_frames_dir, - "*_object_detections.json", - ) - if detection_data is None: - return - - frames_with_detections = [f for f in detection_data.get("detections", []) if f['detection_count'] > 0] - if not frames_with_detections: - console.print(f"[yellow]No frames with detections for {episode_info.episode_code()}[/yellow]") - return - - output_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.object_visualizations) - output_dir.mkdir(parents=True, exist_ok=True) - colors = self.__generate_colors() - conf_threshold = detection_data.get("processing_params", {}).get("confidence_threshold", 0.25) - - console.print(f"[cyan]Visualizing {len(frames_with_detections)} frames for {episode_info.episode_code()}[/cyan]") - - for frame_data in frames_with_detections: - output_path = output_dir / frame_data['frame_name'] - if output_path.exists(): - continue - - frame_path = ramdisk_frames_dir / frame_data['frame_name'] - if not frame_path.exists(): - continue - - img = cv2.imread(str(frame_path)) - if img is None: - continue - - self.__draw_detections_on_frame(img, frame_data['detections'], colors, conf_threshold) - cv2.imwrite(str(output_path), img) - - marker_file = output_dir / ".visualization_complete" - marker_file.write_text(f"completed: {len(frames_with_detections)} frames") - console.print(f"[green]✓ Visualized {len(frames_with_detections)} frames saved to: {output_dir}[/green]") - - @staticmethod - def __draw_detections_on_frame(img, detections: List[Dict[str, Any]], colors: Dict[int, Tuple[int, int, int]], conf_threshold: float) -> None: - for detection in detections: - if detection['confidence'] < conf_threshold: - continue - - class_id = detection['class_id'] - bbox = detection['bbox'] - x1, y1 = int(bbox['x1']), int(bbox['y1']) - x2, y2 = int(bbox['x2']), 
int(bbox['y2']) - color = colors.get(class_id, (0, 255, 0)) - - cv2.rectangle(img, (x1, y1), (x2, y2), color, 2) - - label = f"{detection['class_name']} {detection['confidence']:.2f}" - FrameSubProcessor._draw_label_on_bbox(img, label, x1, y1, color) - - @staticmethod - def __generate_colors(num_colors: int = 80) -> Dict[int, Tuple[int, int, int]]: - np.random.seed(42) - colors = {} - for i in range(num_colors): - colors[i] = tuple(int(x) for x in np.random.randint(50, 255, 3)) - return colors diff --git a/preprocessor/video/subprocessors/video_embedding_subprocessor.py b/preprocessor/video/subprocessors/video_embedding_subprocessor.py deleted file mode 100644 index b6463a732..000000000 --- a/preprocessor/video/subprocessors/video_embedding_subprocessor.py +++ /dev/null @@ -1,144 +0,0 @@ -import gc -import logging -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -import torch - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - OutputSpec, - ProcessingItem, -) -from preprocessor.core.path_manager import PathManager -from preprocessor.embeddings.gpu_batch_processor import GPUBatchProcessor -from preprocessor.utils.batch_processing_utils import compute_embeddings_in_batches -from preprocessor.utils.console import console -from preprocessor.utils.error_handling_logger import ErrorHandlingLogger -from preprocessor.utils.file_utils import atomic_write_json -from preprocessor.utils.image_hash_utils import load_image_hashes_for_episode -from preprocessor.utils.metadata_utils import create_processing_metadata -from preprocessor.video.frame_processor import FrameSubProcessor - -# pylint: disable=duplicate-code - - -class VideoEmbeddingSubProcessor(FrameSubProcessor): - def __init__(self, device: str, batch_size: int, model_name: str, model_revision: str): - super().__init__("Video Embeddings") - self.device = device - self.batch_size = batch_size - self.model_name = model_name - 
self.model_revision = model_revision - self.model = None - self.gpu_processor: Optional[GPUBatchProcessor] = None - self.logger = ErrorHandlingLogger("VideoEmbeddingSubProcessor", logging.DEBUG, 15) - - def initialize(self) -> None: - if self.model is None: - from preprocessor.embeddings.qwen3_vl_embedding import Qwen3VLEmbedder # pylint: disable=import-outside-toplevel - console.print(f"[cyan]Loading embedding model: {self.model_name}[/cyan]") - self.model = Qwen3VLEmbedder( - model_name_or_path=self.model_name, - torch_dtype=torch.bfloat16, - ) - self.gpu_processor = GPUBatchProcessor( - self.model, - self.batch_size, - self.logger, - self.device, - progress_sub_batch_size=settings.embedding.progress_sub_batch_size, - ) - console.print("[green]✓ Qwen3-VL-Embedding model loaded[/green]") - - def cleanup(self) -> None: - self.model = None - self.gpu_processor = None - self.__cleanup_memory() - - def get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - episode_info = item.metadata["episode_info"] - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) - series_name = item.metadata["series_name"] - path_manager = PathManager(series_name) - video_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="embeddings_video", - ) - video_output = episode_dir / video_filename - return [OutputSpec(path=video_output, required=True)] - - def process(self, item: ProcessingItem, ramdisk_frames_dir: Path) -> None: - self.initialize() - - metadata_file = item.input_path - episode_info = item.metadata["episode_info"] - - frame_requests = self._load_frame_requests_from_metadata(metadata_file) - if frame_requests is None: - return - - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) - checkpoint_file = episode_dir / "embeddings_video_checkpoint.json" - - series_name = 
item.metadata.get("series_name", "unknown") - image_hashes = load_image_hashes_for_episode( - {"season": episode_info.season, "episode_number": episode_info.relative_episode}, - series_name, - self.logger, - ) - video_embeddings = compute_embeddings_in_batches( - ramdisk_frames_dir, - frame_requests, - self.gpu_processor, - self.batch_size, - image_hashes, - checkpoint_file=checkpoint_file, - checkpoint_interval=20, - prefetch_count=settings.embedding.prefetch_chunks, - ) - series_name = item.metadata["series_name"] - self.__save_embeddings(episode_info, video_embeddings, series_name) - - def __save_embeddings(self, episode_info, video_embeddings: List[Dict[str, Any]], series_name: str) -> None: - episode_dir = PathManager(episode_info.series_name or "unknown").get_episode_dir(episode_info,settings.output_subdirs.embeddings) - episode_dir.mkdir(parents=True, exist_ok=True) - - video_data = create_processing_metadata( - episode_info=episode_info, - processing_params={ - "model_name": self.model_name, - "model_revision": self.model_revision, - "batch_size": self.batch_size, - "device": self.device, - }, - statistics={ - "total_embeddings": len(video_embeddings), - "embedding_dimension": len(video_embeddings[0]["embedding"]) if video_embeddings else 0, - "frames_with_hash": sum(1 for e in video_embeddings if "perceptual_hash" in e), - }, - results_key="video_embeddings", - results_data=video_embeddings, - ) - path_manager = PathManager(series_name) - video_filename = path_manager.build_filename( - episode_info, - extension="json", - suffix="embeddings_video", - ) - video_output = episode_dir / video_filename - atomic_write_json(video_output, video_data, indent=2, ensure_ascii=False) - - console.print(f"[green]✓ Saved embeddings to: {video_output}[/green]") - - @staticmethod - def __cleanup_memory() -> None: - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() From 3afc6c0b9696ad396b68f01cef9678f816aa1e62 Mon Sep 17 00:00:00 2001 From: dam2452 
<81230036+dam2452@users.noreply.github.com> Date: Tue, 10 Feb 2026 21:51:31 +0100 Subject: [PATCH 07/89] Lower scene min length and raise beam size Reduce default scene detection minimum length from 15s to 10s and increase Whisper transcription beam_size from 5 to 10. Updates applied to config defaults, step configs, pipeline factory, and the runtime wrappers (TransNetWrapper and Whisper) so defaults and implementations stay consistent; this allows detection of shorter scenes and uses a larger beam for potentially improved transcription quality at the cost of extra compute. --- preprocessor/app/config_defaults.py | 2 +- preprocessor/app/pipeline_factory.py | 4 ++-- preprocessor/config/step_configs.py | 4 ++-- preprocessor/lib/media/scene_detection.py | 2 +- preprocessor/lib/transcription/whisper.py | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/preprocessor/app/config_defaults.py b/preprocessor/app/config_defaults.py index 288e97028..a8bb016ce 100644 --- a/preprocessor/app/config_defaults.py +++ b/preprocessor/app/config_defaults.py @@ -38,7 +38,7 @@ def get_default_step_configs(series_name: str) -> Dict[str, object]: ), 'separate_sounds': SoundSeparationConfig(), 'analyze_text': TextAnalysisConfig(language='pl'), - 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=15), + 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=10), 'export_frames': FrameExportConfig(frames_per_scene=3), 'text_embeddings': TextEmbeddingConfig( model_name='Qwen/Qwen2-VL-8B-Instruct', diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 87352468c..77343f6a4 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -102,7 +102,7 @@ description="Wykrywa zmiany scen używając TransNetV2", produces=["scene_detections/{season}/{episode}.json"], needs=[transcoded_videos], - config=SceneDetectionConfig(threshold=0.5, min_scene_len=15), + 
config=SceneDetectionConfig(threshold=0.5, min_scene_len=10), ) exported_frames = StepBuilder( @@ -126,7 +126,7 @@ model="large-v3-turbo", language="pl", device="cuda", - beam_size=5, + beam_size=10, temperature=0.0, ), ) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 3a496dd08..7be90dd68 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -33,7 +33,7 @@ def maxrate_must_be_greater_than_bitrate(self) -> Self: class SceneDetectionConfig(BaseModel): threshold: float = Field(default=0.5, ge=0, le=1) - min_scene_len: int = Field(default=15, ge=1) + min_scene_len: int = Field(default=10, ge=1) class FrameExportConfig(BaseModel): resolution: Resolution = Field(default=Resolution.R720P) @@ -52,7 +52,7 @@ class WhisperTranscriptionConfig(BaseModel): model: str = 'large-v3-turbo' language: str = 'pl' device: str = 'cuda' - beam_size: int = Field(default=5, ge=1) + beam_size: int = Field(default=10, ge=1) temperature: float = Field(default=0.0, ge=0.0, le=1.0) class TextAnalysisConfig(BaseModel): diff --git a/preprocessor/lib/media/scene_detection.py b/preprocessor/lib/media/scene_detection.py index 8c4be9df7..2fd5777de 100644 --- a/preprocessor/lib/media/scene_detection.py +++ b/preprocessor/lib/media/scene_detection.py @@ -27,7 +27,7 @@ def detect_scenes( self, video_path: Path, threshold: float=0.5, - min_scene_len: int=15, + min_scene_len: int=10, ) -> List[Dict[str, Any]]: if self.model is None: raise RuntimeError('Model not loaded. 
Call load_model() first.') diff --git a/preprocessor/lib/transcription/whisper.py b/preprocessor/lib/transcription/whisper.py index 81ebd6e14..e0889d7f5 100644 --- a/preprocessor/lib/transcription/whisper.py +++ b/preprocessor/lib/transcription/whisper.py @@ -14,7 +14,7 @@ class Whisper: - def __init__(self, model: str='large-v3-turbo', language: str='pl', device: str='cuda', beam_size: int=5, temperature: float=0.0) -> None: + def __init__(self, model: str='large-v3-turbo', language: str='pl', device: str='cuda', beam_size: int=10, temperature: float=0.0) -> None: self.model_name: str = model self.language: str = language self.device: str = device From 0250bd2b9465182246f21198e16c6e853a699746 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:21:59 +0100 Subject: [PATCH 08/89] Support per-series configs and selective pipelines Introduce per-series configuration and make the pipeline build dynamic from those configs. Added SeriesConfig loader and default/template/kiepscy/ranczo JSON configs; pipeline_factory now constructs StepBuilders from series_config and exposes build/visualize/get_step_configs with a series parameter. CLI and helpers updated to accept a --series arg, handle Docker vs local input/output paths, and apply selective skip rules from series config. Pipeline execution now checks state_manager to skip completed steps and marks steps as started/completed; StateManager uses a per-series state file. Misc: pass series_name into scrapers, add create_progress factory, simplify entrypoint, and adjust base scraper behavior. 
--- .pre-commit-config.yaml | 1 + preprocessor/app/pipeline_builder.py | 34 ++ preprocessor/app/pipeline_factory.py | 503 +++++++++--------- preprocessor/cli/cli_main.py | 117 ++-- preprocessor/cli/helpers.py | 16 +- preprocessor/config/series_config.py | 200 +++++++ preprocessor/core/state_manager.py | 5 +- preprocessor/entrypoint.sh | 5 - preprocessor/lib/ui/console.py | 4 + preprocessor/modules/scraping/base_scraper.py | 4 + .../scraping/character_scraper_step.py | 3 +- .../modules/scraping/episode_scraper_step.py | 3 +- preprocessor/modules/text/import_step.py | 7 +- preprocessor/modules/video/transcoding.py | 25 +- preprocessor/series_configs/defaults.json | 49 ++ preprocessor/series_configs/kiepscy.json | 27 + preprocessor/series_configs/ranczo.json | 40 ++ preprocessor/series_configs/template.json | 44 ++ 18 files changed, 771 insertions(+), 316 deletions(-) create mode 100644 preprocessor/config/series_config.py create mode 100644 preprocessor/series_configs/defaults.json create mode 100644 preprocessor/series_configs/kiepscy.json create mode 100644 preprocessor/series_configs/ranczo.json create mode 100644 preprocessor/series_configs/template.json diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c4ca8a0f5..f5b66a90e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,6 +37,7 @@ repos: - id: chmod args: ["755"] files: (.*scripts\/.*.py$|\.sh$) + exclude: ^preprocessor/entrypoint\.sh$ - id: remove-tabs args: [--whitespaces-count, '4'] - repo: https://github.com/PyCQA/isort diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 237ccd391..a5953ac3a 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -48,17 +48,51 @@ def run_for_episodes( next_artifacts = [] for artifact in current_artifacts: + episode_id = artifact.episode_id + + if self.__should_skip_step(step.name, episode_id): + self.context.logger.info( + f"⏭️ Skipping {step.name} 
for {episode_id} (already completed)", + ) + next_artifacts.append(artifact) + continue + try: + self.__mark_step_in_progress(step.name, episode_id) result = step.execute(artifact, self.context) + self.__mark_step_completed(step.name, episode_id) + if result: next_artifacts.append(result) + else: + next_artifacts.append(artifact) except Exception as e: self.context.logger.error( f"Step {step.name} failed for {artifact.episode_id}: {e}", ) + raise current_artifacts = next_artifacts + def __should_skip_step(self, step_name: str, episode_id: str) -> bool: + if self.context.force_rerun: + return False + + if self.context.state_manager is None: + return False + + return self.context.state_manager.is_step_completed(step_name, episode_id) + + def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None: + if self.context.state_manager is None: + return + self.context.state_manager.mark_step_started(step_name, episode_id) + + def __mark_step_completed(self, step_name: str, episode_id: str) -> None: + if self.context.state_manager is None: + return + self.context.state_manager.mark_step_completed(step_name, episode_id) + @staticmethod def __discover_videos(source_path: Path) -> List[Path]: extensions = ["*.mp4", "*.mkv", "*.avi"] diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 77343f6a4..75f9896f7 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -5,6 +5,8 @@ Phase, StepBuilder, ) +from preprocessor.config.config import get_base_output_dir +from preprocessor.config.series_config import SeriesConfig from preprocessor.config.step_configs import ( ArchiveConfig, CharacterDetectionConfig, @@ -31,251 +33,256 @@ PROCESSING = Phase("PROCESSING", color="green") INDEXING = Phase("INDEXING", color="yellow") -episodes_metadata = StepBuilder( - id="scrape_episodes", - phase=SCRAPING, - module="preprocessor.modules.scraping.episode_scraper_step:EpisodeScraperStep", - 
description="Scrapes episode metadata from wiki", - produces=["episodes.json"], - needs=[], - config=EpisodeScraperConfig( - urls=["https://ranczo.fandom.com/pl/wiki/Lista_odcinków"], - output_file="preprocessor/scraped_data/episodes.json", - headless=True, - merge_sources=True, - scraper_method="crawl4ai", - parser_mode="normal", - ), -) - -characters_metadata = StepBuilder( - id="scrape_characters", - phase=SCRAPING, - module="preprocessor.modules.scraping.character_scraper_step:CharacterScraperStep", - description="Scrapes character data from wiki", - produces=["characters.json"], - needs=[], - config=CharacterScraperConfig( - urls=["https://ranczo.fandom.com/pl/wiki/Postacie"], - output_file="preprocessor/scraped_data/characters.json", - headless=True, - scraper_method="crawl4ai", - parser_mode="normal", - ), -) - -character_references = StepBuilder( - id="process_references", - phase=SCRAPING, - module="preprocessor.modules.scraping.reference_processor_step:CharacterReferenceStep", - description="Downloads and processes character reference images", - produces=["character_faces/{character}/*.jpg"], - needs=[characters_metadata], - config=CharacterReferenceConfig( - characters_file="preprocessor/scraped_data/characters.json", - output_dir="preprocessor/character_faces", - search_engine="duckduckgo", - images_per_character=5, - ), -) - -transcoded_videos = StepBuilder( - id="transcode", - phase=PROCESSING, - module="preprocessor.modules.video.transcoding:VideoTranscoderStep", - description="Konwersja do h264_nvenc 720p 30fps z adaptacyjnym bitrate", - produces=["transcoded_videos/{season}/{episode}.mp4"], - needs=[], - config=TranscodeConfig( - video_bitrate_mbps=2.5, - minrate_mbps=1.5, - maxrate_mbps=3.5, - bufsize_mbps=5.0, - gop_size=2.0, - ), -) - -scene_data = StepBuilder( - id="detect_scenes", - phase=PROCESSING, - module="preprocessor.modules.video.scene_detection:SceneDetectorStep", - description="Wykrywa zmiany scen używając TransNetV2", - 
produces=["scene_detections/{season}/{episode}.json"], - needs=[transcoded_videos], - config=SceneDetectionConfig(threshold=0.5, min_scene_len=10), -) - -exported_frames = StepBuilder( - id="export_frames", - phase=PROCESSING, - module="preprocessor.modules.video.frame_export:FrameExporterStep", - description="Eksportuje klatki (PNG) na granicach scen", - produces=["frames/{season}/{episode}/*.png"], - needs=[scene_data], - config=FrameExportConfig(frames_per_scene=3), -) - -transcription_data = StepBuilder( - id="transcribe", - phase=PROCESSING, - module="preprocessor.modules.text.transcription:TranscriptionStep", - description="Transkrypcja audio używając Whisper large-v3-turbo", - produces=["transcriptions/{season}/{episode}.json"], - needs=[transcoded_videos], - config=WhisperTranscriptionConfig( - model="large-v3-turbo", - language="pl", - device="cuda", - beam_size=10, - temperature=0.0, - ), -) - -separated_audio = StepBuilder( - id="separate_sounds", - phase=PROCESSING, - module="preprocessor.modules.audio.separation:AudioSeparationStep", - description="Rozdziela dialogi od efektów dźwiękowych", - produces=["separated_audio/{season}/{episode}/"], - needs=[transcription_data], - config=SoundSeparationConfig(), -) - -text_stats = StepBuilder( - id="analyze_text", - phase=PROCESSING, - module="preprocessor.modules.text.analysis:TextAnalysisStep", - description="Analiza statystyk tekstu (częstotliwość słów, sentiment)", - produces=["text_analysis/{season}/{episode}.json"], - needs=[transcription_data], - config=TextAnalysisConfig(language="pl"), -) - -text_embeddings = StepBuilder( - id="text_embeddings", - phase=PROCESSING, - module="preprocessor.modules.text.embeddings:TextEmbeddingStep", - description="Generuje embeddingi tekstowe używając Qwen2-VL", - produces=["embeddings/text/{season}/{episode}.npy"], - needs=[text_stats], - config=TextEmbeddingConfig( - model_name="Qwen/Qwen2-VL-8B-Instruct", - batch_size=8, - device="cuda", - text_sentences_per_chunk=5, 
- text_chunk_overlap=1, - ), -) - -image_hashes = StepBuilder( - id="image_hashing", - phase=PROCESSING, - module="preprocessor.modules.vision.image_hashing:ImageHashStep", - description="Perceptual hashing klatek (phash, dhash, wavelet)", - produces=["hashes/{season}/{episode}.json"], - needs=[exported_frames], - config=ImageHashConfig(batch_size=32), -) - -video_embeddings = StepBuilder( - id="video_embeddings", - phase=PROCESSING, - module="preprocessor.modules.vision.embeddings:VideoEmbeddingStep", - description="Embeddingi wizualne używając Qwen2-VL", - produces=["embeddings/vision/{season}/{episode}.npy"], - needs=[exported_frames, image_hashes], - config=VideoEmbeddingConfig( - model_name="Qwen/Qwen2-VL-8B-Instruct", - batch_size=8, - device="cuda", - ), -) - -character_detections = StepBuilder( - id="detect_characters", - phase=PROCESSING, - module="preprocessor.modules.vision.character_detection:CharacterDetectorStep", - description="Rozpoznaje postacie na klatkach używając InsightFace", - produces=["detections/characters/{season}/{episode}.json"], - needs=[exported_frames], - config=CharacterDetectionConfig(threshold=0.7), -) - -emotion_data = StepBuilder( - id="detect_emotions", - phase=PROCESSING, - module="preprocessor.modules.vision.emotion_detection:EmotionDetectionStep", - description="Detekcja emocji na twarzach używając EmoNet", - produces=["detections/emotions/{season}/{episode}.json"], - needs=[exported_frames], - config=EmotionDetectionConfig(), -) - -face_clusters = StepBuilder( - id="cluster_faces", - phase=PROCESSING, - module="preprocessor.modules.vision.face_clustering:FaceClusteringStep", - description="Klasteryzacja twarzy używając HDBSCAN", - produces=["clusters/faces/{season}/{episode}.json"], - needs=[exported_frames], - config=FaceClusteringConfig(), -) - -object_detections = StepBuilder( - id="detect_objects", - phase=PROCESSING, - module="preprocessor.modules.vision.object_detection:ObjectDetectionStep", - description="Detekcja 
obiektów ogólnych używając D-FINE", - produces=["detections/objects/{season}/{episode}.json"], - needs=[exported_frames], - config=ObjectDetectionConfig(), -) - -elastic_documents = StepBuilder( - id="generate_elastic_docs", - phase=INDEXING, - module="preprocessor.modules.search.document_generation:DocumentGeneratorStep", - description="Łączy wszystkie dane w dokumenty Elasticsearch", - produces=["elastic_documents/{season}/{episode}.ndjson"], - needs=[ - text_embeddings, - video_embeddings, - character_detections, - emotion_data, - face_clusters, - object_detections, - ], - config=DocumentGenerationConfig(generate_segments=True), -) - -episode_archives = StepBuilder( - id="generate_archives", - phase=INDEXING, - module="preprocessor.modules.packaging.archives:ArchiveGenerationStep", - description="Tworzy archiwa ZIP per odcinek (wszystkie artefakty)", - produces=["archives/{season}/{episode}.zip"], - needs=[elastic_documents], - config=ArchiveConfig(), -) - -indexed_data = StepBuilder( - id="index_to_elasticsearch", - phase=INDEXING, - module="preprocessor.modules.search.indexing:ElasticsearchIndexerStep", - description="Wrzuca dokumenty do Elasticsearch", - produces=[""], - needs=[elastic_documents], - config=ElasticsearchConfig( - index_name="ranczo_clips", - host="localhost:9200", - dry_run=False, - append=False, - ), -) - -def build_pipeline() -> Pipeline: - pipeline = Pipeline(name="ranczo_processing") +def build_pipeline(series_name: str) -> Pipeline: # pylint: disable=too-many-locals + series_config: SeriesConfig = SeriesConfig.load(series_name) + + episodes_metadata = StepBuilder( + id="scrape_episodes", + phase=SCRAPING, + module="preprocessor.modules.scraping.episode_scraper_step:EpisodeScraperStep", + description="Scrapes episode metadata from wiki", + produces=["episodes.json"], + needs=[], + config=EpisodeScraperConfig( + urls=series_config.scraping.episodes.urls, + output_file=str(get_base_output_dir(series_name) / f"{series_name}_episodes.json"), + 
headless=True, + merge_sources=True, + scraper_method="crawl4ai", + parser_mode=series_config.scraping.episodes.parser_mode, + ), + ) + + characters_metadata = StepBuilder( + id="scrape_characters", + phase=SCRAPING, + module="preprocessor.modules.scraping.character_scraper_step:CharacterScraperStep", + description="Scrapes character data from wiki", + produces=["characters.json"], + needs=[], + config=CharacterScraperConfig( + urls=series_config.scraping.characters.urls, + output_file=str(get_base_output_dir(series_name) / f"{series_name}_characters.json"), + headless=True, + scraper_method="crawl4ai", + parser_mode=series_config.scraping.characters.parser_mode, + ), + ) + + character_references = StepBuilder( + id="process_references", + phase=SCRAPING, + module="preprocessor.modules.scraping.reference_processor_step:CharacterReferenceStep", + description="Downloads and processes character reference images", + produces=["character_faces/{character}/*.jpg"], + needs=[characters_metadata], + config=CharacterReferenceConfig( + characters_file=str(get_base_output_dir(series_name) / f"{series_name}_characters.json"), + output_dir=str(get_base_output_dir(series_name) / "character_faces"), + search_engine=series_config.scraping.character_references.search_engine, + images_per_character=series_config.scraping.character_references.images_per_character, + ), + ) + + transcoded_videos = StepBuilder( + id="transcode", + phase=PROCESSING, + module="preprocessor.modules.video.transcoding:VideoTranscoderStep", + description=f"Konwersja do {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} z adaptacyjnym bitrate", + produces=["transcoded_videos/{season}/{episode}.mp4"], + needs=[], + config=TranscodeConfig( + video_bitrate_mbps=series_config.processing.transcode.video_bitrate_mbps, + minrate_mbps=series_config.processing.transcode.minrate_mbps, + maxrate_mbps=series_config.processing.transcode.maxrate_mbps, + 
bufsize_mbps=series_config.processing.transcode.bufsize_mbps, + gop_size=series_config.processing.transcode.gop_size, + ), + ) + + scene_data = StepBuilder( + id="detect_scenes", + phase=PROCESSING, + module="preprocessor.modules.video.scene_detection:SceneDetectorStep", + description="Wykrywa zmiany scen używając TransNetV2", + produces=["scene_detections/{season}/{episode}.json"], + needs=[transcoded_videos], + config=SceneDetectionConfig( + threshold=series_config.processing.scene_detection.threshold, + min_scene_len=series_config.processing.scene_detection.min_scene_len, + ), + ) + + exported_frames = StepBuilder( + id="export_frames", + phase=PROCESSING, + module="preprocessor.modules.video.frame_export:FrameExporterStep", + description="Eksportuje klatki (PNG) na granicach scen", + produces=["frames/{season}/{episode}/*.png"], + needs=[scene_data], + config=FrameExportConfig(frames_per_scene=series_config.processing.frame_export.frames_per_scene), + ) + + transcription_data = StepBuilder( + id="transcribe", + phase=PROCESSING, + module="preprocessor.modules.text.transcription:TranscriptionStep", + description=f"Transkrypcja audio używając {series_config.processing.transcription.mode}", + produces=["transcriptions/{season}/{episode}.json"], + needs=[transcoded_videos], + config=WhisperTranscriptionConfig( + model=series_config.processing.transcription.model, + language=series_config.processing.transcription.language, + device=series_config.processing.transcription.device, + beam_size=10, + temperature=0.0, + ), + ) + + separated_audio = StepBuilder( + id="separate_sounds", + phase=PROCESSING, + module="preprocessor.modules.audio.separation:SoundSeparationStep", + description="Rozdziela dialogi od efektów dźwiękowych", + produces=["separated_audio/{season}/{episode}/"], + needs=[transcription_data], + config=SoundSeparationConfig(), + ) + + text_stats = StepBuilder( + id="analyze_text", + phase=PROCESSING, + 
module="preprocessor.modules.text.analysis:TextAnalysisStep", + description="Analiza statystyk tekstu (częstotliwość słów, sentiment)", + produces=["text_analysis/{season}/{episode}.json"], + needs=[transcription_data], + config=TextAnalysisConfig(language=series_config.processing.transcription.language), + ) + + text_embeddings = StepBuilder( + id="text_embeddings", + phase=PROCESSING, + module="preprocessor.modules.text.embeddings:TextEmbeddingStep", + description="Generuje embeddingi tekstowe używając Qwen2-VL", + produces=["embeddings/text/{season}/{episode}.npy"], + needs=[text_stats], + config=TextEmbeddingConfig( + model_name="Qwen/Qwen2-VL-8B-Instruct", + batch_size=8, + device="cuda", + text_sentences_per_chunk=5, + text_chunk_overlap=1, + ), + ) + + image_hashes = StepBuilder( + id="image_hashing", + phase=PROCESSING, + module="preprocessor.modules.vision.image_hashing:ImageHashStep", + description="Perceptual hashing klatek (phash, dhash, wavelet)", + produces=["hashes/{season}/{episode}.json"], + needs=[exported_frames], + config=ImageHashConfig(batch_size=32), + ) + + video_embeddings = StepBuilder( + id="video_embeddings", + phase=PROCESSING, + module="preprocessor.modules.vision.embeddings:VideoEmbeddingStep", + description="Embeddingi wizualne używając Qwen2-VL", + produces=["embeddings/vision/{season}/{episode}.npy"], + needs=[exported_frames, image_hashes], + config=VideoEmbeddingConfig( + model_name="Qwen/Qwen2-VL-8B-Instruct", + batch_size=8, + device="cuda", + ), + ) + + character_detections = StepBuilder( + id="detect_characters", + phase=PROCESSING, + module="preprocessor.modules.vision.character_detection:CharacterDetectorStep", + description="Rozpoznaje postacie na klatkach używając InsightFace", + produces=["detections/characters/{season}/{episode}.json"], + needs=[exported_frames], + config=CharacterDetectionConfig(threshold=0.7), + ) + + emotion_data = StepBuilder( + id="detect_emotions", + phase=PROCESSING, + 
module="preprocessor.modules.vision.emotion_detection:EmotionDetectionStep", + description="Detekcja emocji na twarzach używając EmoNet", + produces=["detections/emotions/{season}/{episode}.json"], + needs=[exported_frames], + config=EmotionDetectionConfig(), + ) + + face_clusters = StepBuilder( + id="cluster_faces", + phase=PROCESSING, + module="preprocessor.modules.vision.face_clustering:FaceClusteringStep", + description="Klasteryzacja twarzy używając HDBSCAN", + produces=["clusters/faces/{season}/{episode}.json"], + needs=[exported_frames], + config=FaceClusteringConfig(), + ) + + object_detections = StepBuilder( + id="detect_objects", + phase=PROCESSING, + module="preprocessor.modules.vision.object_detection:ObjectDetectionStep", + description="Detekcja obiektów ogólnych używając D-FINE", + produces=["detections/objects/{season}/{episode}.json"], + needs=[exported_frames], + config=ObjectDetectionConfig(), + ) + + elastic_documents = StepBuilder( + id="generate_elastic_docs", + phase=INDEXING, + module="preprocessor.modules.search.document_generation:DocumentGeneratorStep", + description="Łączy wszystkie dane w dokumenty Elasticsearch", + produces=["elastic_documents/{season}/{episode}.ndjson"], + needs=[ + text_embeddings, + video_embeddings, + character_detections, + emotion_data, + face_clusters, + object_detections, + ], + config=DocumentGenerationConfig(generate_segments=True), + ) + + episode_archives = StepBuilder( + id="generate_archives", + phase=INDEXING, + module="preprocessor.modules.packaging.archives:ArchiveGenerationStep", + description="Tworzy archiwa ZIP per odcinek (wszystkie artefakty)", + produces=["archives/{season}/{episode}.zip"], + needs=[elastic_documents], + config=ArchiveConfig(), + ) + + indexed_data = StepBuilder( + id="index_to_elasticsearch", + phase=INDEXING, + module="preprocessor.modules.search.indexing:ElasticsearchIndexerStep", + description="Wrzuca dokumenty do Elasticsearch", + produces=[""], + needs=[elastic_documents], + 
config=ElasticsearchConfig( + index_name=series_config.indexing.elasticsearch.index_name, + host=series_config.indexing.elasticsearch.host, + dry_run=series_config.indexing.elasticsearch.dry_run, + append=series_config.indexing.elasticsearch.append, + ), + ) + + pipeline = Pipeline(name=f"{series_name}_processing") pipeline.register(episodes_metadata) pipeline.register(characters_metadata) @@ -307,11 +314,11 @@ def build_pipeline() -> Pipeline: return pipeline -def visualize() -> None: - pipeline = build_pipeline() +def visualize(series_name: str = "ranczo") -> None: + pipeline = build_pipeline(series_name) print(pipeline.to_ascii_art()) -def get_step_configs() -> Dict[str, object]: - pipeline = build_pipeline() +def get_step_configs(series_name: str) -> Dict[str, object]: + pipeline = build_pipeline(series_name) return {step_id: step.config for step_id, step in pipeline._steps.items()} diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 431498812..9e467b502 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -1,3 +1,4 @@ +import os from pathlib import Path from typing import Callable @@ -9,6 +10,17 @@ visualize, ) from preprocessor.cli.helpers import setup_pipeline_context +from preprocessor.config.series_config import SeriesConfig + + +def _get_input_base_path() -> Path: + is_docker: bool = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' + return Path('/input_data') if is_docker else Path('preprocessor/input_data') + + +def _get_output_base_path() -> Path: + is_docker: bool = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' + return Path('/app/output_data') if is_docker else Path('preprocessor/output_data') @click.group() @@ -18,8 +30,9 @@ def cli() -> None: @cli.command(name="visualize") -def visualize_command() -> None: - visualize() +@click.option("--series", default="ranczo", help="Series name (e.g., ranczo)") +def visualize_command(series: str) -> None: + visualize(series) 
@cli.command(name="run-all") @@ -31,33 +44,48 @@ def visualize_command() -> None: help="Step IDs to skip (e.g., --skip transcode --skip detect_scenes)", ) def run_all(series: str, force_rerun: bool, skip: tuple) -> None: - pipeline = build_pipeline() + series_config = SeriesConfig.load(series) + pipeline = build_pipeline(series) setup = setup_pipeline_context(series, "run_all", force_rerun, with_episode_manager=True) - plan = pipeline.get_execution_order(skip=list(skip)) + try: # pylint: disable=too-many-try-statements + skip_list = list(skip) + if series_config.pipeline_mode == "selective": + skip_list.extend(series_config.skip_steps) + skip_list = list(set(skip_list)) + if series_config.skip_steps: + setup.logger.info(f"🔧 Selective mode: auto-skipping {', '.join(series_config.skip_steps)}") + + plan = pipeline.get_execution_order(skip=skip_list) - setup.logger.info(f"📋 Execution plan: {' → '.join(plan)}") - setup.logger.info(f"📂 Source: preprocessor/input_data/{series}") + input_base = _get_input_base_path() + source_path = input_base / series - source_path = Path("preprocessor/input_data") / series + setup.logger.info(f"📋 Execution plan: {' → '.join(plan)}") + setup.logger.info(f"📂 Source: {source_path}") - for step_id in plan: - step = pipeline.get_step(step_id) - setup.logger.info(f"{'=' * 80}") - setup.logger.info(f"🔧 Step: {step_id}") - setup.logger.info(f"📝 {step.description}") + for step_id in plan: + step = pipeline.get_step(step_id) + setup.logger.info(f"{'=' * 80}") + setup.logger.info(f"🔧 Step: {step_id}") + setup.logger.info(f"📝 {step.description}") - StepClass = step.load_class() - instance = StepClass(step.config) + StepClass = step.load_class() + instance = StepClass(step.config) - runner = PipelineRunner(setup.context) - runner.add_step(instance) - runner.run_for_episodes(source_path, setup.episode_manager) + runner = PipelineRunner(setup.context) + runner.add_step(instance) + runner.run_for_episodes(source_path, setup.episode_manager) - 
setup.logger.info(f"✅ Step '{step_id}' completed") + setup.logger.info(f"✅ Step '{step_id}' completed") - setup.logger.info("=" * 80) - setup.logger.info("🎉 Pipeline completed successfully!") + setup.logger.info("=" * 80) + setup.logger.info("🎉 Pipeline completed successfully!") + except KeyboardInterrupt: + setup.logger.info("\n🛑 Interrupted by user") + raise + finally: + setup.logger.finalize() def _create_step_command(step_id: str, step_description: str) -> Callable: @@ -65,39 +93,46 @@ def _create_step_command(step_id: str, step_description: str) -> Callable: @click.option("--series", required=True, help="Series name (e.g., ranczo)") @click.option("--force-rerun", is_flag=True, help="Force rerun even if cached") def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> None: - pipeline = build_pipeline() + pipeline = build_pipeline(series) setup = setup_pipeline_context(series, _step_id, force_rerun, with_episode_manager=True) - step = pipeline.get_step(_step_id) + try: # pylint: disable=too-many-try-statements + step = pipeline.get_step(_step_id) - deps = step.dependency_ids - if deps: - setup.logger.info(f"📦 Dependencies: {', '.join(deps)}") - for dep_id in deps: - if not setup.context.state_manager.is_step_completed(dep_id, "*"): - setup.logger.warning( - f"⚠️ Dependency '{dep_id}' may not be completed. " - f"Run it first or use --force-rerun.", - ) + deps = step.dependency_ids + if deps: + setup.logger.info(f"📦 Dependencies: {', '.join(deps)}") + for dep_id in deps: + if not setup.context.state_manager.is_step_completed(dep_id, "*"): + setup.logger.warning( + f"⚠️ Dependency '{dep_id}' may not be completed. 
" + f"Run it first or use --force-rerun.", + ) - setup.logger.info(f"🔧 Running: {_step_id}") - setup.logger.info(f"📝 {step.description}") + setup.logger.info(f"🔧 Running: {_step_id}") + setup.logger.info(f"📝 {step.description}") - StepClass = step.load_class() - instance = StepClass(step.config) + StepClass = step.load_class() + instance = StepClass(step.config) - source_path = Path("preprocessor/input_data") / series + input_base = _get_input_base_path() + source_path = input_base / series - runner = PipelineRunner(setup.context) - runner.add_step(instance) - runner.run_for_episodes(source_path, setup.episode_manager) + runner = PipelineRunner(setup.context) + runner.add_step(instance) + runner.run_for_episodes(source_path, setup.episode_manager) - setup.logger.info(f"✅ Step '{_step_id}' completed successfully") + setup.logger.info(f"✅ Step '{_step_id}' completed successfully") + except KeyboardInterrupt: + setup.logger.info("\n🛑 Interrupted by user") + raise + finally: + setup.logger.finalize() return step_command -_cli_pipeline = build_pipeline() +_cli_pipeline = build_pipeline("ranczo") for _step_id, _step in _cli_pipeline._steps.items(): command_func = _create_step_command(_step_id, _step.description) diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index fd3348264..09aeb5ff0 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -1,5 +1,6 @@ from dataclasses import dataclass import logging +import os from pathlib import Path from typing import Optional @@ -53,18 +54,27 @@ def setup_pipeline_context( with_episode_manager: bool = True, ) -> PipelineSetup: logger: ErrorHandlingLogger = create_cli_logger(logger_name) - state_manager: StateManager = StateManager(series) + + is_docker: bool = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' + base_dir: Path = Path('/app/output_data') if is_docker else Path('preprocessor/output_data') + + series_output_dir: Path = base_dir / series + 
series_output_dir.mkdir(parents=True, exist_ok=True) + + state_manager: StateManager = StateManager(series, working_dir=series_output_dir) state_manager.load_or_create_state() + context: ExecutionContext = ExecutionContext( series_name=series, - base_output_dir=Path('preprocessor/output_data'), + base_output_dir=base_dir, logger=logger, state_manager=state_manager, force_rerun=force_rerun, ) episode_manager: Optional[EpisodeManager] = None if with_episode_manager: - episodes_json: Optional[Path] = Path(f'preprocessor/input_data/{series}/episodes.json') + input_base: Path = Path('/input_data') if is_docker else Path('preprocessor/input_data') + episodes_json: Optional[Path] = input_base / series / 'episodes.json' if not episodes_json.exists(): episodes_json = None episode_manager = EpisodeManager(episodes_json, series, logger) diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py new file mode 100644 index 000000000..cffd694af --- /dev/null +++ b/preprocessor/config/series_config.py @@ -0,0 +1,200 @@ +from dataclasses import dataclass +import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + + +def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: + result: Dict[str, Any] = base.copy() + for key, value in override.items(): + if key.startswith('_'): + continue + if key in result and isinstance(result[key], dict) and isinstance(value, dict): + result[key] = _deep_merge(result[key], value) + else: + result[key] = value + return result + + +@dataclass +class EpisodeScrapingConfig: + urls: List[str] + parser_mode: str + + +@dataclass +class CharacterScrapingConfig: + urls: List[str] + parser_mode: str + + +@dataclass +class CharacterReferencesConfig: + search_engine: str + images_per_character: int + + +@dataclass +class ScrapingConfig: + episodes: EpisodeScrapingConfig + characters: CharacterScrapingConfig + character_references: CharacterReferencesConfig + + +@dataclass 
+class TranscriptionProcessingConfig: + mode: str + model: str + language: str + device: str + + +@dataclass +class TranscodeProcessingConfig: + codec: str + resolution: str + video_bitrate_mbps: float + minrate_mbps: float + maxrate_mbps: float + bufsize_mbps: float + gop_size: float + + +@dataclass +class SceneDetectionProcessingConfig: + threshold: float + min_scene_len: int + + +@dataclass +class FrameExportProcessingConfig: + frames_per_scene: int + + +@dataclass +class ProcessingConfig: + transcription: TranscriptionProcessingConfig + transcode: TranscodeProcessingConfig + scene_detection: SceneDetectionProcessingConfig + frame_export: FrameExportProcessingConfig + + +@dataclass +class ElasticsearchIndexingConfig: + index_name: str + host: str + dry_run: bool + append: bool + + +@dataclass +class IndexingConfig: + elasticsearch: ElasticsearchIndexingConfig + + +@dataclass +class SeriesConfig: + series_name: str + display_name: str + pipeline_mode: str + skip_steps: List[str] + scraping: ScrapingConfig + processing: ProcessingConfig + indexing: IndexingConfig + + @staticmethod + def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': + return SeriesConfig( + series_name=data['series_name'], + display_name=data['display_name'], + pipeline_mode=data.get('pipeline_mode', 'full'), + skip_steps=data.get('skip_steps', []), + scraping=ScrapingConfig( + episodes=EpisodeScrapingConfig( + urls=data['scraping']['episodes']['urls'], + parser_mode=data['scraping']['episodes']['parser_mode'], + ), + characters=CharacterScrapingConfig( + urls=data['scraping']['characters']['urls'], + parser_mode=data['scraping']['characters']['parser_mode'], + ), + character_references=CharacterReferencesConfig( + search_engine=data['scraping']['character_references']['search_engine'], + images_per_character=data['scraping']['character_references']['images_per_character'], + ), + ), + processing=ProcessingConfig( + transcription=TranscriptionProcessingConfig( + 
mode=data['processing']['transcription']['mode'], + model=data['processing']['transcription']['model'], + language=data['processing']['transcription']['language'], + device=data['processing']['transcription']['device'], + ), + transcode=TranscodeProcessingConfig( + codec=data['processing']['transcode']['codec'], + resolution=data['processing']['transcode']['resolution'], + video_bitrate_mbps=data['processing']['transcode']['video_bitrate_mbps'], + minrate_mbps=data['processing']['transcode']['minrate_mbps'], + maxrate_mbps=data['processing']['transcode']['maxrate_mbps'], + bufsize_mbps=data['processing']['transcode']['bufsize_mbps'], + gop_size=data['processing']['transcode']['gop_size'], + ), + scene_detection=SceneDetectionProcessingConfig( + threshold=data['processing']['scene_detection']['threshold'], + min_scene_len=data['processing']['scene_detection']['min_scene_len'], + ), + frame_export=FrameExportProcessingConfig( + frames_per_scene=data['processing']['frame_export']['frames_per_scene'], + ), + ), + indexing=IndexingConfig( + elasticsearch=ElasticsearchIndexingConfig( + index_name=data['indexing']['elasticsearch']['index_name'], + host=data['indexing']['elasticsearch']['host'], + dry_run=data['indexing']['elasticsearch']['dry_run'], + append=data['indexing']['elasticsearch']['append'], + ), + ), + ) + + @staticmethod + def __load_defaults() -> Dict[str, Any]: + defaults_path: Path = Path('preprocessor/series_configs/defaults.json') + if not defaults_path.exists(): + return {} + with open(defaults_path, 'r', encoding='utf-8') as f: + data: Dict[str, Any] = json.load(f) + return {k: v for k, v in data.items() if not k.startswith('_')} + + @staticmethod + def _load_from_file(config_path: Path) -> 'SeriesConfig': + if not config_path.exists(): + raise FileNotFoundError( + f"Series config not found: {config_path}\n" + f"Create it using template: preprocessor/series_configs/template.json", + ) + + defaults: Dict[str, Any] = SeriesConfig.__load_defaults() + + 
with open(config_path, 'r', encoding='utf-8') as f: + series_overrides: Dict[str, Any] = json.load(f) + + series_filtered: Dict[str, Any] = { + k: v for k, v in series_overrides.items() + if not k.startswith('_') + } + + merged_config: Dict[str, Any] = _deep_merge(defaults, series_filtered) + + return SeriesConfig.__load_from_dict(merged_config) + + @staticmethod + def load(series_name: str) -> 'SeriesConfig': + config_dir: Path = Path('preprocessor/series_configs') + config_path: Path = config_dir / f'{series_name}.json' + + return SeriesConfig._load_from_file(config_path) diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 756673928..81f1d37e0 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -66,11 +66,12 @@ def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': ) class StateManager: - STATE_FILE: str = '.preprocessing_state.json' + STATE_FILE_TEMPLATE: str = '.preprocessing_state_{series}.json' def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: self.__series_name: str = series_name - self.__state_file: Path = working_dir / self.STATE_FILE + state_filename: str = self.STATE_FILE_TEMPLATE.format(series=series_name) + self.__state_file: Path = working_dir / state_filename self.__state: Optional[ProcessingState] = None self.__cleanup_registered: bool = False self.__interrupted: bool = False diff --git a/preprocessor/entrypoint.sh b/preprocessor/entrypoint.sh index 15033bb39..71388463b 100755 --- a/preprocessor/entrypoint.sh +++ b/preprocessor/entrypoint.sh @@ -1,10 +1,5 @@ #!/bin/bash set -e -echo "Ensuring global output directories exist..." -mkdir -p /app/output_data/characters -mkdir -p /app/output_data/scraped_pages -mkdir -p /app/output_data/processing_metadata - echo "Starting application..." 
exec python -m preprocessor.cli "$@" diff --git a/preprocessor/lib/ui/console.py b/preprocessor/lib/ui/console.py index b3fb82251..4f7d631e0 100644 --- a/preprocessor/lib/ui/console.py +++ b/preprocessor/lib/ui/console.py @@ -87,4 +87,8 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): pass + +def create_progress() -> SimpleProgress: + return SimpleProgress() + console = _get_console() diff --git a/preprocessor/modules/scraping/base_scraper.py b/preprocessor/modules/scraping/base_scraper.py index c734dbad9..ba5384b24 100644 --- a/preprocessor/modules/scraping/base_scraper.py +++ b/preprocessor/modules/scraping/base_scraper.py @@ -86,6 +86,10 @@ def _save_result(self, result: Dict[str, Any]) -> None: with open(self.output_file, 'w', encoding='utf-8') as f: json.dump(result, f, indent=2, ensure_ascii=False) + def get_output_subdir(self) -> str: + """Scrapery używają bezpośrednio output_file zamiast subdirektoriów per-episode.""" + return "" + @abstractmethod def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: pass diff --git a/preprocessor/modules/scraping/character_scraper_step.py b/preprocessor/modules/scraping/character_scraper_step.py index e1734a387..6fc263bf2 100644 --- a/preprocessor/modules/scraping/character_scraper_step.py +++ b/preprocessor/modules/scraping/character_scraper_step.py @@ -34,13 +34,14 @@ def execute( context.logger.info(f"Scraping characters from {len(self.config.urls)} URLs") - scraper = CharacterScraper( # pylint: disable=abstract-class-instantiated + scraper = CharacterScraper( { "urls": self.config.urls, "output_file": output_path, "headless": self.config.headless, "scraper_method": self.config.scraper_method, "parser_mode": self.config.parser_mode, + "series_name": context.series_name, }, ) diff --git a/preprocessor/modules/scraping/episode_scraper_step.py b/preprocessor/modules/scraping/episode_scraper_step.py index d9d234f08..068f63e29 100644 --- 
a/preprocessor/modules/scraping/episode_scraper_step.py +++ b/preprocessor/modules/scraping/episode_scraper_step.py @@ -34,7 +34,7 @@ def execute( context.logger.info(f"Scraping episodes from {len(self.config.urls)} URLs") - scraper = EpisodeScraper( # pylint: disable=abstract-class-instantiated + scraper = EpisodeScraper( { "urls": self.config.urls, "output_file": output_path, @@ -42,6 +42,7 @@ def execute( "merge_sources": self.config.merge_sources, "scraper_method": self.config.scraper_method, "parser_mode": self.config.parser_mode, + "series_name": context.series_name, }, ) diff --git a/preprocessor/modules/text/import_step.py b/preprocessor/modules/text/import_step.py index 3f2c13d0c..8b1ea8ba0 100644 --- a/preprocessor/modules/text/import_step.py +++ b/preprocessor/modules/text/import_step.py @@ -66,9 +66,10 @@ def _import_single_file(self, json_file: Path, context: ExecutionContext) -> Opt output_filename: str = self._episode_manager.path_manager.build_filename(episode_info, extension='json') output_path: Path = context.get_output_path(episode_info, 'transcriptions', output_filename) if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, episode_id): - context.logger.info(f'Skipping {episode_id} (cached)') - return TranscriptionData(episode_id=episode_id, episode_info=episode_info, path=output_path, language='pl', model='11labs', format='json') + context.logger.info(f'Skipping {episode_id} (output exists)') + if not context.is_step_completed(self.name, episode_id): + context.mark_step_completed(self.name, episode_id) + return TranscriptionData(episode_id=episode_id, episode_info=episode_info, path=output_path, language='pl', model='11labs', format='json') context.logger.info(f'Importing {episode_id} from {json_file.name}') context.mark_step_started(self.name, episode_id) with open(json_file, 'r', encoding='utf-8') as f: diff --git a/preprocessor/modules/video/transcoding.py b/preprocessor/modules/video/transcoding.py 
index 4634e0e64..aea0a256e 100644 --- a/preprocessor/modules/video/transcoding.py +++ b/preprocessor/modules/video/transcoding.py @@ -20,18 +20,19 @@ def execute( # pylint: disable=too-many-locals output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' output_path = context.get_season_output_path(input_data.episode_info, 'transcoded_videos', output_filename) if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - resolution_str = ( - f'{self.config.resolution.width}x{self.config.resolution.height}' - ) - return TranscodedVideo( - path=output_path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - resolution=resolution_str, - codec=self.config.codec, - ) + context.logger.info(f'Skipping {input_data.episode_id} (output exists)') + if not context.is_step_completed(self.name, input_data.episode_id): + context.mark_step_completed(self.name, input_data.episode_id) + resolution_str = ( + f'{self.config.resolution.width}x{self.config.resolution.height}' + ) + return TranscodedVideo( + path=output_path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + resolution=resolution_str, + codec=self.config.codec, + ) probe_data = FFmpegWrapper.probe_video(input_data.path) input_fps = FFmpegWrapper.get_framerate(probe_data) input_video_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json new file mode 100644 index 000000000..04601e0f9 --- /dev/null +++ b/preprocessor/series_configs/defaults.json @@ -0,0 +1,49 @@ +{ + "_comment": "Domy\u015blna konfiguracja - wszystkie serie dziedzicz\u0105 te ustawienia", + "_note": "Seria mo\u017ce nadpisa\u0107 dowolne pole w swoim pliku JSON", + "indexing": { + "elasticsearch": { + "append": false, + "dry_run": false, + "host": 
"localhost:9200" + } + }, + "pipeline_mode": "full", + "processing": { + "frame_export": { + "frames_per_scene": 3 + }, + "scene_detection": { + "min_scene_len": 10, + "threshold": 0.5 + }, + "transcode": { + "bufsize_mbps": 5.0, + "codec": "h264_nvenc", + "gop_size": 2.0, + "maxrate_mbps": 3.5, + "minrate_mbps": 1.5, + "resolution": "720p", + "video_bitrate_mbps": 2.5 + }, + "transcription": { + "device": "cuda", + "language": "pl", + "mode": "whisper", + "model": "large-v3-turbo" + } + }, + "scraping": { + "character_references": { + "images_per_character": 5, + "search_engine": "duckduckgo" + }, + "characters": { + "parser_mode": "normal" + }, + "episodes": { + "parser_mode": "normal" + } + }, + "skip_steps": [] +} diff --git a/preprocessor/series_configs/kiepscy.json b/preprocessor/series_configs/kiepscy.json new file mode 100644 index 000000000..ad8ae8601 --- /dev/null +++ b/preprocessor/series_configs/kiepscy.json @@ -0,0 +1,27 @@ +{ + "_comment": "Konfiguracja dla Kiepscy - tylko zmiany wzgl\u0119dem defaults.json", + "display_name": "\u015awiat wed\u0142ug Kiepskich", + "indexing": { + "elasticsearch": { + "index_name": "kiepscy_clips" + } + }, + "scraping": { + "character_references": { + "search_engine": "google" + }, + "characters": { + "parser_mode": "premium", + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_postaci_serialu_%C5%9Awiat_wed%C5%82ug_Kiepskich" + ] + }, + "episodes": { + "parser_mode": "premium", + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_odcink%C3%B3w_serialu_%C5%9Awiat_wed%C5%82ug_Kiepskich" + ] + } + }, + "series_name": "kiepscy" +} diff --git a/preprocessor/series_configs/ranczo.json b/preprocessor/series_configs/ranczo.json new file mode 100644 index 000000000..698718f53 --- /dev/null +++ b/preprocessor/series_configs/ranczo.json @@ -0,0 +1,40 @@ +{ + "_comment": "Konfiguracja dla Ranczo - tylko zmiany wzgl\u0119dem defaults.json", + "_note": "Metadane, bohaterowie i transkrypcje przygotowane r\u0119cznie/z 11labs", + 
"display_name": "Ranczo", + "indexing": { + "elasticsearch": { + "index_name": "ranczo_clips" + } + }, + "pipeline_mode": "selective", + "processing": { + "transcription": { + "mode": "elevenlabs" + } + }, + "scraping": { + "character_references": { + "search_engine": "google" + }, + "characters": { + "parser_mode": "premium", + "urls": [ + "https://ranczo.fandom.com/pl/wiki/Postacie" + ] + }, + "episodes": { + "parser_mode": "premium", + "urls": [ + "https://ranczo.fandom.com/pl/wiki/Lista_odcink\u00f3w" + ] + } + }, + "series_name": "ranczo", + "skip_steps": [ + "scrape_episodes", + "scrape_characters", + "process_references", + "transcribe" + ] +} diff --git a/preprocessor/series_configs/template.json b/preprocessor/series_configs/template.json new file mode 100644 index 000000000..69e9deadc --- /dev/null +++ b/preprocessor/series_configs/template.json @@ -0,0 +1,44 @@ +{ + "_comment": "Template - skopiuj i edytuj dla swojej serii", + "_instruction": "1. Skopiuj jako {nazwa_serii}.json, 2. Wype\u0142nij wymagane pola, 3. 
Dodaj tylko te opcje kt\u00f3re si\u0119 r\u00f3\u017cni\u0105 od defaults", + "_note": "Podaj tylko ZMIANY wzgl\u0119dem defaults.json (nie przepisuj wszystkiego!)", + "_optional_overrides": { + "_comment": "Poni\u017cej opcjonalne nadpisania defaults.json - odkomentuj i dostosuj wed\u0142ug potrzeb", + "pipeline_mode_example": "selective (je\u015bli masz gotowe dane)", + "processing.transcode.codec_example": "h265_nvenc (je\u015bli chcesz HEVC)", + "processing.transcode.resolution_example": "1080p (je\u015bli chcesz wy\u017csz\u0105 jako\u015b\u0107)", + "processing.transcription.mode_example": "elevenlabs (je\u015bli masz 11labs API)", + "scraping.character_references.search_engine_example": "google (je\u015bli masz SerpAPI)", + "scraping.episodes.parser_mode_example": "premium (je\u015bli masz Gemini API)", + "skip_steps_example": [ + "scrape_episodes", + "transcribe" + ] + }, + "_required_fields": { + "display_name": "WYMAGANE - nazwa wy\u015bwietlana", + "indexing.elasticsearch.index_name": "WYMAGANE - nazwa indeksu ES", + "scraping.characters.urls": "WYMAGANE - lista URLi do stron z postaciami", + "scraping.episodes.urls": "WYMAGANE - lista URLi do stron z odcinkami", + "series_name": "WYMAGANE - nazwa serii (musi zgadza\u0107 si\u0119 z nazw\u0105 pliku i katalogu input_data/)" + }, + "display_name": "Nazwa Wy\u015bwietlana", + "indexing": { + "elasticsearch": { + "index_name": "nazwa_serii_clips" + } + }, + "scraping": { + "characters": { + "urls": [ + "https://example.com/wiki/Characters" + ] + }, + "episodes": { + "urls": [ + "https://example.com/wiki/Episodes" + ] + } + }, + "series_name": "nazwa_serii" +} From fa146b2e1272ce5b30cc24164e1599dd40d70e5e Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:30:48 +0100 Subject: [PATCH 09/89] Make FFmpegWrapper helpers private; fix typings Rename FFmpegWrapper internal constants and helper methods to use double-underscore (name-mangled) identifiers and 
update all call sites accordingly in preprocessor/lib/media/ffmpeg.py. Reorder/relocate helper implementations while preserving behavior. Also switch list[str] annotations to typing.List for compatibility and add the List import in both ffmpeg.py and preprocessor/modules/audio/extraction.py; update the command variable typing in audio extraction to List[str]. No functional changes to encoding logic. --- preprocessor/lib/media/ffmpeg.py | 163 ++++++++++++----------- preprocessor/modules/audio/extraction.py | 3 +- 2 files changed, 84 insertions(+), 82 deletions(-) diff --git a/preprocessor/lib/media/ffmpeg.py b/preprocessor/lib/media/ffmpeg.py index c9f8e6164..5031d7bea 100644 --- a/preprocessor/lib/media/ffmpeg.py +++ b/preprocessor/lib/media/ffmpeg.py @@ -4,80 +4,21 @@ from typing import ( Any, Dict, + List, Optional, ) class FFmpegWrapper: - _PROFILE = 'main' - _LEVEL = '4.1' - _PIX_FMT = 'yuv420p' - _BF = '2' - _B_ADAPT = '1' - _TWO_PASS = '1' - _RC_LOOKAHEAD = '32' - _AQ_STRENGTH = '15' - _AUDIO_CHANNELS = '2' - - @staticmethod - def _build_video_filter(width: int, height: int) -> str: - return ( - f"scale='iw*sar:ih',scale={width}:{height}:" - f"force_original_aspect_ratio=decrease,pad={width}:{height}:" - f"(ow-iw)/2:(oh-ih)/2:black,setsar=1" - ) - - @staticmethod - def _build_base_command( - input_path: Path, codec: str, preset: str, target_fps: Optional[float], - ) -> list[str]: - command = [ - 'ffmpeg', '-v', 'error', '-stats', '-hide_banner', '-y', - '-i', str(input_path), - '-c:v', codec, - '-preset', preset, - '-profile:v', FFmpegWrapper._PROFILE, - '-level', FFmpegWrapper._LEVEL, - '-pix_fmt', FFmpegWrapper._PIX_FMT, - ] - if target_fps: - command.extend(['-r', str(target_fps)]) - return command - - @staticmethod - def _build_encoding_params( - video_bitrate: str, minrate: str, maxrate: str, bufsize: str, gop_size: int, - ) -> list[str]: - return [ - '-rc', 'vbr_hq', - '-b:v', video_bitrate, - '-minrate', minrate, - '-maxrate', maxrate, - '-bufsize', 
bufsize, - '-bf', FFmpegWrapper._BF, - '-b_adapt', FFmpegWrapper._B_ADAPT, - '-2pass', FFmpegWrapper._TWO_PASS, - '-rc-lookahead', FFmpegWrapper._RC_LOOKAHEAD, - '-aq-strength', FFmpegWrapper._AQ_STRENGTH, - '-g', str(gop_size), - '-spatial-aq', '1', - '-temporal-aq', '1', - '-multipass', 'fullres', - ] - - @staticmethod - def _build_audio_and_output_params( - audio_bitrate: str, vf_filter: str, output_path: Path, - ) -> list[str]: - return [ - '-c:a', 'aac', - '-b:a', audio_bitrate, - '-ac', FFmpegWrapper._AUDIO_CHANNELS, - '-vf', vf_filter, - '-movflags', '+faststart', - '-f', 'mp4', - str(output_path), - ] + __PROFILE = 'main' + __LEVEL = '4.1' + __PIX_FMT = 'yuv420p' + __BF = '2' + __B_ADAPT = '1' + __TWO_PASS = '1' + __RC_LOOKAHEAD = '32' + __AQ_STRENGTH = '15' + __AUDIO_CHANNELS = '2' @staticmethod def transcode( # pylint: disable=too-many-arguments @@ -95,15 +36,15 @@ def transcode( # pylint: disable=too-many-arguments target_fps: Optional[float] = None, ) -> None: width, height = [int(x) for x in resolution.split(':')] - vf_filter = FFmpegWrapper._build_video_filter(width, height) - command = FFmpegWrapper._build_base_command(input_path, codec, preset, target_fps) + vf_filter = FFmpegWrapper.__build_video_filter(width, height) + command = FFmpegWrapper.__build_base_command(input_path, codec, preset, target_fps) command.extend( - FFmpegWrapper._build_encoding_params( + FFmpegWrapper.__build_encoding_params( video_bitrate, minrate, maxrate, bufsize, gop_size, ), ) command.extend( - FFmpegWrapper._build_audio_and_output_params( + FFmpegWrapper.__build_audio_and_output_params( audio_bitrate, vf_filter, output_path, ), ) @@ -115,14 +56,9 @@ def probe_video(video_path: Path) -> Dict[str, Any]: result = subprocess.run(cmd, capture_output=True, text=True, check=True) return json.loads(result.stdout) - @staticmethod - def _get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]: - streams = [s for s in probe_data.get('streams', 
[]) if s.get('codec_type') == codec_type] - return streams[0] if streams else None - @staticmethod def get_framerate(probe_data: Dict[str, Any]) -> float: - stream = FFmpegWrapper._get_stream_by_type(probe_data, 'video') + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: raise ValueError('No video streams found') r_frame_rate = stream.get('r_frame_rate') @@ -133,7 +69,7 @@ def get_framerate(probe_data: Dict[str, Any]) -> float: @staticmethod def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]: - stream = FFmpegWrapper._get_stream_by_type(probe_data, 'video') + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: return None bit_rate = stream.get('bit_rate') @@ -143,10 +79,75 @@ def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]: @staticmethod def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]: - stream = FFmpegWrapper._get_stream_by_type(probe_data, 'audio') + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'audio') if not stream: return None bit_rate = stream.get('bit_rate') if not bit_rate: return None return int(int(bit_rate) / 1000) + + @staticmethod + def __build_video_filter(width: int, height: int) -> str: + return ( + f"scale='iw*sar:ih',scale={width}:{height}:" + f"force_original_aspect_ratio=decrease,pad={width}:{height}:" + f"(ow-iw)/2:(oh-ih)/2:black,setsar=1" + ) + + @staticmethod + def __build_base_command( + input_path: Path, codec: str, preset: str, target_fps: Optional[float], + ) -> List[str]: + command = [ + 'ffmpeg', '-v', 'error', '-stats', '-hide_banner', '-y', + '-i', str(input_path), + '-c:v', codec, + '-preset', preset, + '-profile:v', FFmpegWrapper.__PROFILE, + '-level', FFmpegWrapper.__LEVEL, + '-pix_fmt', FFmpegWrapper.__PIX_FMT, + ] + if target_fps: + command.extend(['-r', str(target_fps)]) + return command + + @staticmethod + def __build_encoding_params( + video_bitrate: str, minrate: str, maxrate: str, bufsize: str, 
gop_size: int, + ) -> List[str]: + return [ + '-rc', 'vbr_hq', + '-b:v', video_bitrate, + '-minrate', minrate, + '-maxrate', maxrate, + '-bufsize', bufsize, + '-bf', FFmpegWrapper.__BF, + '-b_adapt', FFmpegWrapper.__B_ADAPT, + '-2pass', FFmpegWrapper.__TWO_PASS, + '-rc-lookahead', FFmpegWrapper.__RC_LOOKAHEAD, + '-aq-strength', FFmpegWrapper.__AQ_STRENGTH, + '-g', str(gop_size), + '-spatial-aq', '1', + '-temporal-aq', '1', + '-multipass', 'fullres', + ] + + @staticmethod + def __build_audio_and_output_params( + audio_bitrate: str, vf_filter: str, output_path: Path, + ) -> List[str]: + return [ + '-c:a', 'aac', + '-b:a', audio_bitrate, + '-ac', FFmpegWrapper.__AUDIO_CHANNELS, + '-vf', vf_filter, + '-movflags', '+faststart', + '-f', 'mp4', + str(output_path), + ] + + @staticmethod + def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]: + streams = [s for s in probe_data.get('streams', []) if s.get('codec_type') == codec_type] + return streams[0] if streams else None diff --git a/preprocessor/modules/audio/extraction.py b/preprocessor/modules/audio/extraction.py index 23f99864a..fe0d831b5 100644 --- a/preprocessor/modules/audio/extraction.py +++ b/preprocessor/modules/audio/extraction.py @@ -1,5 +1,6 @@ from pathlib import Path import subprocess +from typing import List from preprocessor.config.step_configs import AudioExtractionConfig from preprocessor.core.artifacts import ( @@ -37,7 +38,7 @@ def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioAr ) context.logger.info(f'Extracting audio for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - command: list[str] = [ + command: List[str] = [ 'ffmpeg', '-y', '-v', 'error', '-i', str(input_data.path), '-vn', From 023720bd4aeaa82f4cb2bf7025b9cf41f72ac687 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 12:43:08 +0100 Subject: [PATCH 10/89] Add interlacing 
detection and refactor scene code Detect interlaced video and apply deinterlace filter during transcoding; refactor scene detection internals. - FFmpegWrapper: added ffmpeg idet-based detect_interlacing() and __parse_idet_output(); made __build_video_filter accept deinterlace flag and compose filters; imported re and Tuple typing. - VideoTranscoderStep: run interlace detection, log results, pass deinterlace flag into FFmpegWrapper, and added pylint disable for long method. - TransNetWrapper: renamed several helpers to private (__get_video_info, __build_scenes_from_predictions, __create_scene_dict, __frame_to_timecode), adjusted detect_scenes to use the new names, moved cleanup earlier, and ensured GPU memory is cleared. - Scene detection module: updated call site to use the renamed private video info method. These changes enable automatic detection of interlaced content and conditional application of a bwdif deinterlacing filter, while tightening internal method visibility and cleaning up resources after model use. 
--- preprocessor/lib/ai/__init__.py | 4 +- preprocessor/lib/ai/clients.py | 100 +++++++++++ preprocessor/lib/ai/models.py | 65 +++++++ .../lib/ai/{llm_provider.py => provider.py} | 165 ++++-------------- preprocessor/lib/media/ffmpeg.py | 75 +++++++- preprocessor/lib/media/scene_detection.py | 61 ++++--- preprocessor/modules/scraping/base_scraper.py | 2 +- preprocessor/modules/video/scene_detection.py | 2 +- preprocessor/modules/video/transcoding.py | 22 ++- 9 files changed, 327 insertions(+), 169 deletions(-) create mode 100644 preprocessor/lib/ai/clients.py create mode 100644 preprocessor/lib/ai/models.py rename preprocessor/lib/ai/{llm_provider.py => provider.py} (53%) diff --git a/preprocessor/lib/ai/__init__.py b/preprocessor/lib/ai/__init__.py index c79473d8f..a07ded715 100644 --- a/preprocessor/lib/ai/__init__.py +++ b/preprocessor/lib/ai/__init__.py @@ -1,8 +1,8 @@ -from preprocessor.lib.ai.llm_provider import ( +from preprocessor.lib.ai.models import ( CharacterInfo, EpisodeInfo, - LLMProvider, SeasonMetadata, ) +from preprocessor.lib.ai.provider import LLMProvider __all__ = ['LLMProvider', 'EpisodeInfo', 'SeasonMetadata', 'CharacterInfo'] diff --git a/preprocessor/lib/ai/clients.py b/preprocessor/lib/ai/clients.py new file mode 100644 index 000000000..24c255bef --- /dev/null +++ b/preprocessor/lib/ai/clients.py @@ -0,0 +1,100 @@ +from abc import ( + ABC, + abstractmethod, +) +from typing import ( + Dict, + List, + Optional, +) + +from openai import OpenAI +from vllm import ( + LLM, + SamplingParams, +) + +from preprocessor.config.config import settings +from preprocessor.lib.ui.console import console + + +class BaseLLMClient(ABC): + @abstractmethod + def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str: + pass + + +class VLLMClient(BaseLLMClient): + __DEFAULT_MODEL_NAME = 'Qwen/Qwen2.5-Coder-7B-Instruct' + + def __init__(self, model_name: Optional[str] = None) -> None: + self._model_name = model_name or 
self.__DEFAULT_MODEL_NAME + self._model: Optional[LLM] = None + self.__load_model() + + def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str: + if self._model is None: + raise RuntimeError('Model not initialized') + + sampling_params = SamplingParams( + temperature=0.7, + top_p=0.8, + top_k=20, + max_tokens=max_tokens, + repetition_penalty=1.05, + ) + outputs = self._model.chat(messages=[messages], sampling_params=sampling_params) + return outputs[0].outputs[0].text.strip() + + def __load_model(self) -> None: + console.print(f'[cyan]Loading LLM: {self._model_name} (vLLM, 128K context)[/cyan]') + try: + self._model = LLM( + model=self._model_name, + trust_remote_code=True, + max_model_len=131072, + gpu_memory_utilization=0.95, + tensor_parallel_size=1, + dtype='bfloat16', + enable_chunked_prefill=True, + max_num_batched_tokens=16384, + enforce_eager=True, + disable_log_stats=True, + ) + console.print('[green]✓ LLM loaded successfully (vLLM)[/green]') + except Exception as e: + console.print(f'[red]Failed to load model: {e}[/red]') + raise + + +class GeminiClient(BaseLLMClient): + __GEMINI_MODEL_NAME = 'gemini-2.5-flash' + + def __init__(self) -> None: + self._client: Optional[OpenAI] = None + self.__init_client() + + def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str: + if self._client is None: + raise RuntimeError('Gemini client not initialized') + + response = self._client.chat.completions.create( + model=self.__GEMINI_MODEL_NAME, + messages=messages, + ) + return response.choices[0].message.content.strip() + + def __init_client(self) -> None: + console.print('[cyan]Initializing Gemini 2.5 Flash via OpenAI SDK...[/cyan]') + try: + api_key = settings.gemini.api_key + if not api_key: + raise ValueError('GEMINI_API_KEY not set in environment') + self._client = OpenAI( + base_url='https://generativelanguage.googleapis.com/v1beta/openai/', + api_key=api_key, + ) + console.print('[green]✓ Gemini 2.5 Flash 
initialized[/green]') + except Exception as e: + console.print(f'[red]Failed to initialize Gemini client: {e}[/red]') + raise diff --git a/preprocessor/lib/ai/models.py b/preprocessor/lib/ai/models.py new file mode 100644 index 000000000..c0a8f5ec8 --- /dev/null +++ b/preprocessor/lib/ai/models.py @@ -0,0 +1,65 @@ +from typing import ( + List, + Optional, +) + +from pydantic import ( + BaseModel, + field_validator, + model_validator, +) + + +class EpisodeInfo(BaseModel): + episode_in_season: int + overall_episode_number: int + title: str + premiere_date: Optional[str] = None + viewership: Optional[str] = None + + @field_validator('viewership', mode='before') + @classmethod + @staticmethod + def _convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: + if v is None: + return None + if isinstance(v, int): + return str(v) + return v + + +class SeasonMetadata(BaseModel): + season_number: int + episodes: List[EpisodeInfo] + + @model_validator(mode='before') + @classmethod + @staticmethod + def _convert_old_format(cls, data: dict) -> dict: + if isinstance(data, dict) and 'episodes' in data: + for idx, episode in enumerate(data['episodes'], start=1): + if isinstance(episode, dict) and 'episode_number' in episode and ('episode_in_season' not in episode): + episode['episode_in_season'] = idx + episode['overall_episode_number'] = episode['episode_number'] + del episode['episode_number'] + return data + + +class AllSeasonsMetadata(BaseModel): + seasons: List[SeasonMetadata] + + +class EpisodeMetadata(BaseModel): + title: str + description: str + summary: str + season: Optional[int] = None + episode_number: Optional[int] = None + + +class CharacterInfo(BaseModel): + name: str + + +class CharactersList(BaseModel): + characters: List[CharacterInfo] diff --git a/preprocessor/lib/ai/llm_provider.py b/preprocessor/lib/ai/provider.py similarity index 53% rename from preprocessor/lib/ai/llm_provider.py rename to preprocessor/lib/ai/provider.py index d9eb9ffdf..945a9d5c8 
100644 --- a/preprocessor/lib/ai/llm_provider.py +++ b/preprocessor/lib/ai/provider.py @@ -7,18 +7,8 @@ Type, ) -from openai import OpenAI -from pydantic import ( - BaseModel, - field_validator, - model_validator, -) -from vllm import ( - LLM, - SamplingParams, -) +from pydantic import BaseModel -from preprocessor.config.config import settings from preprocessor.config.enums import ParserMode from preprocessor.config.prompts import ( extract_all_seasons_system, @@ -32,78 +22,38 @@ merge_episode_data_system, merge_episode_data_user, ) +from preprocessor.lib.ai.clients import ( + BaseLLMClient, + GeminiClient, + VLLMClient, +) +from preprocessor.lib.ai.models import ( + AllSeasonsMetadata, + CharacterInfo, + CharactersList, + EpisodeMetadata, + SeasonMetadata, +) from preprocessor.lib.ui.console import console -class EpisodeInfo(BaseModel): - episode_in_season: int - overall_episode_number: int - title: str - premiere_date: Optional[str] = None - viewership: Optional[str] = None - - @field_validator('viewership', mode='before') - @classmethod - @staticmethod - def convert_viewership_to_str(cls, v): - if v is None: - return None - if isinstance(v, int): - return str(v) - return v - -class SeasonMetadata(BaseModel): - season_number: int - episodes: List[EpisodeInfo] - - @model_validator(mode='before') - @classmethod - @staticmethod - def convert_old_format(cls, data): - if isinstance(data, dict) and 'episodes' in data: - for idx, episode in enumerate(data['episodes'], start=1): - if isinstance(episode, dict) and 'episode_number' in episode and ('episode_in_season' not in episode): - episode['episode_in_season'] = idx - episode['overall_episode_number'] = episode['episode_number'] - del episode['episode_number'] - return data - -class AllSeasonsMetadata(BaseModel): - seasons: List[SeasonMetadata] - -class EpisodeMetadata(BaseModel): - title: str - description: str - summary: str - season: Optional[int] = None - episode_number: Optional[int] = None - -class 
CharacterInfo(BaseModel): - name: str - -class CharactersList(BaseModel): - characters: List[CharacterInfo] - class LLMProvider: - __DEFAULT_MODEL_NAME = 'Qwen/Qwen2.5-Coder-7B-Instruct' - __GEMINI_MODEL_NAME = 'gemini-2.5-flash' - __instance = None - __model = None - __openai_client = None + __instance: Optional['LLMProvider'] = None + __client: Optional[BaseLLMClient] = None - def __new__(cls, model_name: Optional[str]=None, parser_mode: Optional[ParserMode]=None): + def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> 'LLMProvider': if cls.__instance is None: cls.__instance = super().__new__(cls) return cls.__instance - def __init__(self, model_name: Optional[str]=None, parser_mode: Optional[ParserMode]=None): - self.parser_mode = parser_mode or ParserMode.NORMAL - if self.parser_mode == ParserMode.PREMIUM: - if self.__openai_client is None: - self.__init_gemini_client() - elif self.__model is None: - self.model_name = model_name or self.__DEFAULT_MODEL_NAME - self.__load_model() + def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> None: + self._parser_mode = parser_mode or ParserMode.NORMAL + + if self.__client is None: + if self._parser_mode == ParserMode.PREMIUM: + self.__client = GeminiClient() + else: + self.__client = VLLMClient(model_name=model_name) def extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: return self.__process_llm_request( @@ -126,6 +76,7 @@ def merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMet raise ValueError('No metadata to merge') if len(metadata_list) == 1: return metadata_list[0] + combined_text = '\n\n---\n\n'.join([ f'Source {i + 1}:\n' f'Title: {m.title}\n' @@ -135,6 +86,7 @@ def merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMet f'Episode: {m.episode_number}' for i, m in enumerate(metadata_list) ]) + result = self.__process_llm_request( 
system_prompt=merge_episode_data_system.get(), user_prompt=merge_episode_data_user.get().format( @@ -152,6 +104,7 @@ def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[L url = page['url'] markdown = page['markdown'] combined_content += f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n' + result = self.__process_llm_request( system_prompt=extract_all_seasons_system.get(), user_prompt=extract_all_seasons_user.get().format( @@ -173,6 +126,7 @@ def extract_characters( url = page['url'] markdown = page['markdown'] combined_content += f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n' + result = self.__process_llm_request( system_prompt=extract_characters_system.get(), user_prompt=extract_characters_user.get().format( @@ -192,68 +146,21 @@ def __process_llm_request( response_model: Type[BaseModel], error_context: str, ) -> Optional[BaseModel]: + if self.__client is None: + raise RuntimeError('LLM client not initialized') + try: - messages = [{'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': user_prompt}] - if self.parser_mode == ParserMode.PREMIUM: - content = self.__generate_with_gemini(messages) - else: - content = self.__generate(messages) + messages = [ + {'role': 'system', 'content': system_prompt}, + {'role': 'user', 'content': user_prompt}, + ] + content = self.__client.generate(messages) data = self.__extract_json(content) return response_model(**data) except Exception as e: console.print(f'[red]LLM {error_context}: {e}[/red]') return None - def __init_gemini_client(self) -> None: - console.print('[cyan]Initializing Gemini 2.5 Flash via OpenAI SDK...[/cyan]') - try: - api_key = settings.gemini.api_key - if not api_key: - raise ValueError('GEMINI_API_KEY not set in environment') - self.__openai_client = OpenAI( - base_url='https://generativelanguage.googleapis.com/v1beta/openai/', - api_key=api_key, - ) - console.print('[green]✓ Gemini 2.5 Flash initialized[/green]') - except Exception as e: - 
console.print(f'[red]Failed to initialize Gemini client: {e}[/red]') - raise e - - def __load_model(self) -> None: - console.print(f'[cyan]Loading LLM: {self.model_name} (vLLM, 128K context)[/cyan]') - try: - self.__model = LLM( - model=self.model_name, - trust_remote_code=True, - max_model_len=131072, - gpu_memory_utilization=0.95, - tensor_parallel_size=1, - dtype='bfloat16', - enable_chunked_prefill=True, - max_num_batched_tokens=16384, - enforce_eager=True, - disable_log_stats=True, - ) - console.print('[green]✓ LLM loaded successfully (vLLM)[/green]') - except Exception as e: - console.print(f'[red]Failed to load model: {e}[/red]') - raise e - - def __generate(self, messages: List[Dict], max_tokens: int=32768) -> str: - sampling_params = SamplingParams( - temperature=0.7, - top_p=0.8, - top_k=20, - max_tokens=max_tokens, - repetition_penalty=1.05, - ) - outputs = self.__model.chat(messages=[messages], sampling_params=sampling_params) - return outputs[0].outputs[0].text.strip() - - def __generate_with_gemini(self, messages: List[Dict]) -> str: - response = self.__openai_client.chat.completions.create(model=self.__GEMINI_MODEL_NAME, messages=messages) - return response.choices[0].message.content.strip() - @staticmethod def __extract_json(content: str) -> Dict[str, Any]: try: diff --git a/preprocessor/lib/media/ffmpeg.py b/preprocessor/lib/media/ffmpeg.py index 5031d7bea..79bc23f3d 100644 --- a/preprocessor/lib/media/ffmpeg.py +++ b/preprocessor/lib/media/ffmpeg.py @@ -1,11 +1,13 @@ import json from pathlib import Path +import re import subprocess from typing import ( Any, Dict, List, Optional, + Tuple, ) @@ -34,9 +36,10 @@ def transcode( # pylint: disable=too-many-arguments audio_bitrate: str, gop_size: int, target_fps: Optional[float] = None, + deinterlace: bool = False, ) -> None: width, height = [int(x) for x in resolution.split(':')] - vf_filter = FFmpegWrapper.__build_video_filter(width, height) + vf_filter = FFmpegWrapper.__build_video_filter(width, 
height, deinterlace) command = FFmpegWrapper.__build_base_command(input_path, codec, preset, target_fps) command.extend( FFmpegWrapper.__build_encoding_params( @@ -88,13 +91,20 @@ def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]: return int(int(bit_rate) / 1000) @staticmethod - def __build_video_filter(width: int, height: int) -> str: - return ( + def __build_video_filter(width: int, height: int, deinterlace: bool = False) -> str: + filters = [] + + if deinterlace: + filters.append('bwdif=mode=0') + + filters.append( f"scale='iw*sar:ih',scale={width}:{height}:" f"force_original_aspect_ratio=decrease,pad={width}:{height}:" - f"(ow-iw)/2:(oh-ih)/2:black,setsar=1" + f"(ow-iw)/2:(oh-ih)/2:black,setsar=1", ) + return ','.join(filters) + @staticmethod def __build_base_command( input_path: Path, codec: str, preset: str, target_fps: Optional[float], @@ -151,3 +161,60 @@ def __build_audio_and_output_params( def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]: streams = [s for s in probe_data.get('streams', []) if s.get('codec_type') == codec_type] return streams[0] if streams else None + + @staticmethod + def detect_interlacing( + video_path: Path, + analysis_time: int = 30, + threshold: float = 0.15, + ) -> Tuple[bool, Optional[Dict[str, Any]]]: + cmd = [ + 'ffmpeg', + '-hide_banner', + '-nostats', + '-i', str(video_path), + '-t', str(analysis_time), + '-vf', 'idet', + '-f', 'null', + '-', + ] + + result = subprocess.run( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + encoding='utf-8', + errors='ignore', + check=False, + ) + + stats = FFmpegWrapper.__parse_idet_output(result.stderr) + if stats is None: + return (False, None) + + total_interlaced = stats['tff'] + stats['bff'] + total_frames = total_interlaced + stats['progressive'] + + if total_frames == 0: + return (False, None) + + ratio = total_interlaced / total_frames + stats['ratio'] = ratio + + return (ratio > threshold, 
stats) + + @staticmethod + def __parse_idet_output(stderr: str) -> Optional[Dict[str, int]]: + tff_match = re.search(r'TFF:\s*(\d+)', stderr) + bff_match = re.search(r'BFF:\s*(\d+)', stderr) + prog_match = re.search(r'Progressive:\s*(\d+)', stderr) + + if not (tff_match and bff_match and prog_match): + return None + + return { + 'tff': int(tff_match.group(1)), + 'bff': int(bff_match.group(1)), + 'progressive': int(prog_match.group(1)), + } diff --git a/preprocessor/lib/media/scene_detection.py b/preprocessor/lib/media/scene_detection.py index 2fd5777de..f39a48ce7 100644 --- a/preprocessor/lib/media/scene_detection.py +++ b/preprocessor/lib/media/scene_detection.py @@ -26,18 +26,18 @@ def load_model(self) -> None: def detect_scenes( self, video_path: Path, - threshold: float=0.5, - min_scene_len: int=10, + threshold: float = 0.5, + min_scene_len: int = 10, ) -> List[Dict[str, Any]]: if self.model is None: raise RuntimeError('Model not loaded. Call load_model() first.') - video_info = self.get_video_info(video_path) + video_info = self.__get_video_info(video_path) if not video_info: raise RuntimeError(f'Failed to get video info for {video_path}') try: _, single_frame_predictions, _ = self.model.predict_video(str(video_path)) scene_changes = np.where(single_frame_predictions > threshold)[0] - return self._build_scenes_from_predictions( + return self.__build_scenes_from_predictions( scene_changes, video_info, min_scene_len, @@ -45,48 +45,36 @@ def detect_scenes( except (RuntimeError, ValueError, OSError) as e: raise RuntimeError(f'TransNetV2 detection failed: {e}') from e - def _build_scenes_from_predictions( + def cleanup(self) -> None: + if self.model is not None: + del self.model + self.model = None + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def __build_scenes_from_predictions( self, scene_changes: np.ndarray, video_info: Dict[str, Any], min_scene_len: int, ) -> List[Dict[str, Any]]: - """Build scene list from frame predictions.""" 
scenes = [] fps = video_info['fps'] prev_frame = 0 for frame_num in scene_changes: if frame_num - prev_frame < min_scene_len: continue - scene = self._create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps) + scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps) scenes.append(scene) prev_frame = frame_num total_frames = video_info['total_frames'] if total_frames - prev_frame > min_scene_len: - scene = self._create_scene_dict(len(scenes) + 1, prev_frame, total_frames, fps) + scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, total_frames, fps) scenes.append(scene) return scenes - @staticmethod - def get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: - try: - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - fps = vr.get_avg_fps() - total_frames = len(vr) - duration = total_frames / fps if fps > 0 else 0 - return {'fps': fps, 'duration': duration, 'total_frames': total_frames} - except (RuntimeError, ValueError, OSError): - return None - - def cleanup(self) -> None: - if self.model is not None: - del self.model - self.model = None - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - - def _create_scene_dict( + def __create_scene_dict( self, scene_number: int, start_frame: int, @@ -98,19 +86,30 @@ def _create_scene_dict( 'start': { 'frame': int(start_frame), 'seconds': float(start_frame / fps), - 'timecode': self._frame_to_timecode(start_frame, fps), + 'timecode': self.__frame_to_timecode(start_frame, fps), }, 'end': { 'frame': int(end_frame), 'seconds': float(end_frame / fps), - 'timecode': self._frame_to_timecode(end_frame, fps), + 'timecode': self.__frame_to_timecode(end_frame, fps), }, 'duration': float((end_frame - start_frame) / fps), 'frame_count': int(end_frame - start_frame), } @staticmethod - def _frame_to_timecode(frame: int, fps: float) -> str: + def __get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: + try: + vr = decord.VideoReader(str(video_file), 
ctx=decord.cpu(0)) + fps = vr.get_avg_fps() + total_frames = len(vr) + duration = total_frames / fps if fps > 0 else 0 + return {'fps': fps, 'duration': duration, 'total_frames': total_frames} + except (RuntimeError, ValueError, OSError): + return None + + @staticmethod + def __frame_to_timecode(frame: int, fps: float) -> str: seconds = frame / fps hours = int(seconds // 3600) minutes = int(seconds % 3600 // 60) diff --git a/preprocessor/modules/scraping/base_scraper.py b/preprocessor/modules/scraping/base_scraper.py index ba5384b24..b0cb106a6 100644 --- a/preprocessor/modules/scraping/base_scraper.py +++ b/preprocessor/modules/scraping/base_scraper.py @@ -15,7 +15,7 @@ ScraperMethod, ) from preprocessor.core.base_processor import BaseProcessor -from preprocessor.lib.ai.llm_provider import LLMProvider +from preprocessor.lib.ai import LLMProvider from preprocessor.lib.scraping.clipboard import ScraperClipboard from preprocessor.lib.scraping.crawl4ai import ScraperCrawl4AI from preprocessor.lib.ui.console import console diff --git a/preprocessor/modules/video/scene_detection.py b/preprocessor/modules/video/scene_detection.py index 22b691021..82368211b 100644 --- a/preprocessor/modules/video/scene_detection.py +++ b/preprocessor/modules/video/scene_detection.py @@ -50,7 +50,7 @@ def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> Sce threshold=self.config.threshold, min_scene_len=self.config.min_scene_len, ) - video_info = self.transnet.get_video_info(input_data.path) + video_info = self.transnet.__get_video_info(input_data.path) output_data = { 'total_scenes': len(scenes), 'video_info': video_info, diff --git a/preprocessor/modules/video/transcoding.py b/preprocessor/modules/video/transcoding.py index aea0a256e..87106bf54 100644 --- a/preprocessor/modules/video/transcoding.py +++ b/preprocessor/modules/video/transcoding.py @@ -14,7 +14,7 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeCo def name(self) -> str: 
return 'video_transcode' - def execute( # pylint: disable=too-many-locals + def execute( # pylint: disable=too-many-locals,too-many-statements self, input_data: SourceVideo, context: ExecutionContext, ) -> TranscodedVideo: output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' @@ -71,6 +71,25 @@ def execute( # pylint: disable=too-many-locals f'Adjusted to {audio_bitrate} kbps to avoid quality loss.' ) context.logger.info(msg) + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) + if has_interlacing and idet_stats: + context.logger.info( + f"Interlacing detected for {input_data.episode_id} " + f"({idet_stats['ratio']*100:.1f}% interlaced frames: " + f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}) - " + f"applying bwdif deinterlacing filter", + ) + elif idet_stats: + context.logger.info( + f"Progressive content detected for {input_data.episode_id} " + f"({idet_stats['progressive']}/{idet_stats['progressive'] + idet_stats['tff'] + idet_stats['bff']} frames) - " + f"no deinterlacing needed", + ) + else: + context.logger.warning( + f"Could not detect interlacing for {input_data.episode_id} - " + f"proceeding without deinterlacing", + ) context.logger.info(f'Transcoding {input_data.episode_id}') temp_path = output_path.with_suffix('.mp4.tmp') context.mark_step_started(self.name, input_data.episode_id, [str(temp_path)]) @@ -88,6 +107,7 @@ def execute( # pylint: disable=too-many-locals audio_bitrate=f'{audio_bitrate}k', gop_size=int(target_fps * self.config.gop_size), target_fps=target_fps if target_fps < input_fps else None, + deinterlace=has_interlacing, ) temp_path.replace(output_path) except Exception: From 5ea6a7c10b86c87c3472b69f3279710cb994f15e Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 13:32:05 +0100 Subject: [PATCH 11/89] Add force_deinterlace and improve detection Introduce a force_deinterlace option and make interlace detection more 
robust. Wire force_deinterlace through config (series_config, step_configs), pipeline_factory, and defaults/kiepscy series configs (defaults=false, kiepscy=true). FFmpegWrapper.detect_interlacing: make analysis_time optional, conditionally include -t, add -an, handle non-zero ffmpeg exit, and parse idet output via a multiline regex to extract TFF/BFF/Progressive. VideoTranscoderStep: when force_deinterlace is enabled skip detection and force bwdif; otherwise run detection, log failures explicitly and proceed without deinterlacing if idet fails; use a deinterlace variable and ensure temp file cleanup by broadening the except to catch BaseException. Also translate several pipeline error messages from Polish to English for clarity. --- preprocessor/app/pipeline.py | 32 +++++++-------- preprocessor/app/pipeline_factory.py | 1 + preprocessor/config/series_config.py | 2 + preprocessor/config/step_configs.py | 1 + preprocessor/lib/media/ffmpeg.py | 34 ++++++++++------ preprocessor/modules/video/transcoding.py | 47 ++++++++++++++--------- preprocessor/series_configs/defaults.json | 1 + preprocessor/series_configs/kiepscy.json | 5 +++ 8 files changed, 77 insertions(+), 46 deletions(-) diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index 88735ad24..cd3ac4aa3 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -19,9 +19,9 @@ def __init__(self, name: str) -> None: def register(self, step: StepBuilder) -> None: if step.id in self._steps: raise ValueError( - f"❌ DUPLIKAT KROKU:\n" - f" Krok '{step.id}' jest już zarejestrowany w pipeline!\n" - f" Sprawdź build_pipeline() w pipeline_factory.py", + f"❌ DUPLICATE STEP:\n" + f" Step '{step.id}' is already registered in the pipeline!\n" + f" Check build_pipeline() in pipeline_factory.py", ) self._steps[step.id] = step @@ -52,15 +52,15 @@ def _raise_missing_dependency_error( ) -> None: raise ValueError( f"\n{'=' * 80}\n" - f"❌ BŁĄD ZALEŻNOŚCI W PIPELINE\n" + f"❌ PIPELINE DEPENDENCY 
ERROR\n" f"{'=' * 80}\n\n" - f"Krok: '{step_id}'\n" - f"Potrzebuje: '{missing_dep_id}'\n" - f"Problem: Krok '{missing_dep_id}' nie jest zarejestrowany!\n\n" - f"Rozwiązanie:\n" - f" 1. Sprawdź build_pipeline() w preprocessor/app/pipeline_factory.py\n" - f" 2. Upewnij się że '{missing_dep_id}' jest dodany przez pipeline.register()\n" - f" 3. Lub usuń '{missing_dep_id}' z needs=[...] w definicji '{step_id}'\n" + f"Step: '{step_id}'\n" + f"Needs: '{missing_dep_id}'\n" + f"Issue: Step '{missing_dep_id}' is not registered!\n\n" + f"Solution:\n" + f" 1. Check build_pipeline() in preprocessor/app/pipeline_factory.py\n" + f" 2. Ensure '{missing_dep_id}' is added via pipeline.register()\n" + f" 3. Or remove '{missing_dep_id}' from needs=[...] in definition of '{step_id}'\n" f"\n{'=' * 80}\n", ) @@ -70,13 +70,13 @@ def _raise_cycle_error(self) -> None: raise ValueError( f"\n{'=' * 80}\n" - f"❌ CYKL W ZALEŻNOŚCIACH PIPELINE\n" + f"❌ PIPELINE DEPENDENCY CYCLE DETECTED\n" f"{'=' * 80}\n\n" - f"Wykryto cykliczną zależność:\n" + f"Cyclic dependency detected:\n" f" {cycle_path}\n\n" - f"Kroki w cyklu: {', '.join(cycles[0])}\n\n" - f"Pipeline musi być DAG (Directed Acyclic Graph).\n" - f"Usuń jedną z zależności aby przerwać cykl.\n" + f"Steps in cycle: {', '.join(cycles[0])}\n\n" + f"Pipeline must be a DAG (Directed Acyclic Graph).\n" + f"Remove one of the dependencies to break the cycle.\n" f"\n{'=' * 80}\n", ) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 75f9896f7..dde2474d1 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -98,6 +98,7 @@ def build_pipeline(series_name: str) -> Pipeline: # pylint: disable=too-many-lo maxrate_mbps=series_config.processing.transcode.maxrate_mbps, bufsize_mbps=series_config.processing.transcode.bufsize_mbps, gop_size=series_config.processing.transcode.gop_size, + force_deinterlace=series_config.processing.transcode.force_deinterlace, ), ) diff --git 
a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index cffd694af..65acd13d5 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -62,6 +62,7 @@ class TranscodeProcessingConfig: maxrate_mbps: float bufsize_mbps: float gop_size: float + force_deinterlace: bool @dataclass @@ -142,6 +143,7 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': maxrate_mbps=data['processing']['transcode']['maxrate_mbps'], bufsize_mbps=data['processing']['transcode']['bufsize_mbps'], gop_size=data['processing']['transcode']['gop_size'], + force_deinterlace=data['processing']['transcode']['force_deinterlace'], ), scene_detection=SceneDetectionProcessingConfig( threshold=data['processing']['scene_detection']['threshold'], diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 7be90dd68..68341cf7b 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -21,6 +21,7 @@ class TranscodeConfig(BaseModel): bufsize_mbps: float = Field(gt=0) audio_bitrate_kbps: int = 128 gop_size: float = Field(gt=0) + force_deinterlace: bool = False class Config: arbitrary_types_allowed = True diff --git a/preprocessor/lib/media/ffmpeg.py b/preprocessor/lib/media/ffmpeg.py index 79bc23f3d..48008e170 100644 --- a/preprocessor/lib/media/ffmpeg.py +++ b/preprocessor/lib/media/ffmpeg.py @@ -165,19 +165,23 @@ def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optiona @staticmethod def detect_interlacing( video_path: Path, - analysis_time: int = 30, + analysis_time: Optional[int] = None, threshold: float = 0.15, ) -> Tuple[bool, Optional[Dict[str, Any]]]: cmd = [ 'ffmpeg', - '-hide_banner', - '-nostats', '-i', str(video_path), - '-t', str(analysis_time), + ] + + if analysis_time: + cmd.extend(['-t', str(analysis_time)]) + + cmd.extend([ '-vf', 'idet', + '-an', '-f', 'null', '-', - ] + ]) result = subprocess.run( cmd, @@ -189,6 +193,9 @@ def 
detect_interlacing( check=False, ) + if result.returncode != 0: + return (False, None) + stats = FFmpegWrapper.__parse_idet_output(result.stderr) if stats is None: return (False, None) @@ -206,15 +213,18 @@ def detect_interlacing( @staticmethod def __parse_idet_output(stderr: str) -> Optional[Dict[str, int]]: - tff_match = re.search(r'TFF:\s*(\d+)', stderr) - bff_match = re.search(r'BFF:\s*(\d+)', stderr) - prog_match = re.search(r'Progressive:\s*(\d+)', stderr) + matches = re.findall( + r'Multi frame detection:\s+TFF:\s*(\d+)\s+BFF:\s*(\d+)\s+Progressive:\s*(\d+)', + stderr, + ) - if not (tff_match and bff_match and prog_match): + if not matches: return None + tff, bff, progressive = matches[-1] + return { - 'tff': int(tff_match.group(1)), - 'bff': int(bff_match.group(1)), - 'progressive': int(prog_match.group(1)), + 'tff': int(tff), + 'bff': int(bff), + 'progressive': int(progressive), } diff --git a/preprocessor/modules/video/transcoding.py b/preprocessor/modules/video/transcoding.py index 87106bf54..f456f53e3 100644 --- a/preprocessor/modules/video/transcoding.py +++ b/preprocessor/modules/video/transcoding.py @@ -71,25 +71,36 @@ def execute( # pylint: disable=too-many-locals,too-many-statements f'Adjusted to {audio_bitrate} kbps to avoid quality loss.' 
) context.logger.info(msg) - has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) - if has_interlacing and idet_stats: + if self.config.force_deinterlace: context.logger.info( - f"Interlacing detected for {input_data.episode_id} " - f"({idet_stats['ratio']*100:.1f}% interlaced frames: " - f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}) - " - f"applying bwdif deinterlacing filter", - ) - elif idet_stats: - context.logger.info( - f"Progressive content detected for {input_data.episode_id} " - f"({idet_stats['progressive']}/{idet_stats['progressive'] + idet_stats['tff'] + idet_stats['bff']} frames) - " - f"no deinterlacing needed", + f"Force deinterlacing enabled for {input_data.episode_id} - " + f"skipping interlace detection and applying bwdif filter unconditionally", ) + deinterlace = True else: - context.logger.warning( - f"Could not detect interlacing for {input_data.episode_id} - " - f"proceeding without deinterlacing", - ) + context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) + if has_interlacing and idet_stats: + context.logger.info( + f"Interlacing detected for {input_data.episode_id} " + f"({idet_stats['ratio']*100:.1f}% interlaced frames: " + f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}) - " + f"applying bwdif deinterlacing filter", + ) + elif idet_stats: + context.logger.info( + f"Progressive content detected for {input_data.episode_id} " + f"({idet_stats['progressive']}/{idet_stats['progressive'] + idet_stats['tff'] + idet_stats['bff']} frames) - " + f"no deinterlacing needed", + ) + else: + context.logger.error( + f"Failed to detect interlacing for {input_data.episode_id} - " + f"idet filter did not return valid statistics. " + f"This may indicate an ffmpeg error or incompatible video format. 
" + f"Proceeding without deinterlacing.", + ) + deinterlace = has_interlacing context.logger.info(f'Transcoding {input_data.episode_id}') temp_path = output_path.with_suffix('.mp4.tmp') context.mark_step_started(self.name, input_data.episode_id, [str(temp_path)]) @@ -107,10 +118,10 @@ def execute( # pylint: disable=too-many-locals,too-many-statements audio_bitrate=f'{audio_bitrate}k', gop_size=int(target_fps * self.config.gop_size), target_fps=target_fps if target_fps < input_fps else None, - deinterlace=has_interlacing, + deinterlace=deinterlace, ) temp_path.replace(output_path) - except Exception: + except BaseException: if temp_path.exists(): temp_path.unlink() raise diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index 04601e0f9..fb21e6f8e 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -20,6 +20,7 @@ "transcode": { "bufsize_mbps": 5.0, "codec": "h264_nvenc", + "force_deinterlace": false, "gop_size": 2.0, "maxrate_mbps": 3.5, "minrate_mbps": 1.5, diff --git a/preprocessor/series_configs/kiepscy.json b/preprocessor/series_configs/kiepscy.json index ad8ae8601..1d7d5ee4f 100644 --- a/preprocessor/series_configs/kiepscy.json +++ b/preprocessor/series_configs/kiepscy.json @@ -6,6 +6,11 @@ "index_name": "kiepscy_clips" } }, + "processing": { + "transcode": { + "force_deinterlace": true + } + }, "scraping": { "character_references": { "search_engine": "google" From 6dba0a5a40229fc2c959362f1ebea0fc4177224e Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 14:04:00 +0100 Subject: [PATCH 12/89] Refactor pipeline and add search CLI Refactor core pipeline API and CLI, add video discovery and search features. - Rename Pipeline -> PipelineDefinition and Pipeline runner -> PipelineExecutor; update pipeline API (validate(logger: Optional), get_all_steps, execute_step(s)). 
- Extract video discovery logic to preprocessor/app/video_discovery.py and use it in the runner. - Update pipeline_factory to construct PipelineDefinition and use get_all_steps. - Replace inline video/path discovery with PathResolver usage in CLI; add new core path_resolver/path_service modules (added files). - Add a comprehensive async Elasticsearch-based `search` CLI command with multiple search modes (text, semantic, image, hash, character, emotion, object, episode name, stats) and associated clients; compute perceptual hashes when needed. - Introduce SkipListBuilder for constructing skip lists and integrate into run-all flow. - Clean up helpers to use PathResolver and simplify context setup. - Large documentation updates: expand README and SEARCH_GUIDE with new config format, pipeline steps, commands, examples, state management and API key instructions. These changes reorganize execution flow, improve modularity, and add a full-featured search CLI and better configuration/state handling. 
--- preprocessor/README.md | 545 ++++++++++++------ preprocessor/SEARCH_GUIDE.md | 62 +- preprocessor/app/pipeline.py | 22 +- preprocessor/app/pipeline_builder.py | 50 +- preprocessor/app/pipeline_factory.py | 8 +- preprocessor/app/video_discovery.py | 19 + preprocessor/cli/cli_main.py | 282 +++++++-- preprocessor/cli/helpers.py | 129 +++-- preprocessor/cli/skip_list_builder.py | 21 + preprocessor/core/path_manager.py | 17 +- preprocessor/core/path_resolver.py | 17 + preprocessor/core/path_service.py | 36 ++ .../search/clients/elasticsearch_queries.py | 76 ++- 13 files changed, 919 insertions(+), 365 deletions(-) create mode 100644 preprocessor/app/video_discovery.py create mode 100644 preprocessor/cli/skip_list_builder.py create mode 100644 preprocessor/core/path_resolver.py create mode 100644 preprocessor/core/path_service.py diff --git a/preprocessor/README.md b/preprocessor/README.md index 82d2ab8c3..ba2944865 100644 --- a/preprocessor/README.md +++ b/preprocessor/README.md @@ -10,134 +10,190 @@ Docker pipeline do przetwarzania wideo z GPU: transkodowanie, transkrypcja, dete ```bash cd preprocessor -mkdir -p input_data/videos output_data -cp /twoje/wideo/*.mp4 input_data/videos/ +mkdir -p input_data output_data docker compose build -# Pełny pipeline z scrapingiem -./run-preprocessor.sh run-all /input_data/ranczo \ - --scrape-urls https://example.com/wiki/Seria \ - --character-urls https://example.com/wiki/Postacie \ - --series-name ranczo - -# Z gotowymi metadanymi -./run-preprocessor.sh run-all /input_data/kiepscy \ - --episodes-info-json /input_data/kiepscy_episodes.json \ - --series-name kiepscy - -# Pomiń transkodowanie i transkrypcję (użyj istniejących) -./run-preprocessor.sh run-all /input_data/videos \ - --episodes-info-json /input_data/episodes.json \ - --series-name nazwa_serii \ - --skip-transcode \ - --skip-transcribe - -# Tryb premium (Gemini + ElevenLabs + Google Images) -./run-preprocessor.sh run-all /input_data/videos \ - --series-name 
nazwa_serii \ - --parser-mode premium \ - --transcription-mode premium \ - --search-mode premium +# Podstawowe użycie - pełny pipeline +./run-preprocessor.sh run-all --series ranczo + +# Z pomijaniem konkretnych kroków +./run-preprocessor.sh run-all --series kiepscy --skip transcode --skip transcribe + +# Wymuszenie ponownego przetworzenia (ignoruje cache) +./run-preprocessor.sh run-all --series ranczo --force-rerun + +# Pojedynczy krok +./run-preprocessor.sh transcode --series ranczo +./run-preprocessor.sh detect-scenes --series ranczo + +# Search +./run-preprocessor.sh search --series ranczo --text "Lucy Wilska" +./run-preprocessor.sh search --series kiepscy --stats ``` +**Konfiguracja:** Wszystkie parametry (URLs do scrapingu, tryby transkrypcji, bitrate, etc.) są w plikach `series_configs/*.json` + --- -## Pipeline (13 kroków) +## Konfiguracja per-seria +Pipeline używa plików JSON w `series_configs/` do konfiguracji każdego serialu: + +**Struktura:** ``` -SCRAPING PROCESSING INDEXING -─────────────────────────────────────────────────────────────────────── -[0a] episodes ─┬→ [1] transcode → [2] transcribe → [3] separate sounds -[0b] characters │ [4] analyze text -[0c] download │ [5] detect scenes → [6] export frames -[0d] process ─┘ [7] text embeddings - [8] frame processing (8a-8f) - [9] elastic docs → [10] archives → [11] index → [12] validate +series_configs/ +├── defaults.json # Domyślne ustawienia dla wszystkich seriali +├── ranczo.json # Nadpisuje defaults tylko dla Ranczo +└── kiepscy.json # Nadpisuje defaults tylko dla Kiepskich ``` ---- - -## Flagi Skip +**Przykład `kiepscy.json`:** +```json +{ + "display_name": "Świat według Kiepskich", + "series_name": "kiepscy", + "pipeline_mode": "full", + "indexing": { + "elasticsearch": { + "index_name": "kiepscy_clips" + } + }, + "processing": { + "transcode": { + "force_deinterlace": true, + "video_bitrate_mbps": 2.5 + }, + "transcription": { + "mode": "whisper", + "model": "large-v3-turbo" + } + }, + 
"scraping": { + "episodes": { + "parser_mode": "premium", + "urls": ["https://pl.wikipedia.org/wiki/Lista_odcinków..."] + }, + "characters": { + "parser_mode": "premium", + "urls": ["https://pl.wikipedia.org/wiki/Lista_postaci..."] + }, + "character_references": { + "search_engine": "google" + } + }, + "skip_steps": [] +} +``` -| Flaga | Krok | -|-------|------| -| `--skip-transcode` | 1: Transkodowanie | -| `--skip-transcribe` | 2-3: Transkrypcja + separacja | -| `--skip-text-analysis` | 4: Analiza tekstu | -| `--skip-scenes` | 5: Detekcja scen | -| `--skip-frame-export` | 6: Eksport klatek | -| `--skip-embeddings` | 7: Text embeddings | -| `--skip-character-reference-processing` | 0d: Przetwarzanie referencji postaci | -| `--skip-elastic-documents` | 9: Dokumenty ES | -| `--skip-archives` | 10: Archiwizacja ZIP | -| `--skip-index` | 11: Indeksowanie | -| `--skip-validation` | 12: Walidacja | +**Tryby pipeline:** +- `"pipeline_mode": "full"` - uruchamia wszystkie kroki +- `"pipeline_mode": "selective"` - pomija kroki z `skip_steps` automatycznie -
-Flagi frame processing (8a-8f) +**Dostępne parametry:** Zobacz `defaults.json` dla pełnej listy opcji konfiguracyjnych. -| Flaga | Krok | -|-------|------| -| `--skip-image-hashing` | 8a: Image hashing | -| `--skip-video-embeddings` | 8b: Video embeddings | -| `--skip-character-detection` | 8c: Character detection | -| `--skip-emotion-detection` | 8d: Emotion detection | -| `--skip-face-clustering` | 8e: Face clustering | -| `--skip-object-detection` | 8f: Object detection | +--- -**Uwaga:** Wizualizacje są domyślnie wyłączone. Użyj `--debug-visualizations` aby je włączyć. +## Pipeline (19 kroków) -
+``` +SCRAPING PROCESSING INDEXING +───────────────────────────────────────────────────────────────────────────── +[1] scrape_episodes ──┬─→ [4] transcode ─→ [5] transcribe ─→ [6] separate_sounds +[2] scrape_characters │ [7] analyze_text +[3] process_references─┘ [8] detect_scenes ─→ [9] export_frames + [10] text_embeddings + [11] video_embeddings + [12] image_hashing + [13] detect_characters + [14] detect_emotions + [15] cluster_faces + [16] detect_objects + [17] generate_elastic_docs ─→ [18] generate_archives ─→ [19] index_to_elasticsearch +``` -**Premium modes:** `--parser-mode premium` (Gemini 2.5 Flash) • `--transcription-mode premium` (ElevenLabs) • `--search-mode premium` (Google Images) +**Kroki są automatycznie wykonywane w poprawnej kolejności** - pipeline rozwiązuje zależności i tworzy plan wykonania. --- -## Główne komendy +## Dostępne komendy ```bash -# Pełny pipeline -./run-preprocessor.sh run-all /input_data/videos --series-name nazwa_serii [OPTIONS] - -# Pojedyncze kroki -./run-preprocessor.sh scrape-episodes --urls URL --output-file /input_data/episodes.json -./run-preprocessor.sh transcode /input_data/videos [--episodes-info-json FILE] [--resolution 720p] -./run-preprocessor.sh transcribe /input_data/videos --name series --episodes-info-json FILE -./run-preprocessor.sh transcribe-elevenlabs /input_data/videos --name series --episodes-info-json FILE -./run-preprocessor.sh separate-sounds --transcription-jsons /app/output_data/transcriptions -./run-preprocessor.sh analyze-text --season S10 --language pl -./run-preprocessor.sh detect-scenes /input_data/videos [--threshold 0.5] -./run-preprocessor.sh export-frames /input_data/videos -./run-preprocessor.sh process-character-references --name series -./run-preprocessor.sh image-hashing --frames-dir /app/output_data/exported_frames -./run-preprocessor.sh generate-embeddings --transcription-jsons /app/output_data/transcriptions -./run-preprocessor.sh generate-elastic-documents --transcription-jsons 
/app/output_data/transcriptions -./run-preprocessor.sh generate-archives --series-name nazwa_serii -./run-preprocessor.sh index --name nazwa_serii -./run-preprocessor.sh validate --season S01 --series-name nazwa_serii - -# Narzędzia -./run-preprocessor.sh search --text "query" -./run-preprocessor.sh search --text-semantic "query" -./run-preprocessor.sh search --image /path/to/image.jpg -./run-preprocessor.sh search --character "Nazwa" -./run-preprocessor.sh search --emotion "happiness" -./run-preprocessor.sh search --stats -./run-preprocessor.sh fix-unicode --transcription-jsons DIR --episodes-info-json FILE --name series -./run-preprocessor.sh import-transcriptions --input-dir DIR --episodes-info-json FILE --name series +# Pipeline +./run-preprocessor.sh run-all --series NAZWA [--skip STEP_ID ...] [--force-rerun] + +# Scraping +./run-preprocessor.sh scrape-episodes --series NAZWA +./run-preprocessor.sh scrape-characters --series NAZWA +./run-preprocessor.sh process-references --series NAZWA + +# Video processing +./run-preprocessor.sh transcode --series NAZWA +./run-preprocessor.sh detect-scenes --series NAZWA +./run-preprocessor.sh export-frames --series NAZWA + +# Audio/Text processing +./run-preprocessor.sh transcribe --series NAZWA +./run-preprocessor.sh separate-sounds --series NAZWA +./run-preprocessor.sh analyze-text --series NAZWA + +# Embeddings +./run-preprocessor.sh text-embeddings --series NAZWA +./run-preprocessor.sh video-embeddings --series NAZWA + +# Visual analysis +./run-preprocessor.sh image-hashing --series NAZWA +./run-preprocessor.sh detect-characters --series NAZWA +./run-preprocessor.sh detect-emotions --series NAZWA +./run-preprocessor.sh cluster-faces --series NAZWA +./run-preprocessor.sh detect-objects --series NAZWA + +# Indexing +./run-preprocessor.sh generate-elastic-docs --series NAZWA +./run-preprocessor.sh generate-archives --series NAZWA +./run-preprocessor.sh index-to-elasticsearch --series NAZWA + +# Search (wymaga uruchomionego 
Elasticsearch) +./run-preprocessor.sh search --series NAZWA --text "query" +./run-preprocessor.sh search --series NAZWA --text-semantic "query" +./run-preprocessor.sh search --series NAZWA --image /input_data/screenshot.jpg +./run-preprocessor.sh search --series NAZWA --character "Postać" +./run-preprocessor.sh search --series NAZWA --emotion "happiness" +./run-preprocessor.sh search --series NAZWA --object "person:5+" +./run-preprocessor.sh search --series NAZWA --stats +./run-preprocessor.sh search --series NAZWA --list-characters + +# Utilities +./run-preprocessor.sh visualize --series NAZWA # Wizualizacja grafu zależności +./run-preprocessor.sh bash # Shell w kontenerze +``` + +**Parametry:** +- `--series NAZWA` - **WYMAGANY** dla wszystkich komend (np. `ranczo`, `kiepscy`) +- `--force-rerun` - Ignoruje cache i przetwarza ponownie +- `--skip STEP_ID` - Pomija konkretny krok (można użyć wielokrotnie) + +**Step IDs do --skip:** +``` +scrape_episodes, scrape_characters, process_references, +transcode, transcribe, separate_sounds, analyze_text, +detect_scenes, export_frames, text_embeddings, video_embeddings, +image_hashing, detect_characters, detect_emotions, cluster_faces, detect_objects, +generate_elastic_docs, generate_archives, index_to_elasticsearch ``` --- ## Multi-Series Support -Pipeline wspiera przetwarzanie wielu seriali jednocześnie. Każdy serial ma dedykowany folder: +Pipeline wspiera przetwarzanie wielu seriali jednocześnie. Każdy serial ma dedykowany folder i konfigurację. **Input struktura:** ``` input_data/ ├── ranczo/ │ ├── S01/ +│ │ ├── S01E01.mp4 +│ │ └── S01E02.mp4 │ ├── S02/ │ └── S03/ └── kiepscy/ @@ -151,19 +207,30 @@ output_data/ ├── ranczo/ │ ├── transcoded_videos/ │ ├── transcriptions/ -│ ├── ranczo_episodes.json -│ ├── ranczo_characters.json +│ ├── scene_timestamps/ +│ ├── exported_frames/ +│ ├── embeddings/ +│ ├── elastic_documents/ +│ ├── .preprocessing_state_ranczo.json │ └── ... 
└── kiepscy/ ├── transcoded_videos/ - ├── kiepscy_episodes.json + ├── .preprocessing_state_kiepscy.json └── ... ``` +**Config struktura:** +``` +series_configs/ +├── defaults.json # Domyślne dla wszystkich +├── ranczo.json # Overrides dla Ranczo +└── kiepscy.json # Overrides dla Kiepskich +``` + **Migracja ze starej struktury:** ```bash -mkdir -p input_data/{series_name} -mv input_data/S* input_data/{series_name}/ +mkdir -p input_data/nazwa_serii +mv input_data/S* input_data/nazwa_serii/ ``` --- @@ -172,11 +239,11 @@ mv input_data/S* input_data/{series_name}/ ``` output_data/{series_name}/ -├── transcoded_videos/ # MP4 h264_nvenc (720p) +├── transcoded_videos/ # MP4 h264_nvenc (720p domyślnie) ├── transcriptions/ # raw/ • clean/ • sound_events/ ├── scene_timestamps/ # JSON z timestampami scen -├── exported_frames/ # JPG 1080p (domyślnie) -├── embeddings/ # text • video • sound_events • full_episode +├── exported_frames/ # PNG (1080p domyślnie) +├── embeddings/ # text/ • video/ • sound_events/ • full_episode/ ├── image_hashes/ # perceptual hashes klatek ├── character_detections/ # detections.json + visualizations/ (opcjonalne) ├── character_references_processed/ # face vectors postaci @@ -184,10 +251,17 @@ output_data/{series_name}/ ├── face_clusters/ # HDBSCAN clusters ├── object_detections/ # D-FINE detections + visualizations/ (opcjonalne) ├── elastic_documents/ # JSONL per typ dokumentu +│ ├── text_segments/ +│ ├── text_embeddings/ +│ ├── video_frames/ +│ └── episode_names/ ├── archives/ # ZIP per odcinek ├── validation_reports/ # JSON raporty walidacji ├── processing_metadata/ # metadata kroków pipeline -└── scraped_pages/ # zapisane strony wiki +├── scraped_pages/ # zapisane strony wiki +├── {series}_episodes.json # metadane odcinków +├── {series}_characters.json # lista postaci +└── .preprocessing_state_{series}.json # stan pipeline (cache) ``` --- @@ -197,6 +271,7 @@ output_data/{series_name}/ | Komponent | Stack | |-----------|-------| | Transkodowanie | 
FFmpeg + h264_nvenc (GPU) | +| Deinterlacing | bwdif (opcjonalnie, auto-detect lub force) | | Transkrypcja | Whisper large-v3-turbo / ElevenLabs Scribe v1 | | Sceny | TransNetV2 | | Embeddingi | Qwen/Qwen3-VL-Embedding-8B (4096-dim) | @@ -210,13 +285,92 @@ output_data/{series_name}/ --- +## Parametry konfiguracyjne + +**Wszystkie parametry są w `series_configs/*.json`**. Poniżej wartości domyślne z `defaults.json`: + +**Transkodowanie (`processing.transcode`):** +```json +{ + "codec": "h264_nvenc", + "resolution": "720p", + "video_bitrate_mbps": 2.5, + "minrate_mbps": 1.5, + "maxrate_mbps": 3.5, + "bufsize_mbps": 5.0, + "audio_bitrate_kbps": 128, + "gop_size": 2.0, + "force_deinterlace": false +} +``` + +**Detekcja scen (`processing.scene_detection`):** +```json +{ + "threshold": 0.5, + "min_scene_len": 10 +} +``` + +**Eksport klatek (`processing.frame_export`):** +```json +{ + "frames_per_scene": 3 +} +``` + +**Transkrypcja (`processing.transcription`):** +```json +{ + "mode": "whisper", + "model": "large-v3-turbo", + "language": "pl", + "device": "cuda" +} +``` + +**Scraping (`scraping`):** +```json +{ + "episodes": { + "parser_mode": "normal", + "urls": ["https://..."] + }, + "characters": { + "parser_mode": "normal", + "urls": ["https://..."] + }, + "character_references": { + "search_engine": "duckduckgo", + "images_per_character": 5 + } +} +``` + +**Elasticsearch (`indexing.elasticsearch`):** +```json +{ + "index_name": "nazwa_clips", + "host": "localhost:9200", + "dry_run": false, + "append": false +} +``` + +**Tryby:** +- `parser_mode`: `"normal"` (Qwen2.5-Coder) | `"premium"` (Gemini 2.5 Flash) +- `transcription.mode`: `"whisper"` | `"elevenlabs"` +- `search_engine`: `"duckduckgo"` | `"google"` (wymaga SERPAPI_API_KEY) + +--- + ## Użycie VRAM **Target:** ~21GB VRAM (85% z 24GB dla modelu embeddingowego) -**Batch sizes:** -- Video embeddings: 32 (domyślnie), progress sub-batch: 100 -- Text embeddings: 64 (domyślnie) +**Batch sizes (domyślne):** +- Video 
embeddings: 32, progress sub-batch: 100 +- Text embeddings: 64 - Object detection: 8 - Emotion detection: 32 @@ -252,70 +406,27 @@ Faktyczne użycie VRAM zależy od: ## Formaty plików -**Input:** `.mp4` `.avi` `.mkv` `.mov` `.flv` `.wmv` `.webm` +**Input wideo:** `.mp4` `.avi` `.mkv` `.mov` `.flv` `.wmv` `.webm` **Output wideo:** `.mp4` (h264_nvenc, 720p domyślnie) -**Output klatki:** `.jpg` (1080p domyślnie) +**Output klatki:** `.png` (1080p domyślnie) **Nazewnictwo odcinków:** `S01E01`, `s01e12`, `S10E05` (case-insensitive) **Nazewnictwo folderów:** `S01`, `Sezon 1`, `Season 10` → autonormalizacja do `SXX` -**Metadane:** JSON (episodes.json, characters.json) -**Elastic docs:** JSONL per typ (text_segments, video_frames, etc.) - ---- - -## Parametry konfiguracyjne - -**Transkodowanie:** -- Target file size: 50MB per 100s -- Audio bitrate: 128 kbps -- GOP size: 0.5s - -**Scene detection:** -- Threshold: 0.5 -- Min scene length: 10 frames - -**Text chunking:** -- Segments per embedding: 5 -- Sentences per chunk: 8 -- Chunk overlap: 3 - -**Character detection:** -- Reference images per character: 3 -- Normalized face size: 112x112 -- Face detection threshold: 0.2 -- Reference matching threshold: 0.50 -- Frame detection threshold: 0.55 - -**Object detection:** -- Confidence threshold: 0.30 - -**Embeddings:** -- Dimension: 4096 -- Max model length: 8192 tokens -- Chunked prefill: enabled +**Metadane:** JSON (`{series}_episodes.json`, `{series}_characters.json`) +**Elastic docs:** JSONL per typ (`text_segments`, `video_frames`, `text_embeddings`, `episode_names`) --- -## Dodatkowe opcje - -**State management:** -- `--no-state` - wyłącz zapisywanie stanu (brak wznowienia po przerwaniu) -- Domyślnie pipeline zapisuje stan i można wznowić po Ctrl+C - -**Ramdisk:** -- `--ramdisk-path /mnt/ramdisk` - użyj RAMdisk dla tymczasowych plików (szybsze przetwarzanie) -- Domyślnie: `/dev/shm` (shared memory, 4GB z docker-compose) -- RAMdisk używany do: kopiowania klatek podczas frame 
processing, tymczasowych plików transkrypcji - -**Interaktywny tryb:** -- `--interactive-character-processing` - manualna selekcja twarzy przy przetwarzaniu referencji postaci +## State Management -**Debug:** -- `--debug-visualizations` - włącz wizualizacje dla detekcji postaci i obiektów (wyłączone domyślnie) -- `--dry-run` - test indeksowania bez wysyłania do Elasticsearch +Pipeline automatycznie zapisuje stan przetwarzania w `.preprocessing_state_{series}.json`: +- Śledzi które kroki zostały ukończone dla każdego odcinka +- Pozwala na wznowienie po przerwaniu (Ctrl+C) +- Pomija już przetworzone odcinki (chyba że `--force-rerun`) -**Embeddingi:** -- `--skip-full-episode` - pomiń generowanie embeddingów całych odcinków (tylko text, video, sound events) -- `--batch-size N` - rozmiar batcha dla embeddingów (domyślnie 32 dla video, 64 dla text) +**Resetowanie stanu:** +```bash +rm output_data/ranczo/.preprocessing_state_ranczo.json +``` --- @@ -340,26 +451,124 @@ docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi ```bash # Logi -docker logs ranchbot-preprocessing-app -f +docker logs -f preprocessor-preprocessor-run-XXX # GPU check nvidia-smi docker run --rm --gpus all nvidia/cuda:12.1.0-base-ubuntu22.04 nvidia-smi -# OOM na GPU → zmniejsz batch size -./run-preprocessor.sh generate-embeddings --batch-size 16 # domyślnie 32 +# OOM na GPU +# Zmniejsz batch_size w series_configs/{series}.json # Brak miejsca na dysku docker system prune -a docker volume prune -du -sh output_data/* # sprawdź co zajmuje miejsce +du -sh output_data/* # Wznów pipeline po przerwaniu -./run-preprocessor.sh run-all /input_data/videos --series-name nazwa_serii --name nazwa_serii +./run-preprocessor.sh run-all --series nazwa_serii +# Stan jest automatycznie przywracany z .preprocessing_state_{series}.json + +# Reset stanu dla konkretnego serialu +rm output_data/nazwa_serii/.preprocessing_state_nazwa_serii.json # Reset całego named volume z modelami docker volume rm 
ranchbot-ai-models # Shell w kontenerze ./run-preprocessor.sh bash + +# Debug - wizualizacja grafu pipeline +./run-preprocessor.sh visualize --series nazwa_serii +``` + +--- + +## Search Guide + +Szczegółowy opis funkcjonalności search znajduje się w `SEARCH_GUIDE.md`. + +**Quick examples:** +```bash +# Statystyki +./run-preprocessor.sh search --series ranczo --stats + +# Full-text search +./run-preprocessor.sh search --series ranczo --text "Lucy Wilska" --season 10 + +# Semantic search +./run-preprocessor.sh search --series ranczo --text-semantic "wesele" + +# Visual search +./run-preprocessor.sh search --series ranczo --image /input_data/screenshot.jpg + +# Search by character/emotion/object +./run-preprocessor.sh search --series ranczo --character "Lucy Wilska" --emotion "happiness" +./run-preprocessor.sh search --series ranczo --object "person:5+" + +# Lista postaci +./run-preprocessor.sh search --series ranczo --list-characters +``` + +--- + +## Tworzenie nowego serialu + +1. **Przygotuj dane:** + ```bash + mkdir -p input_data/nowy_serial/S01 + cp /path/to/videos/*.mp4 input_data/nowy_serial/S01/ + ``` + +2. **Stwórz config:** + ```bash + cp series_configs/defaults.json series_configs/nowy_serial.json + ``` + +3. **Edytuj config:** + ```json + { + "series_name": "nowy_serial", + "display_name": "Nowy Serial", + "indexing": { + "elasticsearch": { + "index_name": "nowy_serial_clips" + } + }, + "scraping": { + "episodes": { + "urls": ["https://..."] + }, + "characters": { + "urls": ["https://..."] + } + } + } + ``` + +4. 
**Uruchom pipeline:** + ```bash + ./run-preprocessor.sh run-all --series nowy_serial + ``` + +--- + +## API Keys (opcjonalne) + +Ustaw w `.env` lub docker-compose environment: + +```bash +# ElevenLabs (dla premium transcription) +ELEVEN_API_KEY=your_key + +# Google Images (dla premium character references) +SERPAPI_API_KEY=your_key + +# Gemini (dla premium scraping) +GEMINI_API_KEY=your_key + +# Elasticsearch (jeśli wymaga auth) +ES_HOST=localhost:9200 +ES_USER=elastic +ES_PASS=password ``` diff --git a/preprocessor/SEARCH_GUIDE.md b/preprocessor/SEARCH_GUIDE.md index 06caa66b7..9b2aeaa67 100644 --- a/preprocessor/SEARCH_GUIDE.md +++ b/preprocessor/SEARCH_GUIDE.md @@ -1,13 +1,17 @@ -# Ranczo Search +# Search CLI do przeszukiwania Elasticsearch. Wymaga ES na `localhost:9200` (lub inny `--host`) z zaindeksowanymi danymi. -**Indeksy:** `ranczo_segments` • `ranczo_text_embeddings` • `ranczo_video_frames` • `ranczo_episode_names` +**Multi-series:** Każdy serial ma własne indeksy (np. `ranczo_clips_*`, `kiepscy_clips_*`). Użyj `--series nazwa_serii` aby wybrać który serial przeszukać. + +**Indeksy (przykład dla ranczo):** `ranczo_clips_text_segments` • `ranczo_clips_text_embeddings` • `ranczo_clips_video_frames` • `ranczo_clips_episode_names` --- ## Tryby wyszukiwania +**WAŻNE:** Wszystkie komendy wymagają parametru `--series nazwa_serii` (np. `--series ranczo`, `--series kiepscy`) + | Flaga | Opis | |-------|------| | `--text` | Full-text BM25, dokładne słowa | @@ -29,34 +33,39 @@ CLI do przeszukiwania Elasticsearch. 
Wymaga ES na `localhost:9200` (lub inny `-- ```bash # Meta -./run-preprocessor.sh search --stats -./run-preprocessor.sh search --list-characters +./run-preprocessor.sh search --series ranczo --stats +./run-preprocessor.sh search --series ranczo --list-characters +./run-preprocessor.sh search --series kiepscy --stats # dla innego serialu # Text -./run-preprocessor.sh search --text "Kto tu rządzi" --limit 5 -./run-preprocessor.sh search --text-semantic "wesele" --season 10 +./run-preprocessor.sh search --series ranczo --text "Kto tu rządzi" --limit 5 +./run-preprocessor.sh search --series ranczo --text-semantic "wesele" --season 10 # Visual -./run-preprocessor.sh search --text-to-video "pocałunek" -./run-preprocessor.sh search --image /input_data/screenshot.jpg -./run-preprocessor.sh search --hash /input_data/frame.jpg # znajdź duplikaty -./run-preprocessor.sh search --hash "a1b2c3d4e5f6" # lub podaj hash bezpośrednio +./run-preprocessor.sh search --series ranczo --text-to-video "pocałunek" +./run-preprocessor.sh search --series ranczo --image /input_data/screenshot.jpg +./run-preprocessor.sh search --series ranczo --hash /input_data/frame.jpg # znajdź duplikaty +./run-preprocessor.sh search --series ranczo --hash "a1b2c3d4e5f6" # lub podaj hash bezpośrednio # Filtry i kombinacje -./run-preprocessor.sh search --character "Lucy Wilska" --season 10 -./run-preprocessor.sh search --emotion "happiness" --character "Lucy Wilska" -./run-preprocessor.sh search --emotion "sadness" --season 1 --episode 5 -./run-preprocessor.sh search --object "person:5+" # 5+ osób -./run-preprocessor.sh search --object "dog" --season 10 -./run-preprocessor.sh search --text-to-video "pocałunek" --character "Lucy Wilska" -./run-preprocessor.sh search --image /input_data/frame.jpg --season 10 --episode 1 +./run-preprocessor.sh search --series ranczo --character "Lucy Wilska" --season 10 +./run-preprocessor.sh search --series ranczo --emotion "happiness" --character "Lucy Wilska" 
+./run-preprocessor.sh search --series ranczo --emotion "sadness" --season 1 --episode 5 +./run-preprocessor.sh search --series ranczo --object "person:5+" # 5+ osób +./run-preprocessor.sh search --series ranczo --object "dog" --season 10 +./run-preprocessor.sh search --series ranczo --text-to-video "pocałunek" --character "Lucy Wilska" +./run-preprocessor.sh search --series ranczo --image /input_data/frame.jpg --season 10 --episode 1 # Episode -./run-preprocessor.sh search --episode-name "Spadek" -./run-preprocessor.sh search --episode-name-semantic "wesele" +./run-preprocessor.sh search --series ranczo --episode-name "Spadek" +./run-preprocessor.sh search --series ranczo --episode-name-semantic "wesele" # Output -./run-preprocessor.sh search --text "Lucy" --json-output | jq '.hits[]' +./run-preprocessor.sh search --series ranczo --text "Lucy" --json-output | jq '.hits[]' + +# Inne seriale +./run-preprocessor.sh search --series kiepscy --text "Ferdek" +./run-preprocessor.sh search --series kiepscy --character "Halina Kiepska" --emotion "anger" ``` --- @@ -65,6 +74,7 @@ CLI do przeszukiwania Elasticsearch. Wymaga ES na `localhost:9200` (lub inny `-- | Filtr | Użycie | |-------|--------| +| `--series NAME` | **WYMAGANY:** Nazwa serialu (np. ranczo, kiepscy) | | `--season N` | Sezon | | `--episode N` | Odcinek | | `--character NAME` | Postać (case-sensitive) | @@ -164,11 +174,11 @@ curl http://localhost:9200 # Oczekiwany output: {"name": "...", "cluster_name": "...", ...} # Test indeksów -./run-preprocessor.sh search --stats +./run-preprocessor.sh search --series ranczo --stats # Powinno pokazać liczby dokumentów w każdym indeksie # Brak wyników dla postaci (case-sensitive!) 
-./run-preprocessor.sh search --list-characters | grep -i "lucy" +./run-preprocessor.sh search --series ranczo --list-characters | grep -i "lucy" # Użyj dokładnej nazwy: "Lucy Wilska" nie "lucy wilska" # Błąd "Cannot connect to Elasticsearch" @@ -181,8 +191,12 @@ nvidia-smi # sprawdź dostępność GPU # Plik obrazka nie znaleziony # Ścieżki w kontenerze: /input_data/ nie ./input_data/ -./run-preprocessor.sh search --image /input_data/screenshot.jpg # ✓ -./run-preprocessor.sh search --image ./input_data/screenshot.jpg # ✗ +./run-preprocessor.sh search --series ranczo --image /input_data/screenshot.jpg # ✓ +./run-preprocessor.sh search --series ranczo --image ./input_data/screenshot.jpg # ✗ + +# Brak parametru --series +./run-preprocessor.sh search --text "Lucy" # ✗ Błąd: --series jest wymagany +./run-preprocessor.sh search --series ranczo --text "Lucy" # ✓ ``` **Wymagania:** diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index cd3ac4aa3..b1ece5706 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -1,4 +1,5 @@ from typing import ( + TYPE_CHECKING, Dict, List, Optional, @@ -9,8 +10,11 @@ from preprocessor.app.step_builder import StepBuilder +if TYPE_CHECKING: + from preprocessor.lib.core.logging import ErrorHandlingLogger -class Pipeline: + +class PipelineDefinition: def __init__(self, name: str) -> None: self.name: str = name self._steps: Dict[str, StepBuilder] = {} @@ -25,7 +29,7 @@ def register(self, step: StepBuilder) -> None: ) self._steps[step.id] = step - def validate(self) -> None: + def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: self._graph = nx.DiGraph() for step_id, step in self._steps.items(): @@ -40,13 +44,18 @@ def validate(self) -> None: if not nx.is_directed_acyclic_graph(self._graph): self._raise_cycle_error() - print( + message = ( f"✅ Pipeline '{self.name}' validated successfully:\n" f" - {len(self._steps)} steps registered\n" f" - DAG structure confirmed\n" - f" - No 
cyclic dependencies", + f" - No cyclic dependencies" ) + if logger: + logger.info(message) + else: + print(message) + def _raise_missing_dependency_error( self, step_id: str, missing_dep_id: str, ) -> None: @@ -149,5 +158,8 @@ def to_ascii_art(self) -> str: lines.append("=" * 80) return "\n".join(lines) + def get_all_steps(self) -> Dict[str, StepBuilder]: + return dict(self._steps) + def __repr__(self) -> str: - return f"Pipeline(name='{self.name}', steps={len(self._steps)})" + return f"PipelineDefinition(name='{self.name}', steps={len(self._steps)})" diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index a5953ac3a..0f3974d35 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -1,28 +1,33 @@ from pathlib import Path from typing import ( + TYPE_CHECKING, Any, List, ) +from preprocessor.app.video_discovery import VideoDiscovery from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.lib.episodes.episode_manager import EpisodeManager +if TYPE_CHECKING: + from preprocessor.app.pipeline import PipelineDefinition -class Pipeline: + +class PipelineExecutor: def __init__(self, context: ExecutionContext): self.context = context self.steps: List[PipelineStep] = [] - def add_step(self, step: PipelineStep) -> "Pipeline": + def add_step(self, step: PipelineStep) -> "PipelineExecutor": self.steps.append(step) return self def run_for_episodes( self, source_path: Path, episode_manager: EpisodeManager, ) -> None: - video_files = self.__discover_videos(source_path) + video_files = VideoDiscovery.discover(source_path) self.context.logger.info( f"Discovered {len(video_files)} video files in {source_path}", ) @@ -93,14 +98,6 @@ def __mark_step_completed(self, step_name: str, episode_id: str) -> None: return self.context.state_manager.mark_step_completed(step_name, episode_id) - 
@staticmethod - def __discover_videos(source_path: Path) -> List[Path]: - extensions = ["*.mp4", "*.mkv", "*.avi"] - videos = [] - for ext in extensions: - videos.extend(source_path.glob(f"**/{ext}")) - return sorted(videos) - def cleanup(self) -> None: for step in self.steps: if hasattr(step, "cleanup"): @@ -108,3 +105,34 @@ def cleanup(self) -> None: step.cleanup() except Exception as e: self.context.logger.error(f"Cleanup failed for step {step.name}: {e}") + + def execute_step( + self, + pipeline: "PipelineDefinition", + step_id: str, + source_path: Path, + episode_manager: EpisodeManager, + ) -> None: + step = pipeline.get_step(step_id) + self.context.logger.info(f"🔧 Step: {step_id}") + self.context.logger.info(f"📝 {step.description}") + + StepClass = step.load_class() + instance = StepClass(step.config) + + runner = PipelineExecutor(self.context) + runner.add_step(instance) + runner.run_for_episodes(source_path, episode_manager) + + self.context.logger.info(f"✅ Step '{step_id}' completed") + + def execute_steps( + self, + pipeline: "PipelineDefinition", + step_ids: List[str], + source_path: Path, + episode_manager: EpisodeManager, + ) -> None: + for step_id in step_ids: + self.context.logger.info(f"{'=' * 80}") + self.execute_step(pipeline, step_id, source_path, episode_manager) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index dde2474d1..b27f28035 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -1,6 +1,6 @@ from typing import Dict -from preprocessor.app.pipeline import Pipeline +from preprocessor.app.pipeline import PipelineDefinition from preprocessor.app.step_builder import ( Phase, StepBuilder, @@ -34,7 +34,7 @@ INDEXING = Phase("INDEXING", color="yellow") -def build_pipeline(series_name: str) -> Pipeline: # pylint: disable=too-many-locals +def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals series_config: SeriesConfig = 
SeriesConfig.load(series_name) episodes_metadata = StepBuilder( @@ -283,7 +283,7 @@ def build_pipeline(series_name: str) -> Pipeline: # pylint: disable=too-many-lo ), ) - pipeline = Pipeline(name=f"{series_name}_processing") + pipeline = PipelineDefinition(name=f"{series_name}_processing") pipeline.register(episodes_metadata) pipeline.register(characters_metadata) @@ -322,4 +322,4 @@ def visualize(series_name: str = "ranczo") -> None: def get_step_configs(series_name: str) -> Dict[str, object]: pipeline = build_pipeline(series_name) - return {step_id: step.config for step_id, step in pipeline._steps.items()} + return {step_id: step.config for step_id, step in pipeline.get_all_steps().items()} diff --git a/preprocessor/app/video_discovery.py b/preprocessor/app/video_discovery.py new file mode 100644 index 000000000..026b2d329 --- /dev/null +++ b/preprocessor/app/video_discovery.py @@ -0,0 +1,19 @@ +from pathlib import Path +from typing import List + + +class VideoDiscovery: + DEFAULT_EXTENSIONS: List[str] = ["*.mp4", "*.mkv", "*.avi"] + + @staticmethod + def discover( + source_path: Path, + extensions: List[str] = None, + ) -> List[Path]: + if extensions is None: + extensions = VideoDiscovery.DEFAULT_EXTENSIONS + + videos = [] + for ext in extensions: + videos.extend(source_path.glob(f"**/{ext}")) + return sorted(videos) diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 9e467b502..dc0c5a117 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -1,26 +1,20 @@ -import os from pathlib import Path -from typing import Callable +from typing import ( + Callable, + Tuple, +) import click -from preprocessor.app.pipeline_builder import Pipeline as PipelineRunner +from preprocessor.app.pipeline_builder import PipelineExecutor from preprocessor.app.pipeline_factory import ( build_pipeline, visualize, ) from preprocessor.cli.helpers import setup_pipeline_context +from preprocessor.cli.skip_list_builder import SkipListBuilder 
from preprocessor.config.series_config import SeriesConfig - - -def _get_input_base_path() -> Path: - is_docker: bool = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' - return Path('/input_data') if is_docker else Path('preprocessor/input_data') - - -def _get_output_base_path() -> Path: - is_docker: bool = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' - return Path('/app/output_data') if is_docker else Path('preprocessor/output_data') +from preprocessor.core.path_resolver import PathResolver @click.group() @@ -43,41 +37,27 @@ def visualize_command(series: str) -> None: multiple=True, help="Step IDs to skip (e.g., --skip transcode --skip detect_scenes)", ) -def run_all(series: str, force_rerun: bool, skip: tuple) -> None: +def run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: series_config = SeriesConfig.load(series) pipeline = build_pipeline(series) setup = setup_pipeline_context(series, "run_all", force_rerun, with_episode_manager=True) - try: # pylint: disable=too-many-try-statements - skip_list = list(skip) - if series_config.pipeline_mode == "selective": - skip_list.extend(series_config.skip_steps) - skip_list = list(set(skip_list)) - if series_config.skip_steps: - setup.logger.info(f"🔧 Selective mode: auto-skipping {', '.join(series_config.skip_steps)}") - + try: + skip_list = SkipListBuilder.build(skip, series_config, setup.logger) plan = pipeline.get_execution_order(skip=skip_list) - input_base = _get_input_base_path() - source_path = input_base / series + source_path = PathResolver.get_input_base() / series setup.logger.info(f"📋 Execution plan: {' → '.join(plan)}") setup.logger.info(f"📂 Source: {source_path}") - for step_id in plan: - step = pipeline.get_step(step_id) - setup.logger.info(f"{'=' * 80}") - setup.logger.info(f"🔧 Step: {step_id}") - setup.logger.info(f"📝 {step.description}") - - StepClass = step.load_class() - instance = StepClass(step.config) - - runner = PipelineRunner(setup.context) - 
runner.add_step(instance) - runner.run_for_episodes(source_path, setup.episode_manager) - - setup.logger.info(f"✅ Step '{step_id}' completed") + executor = PipelineExecutor(setup.context) + executor.execute_steps( + pipeline=pipeline, + step_ids=plan, + source_path=source_path, + episode_manager=setup.episode_manager, + ) setup.logger.info("=" * 80) setup.logger.info("🎉 Pipeline completed successfully!") @@ -96,7 +76,7 @@ def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> Non pipeline = build_pipeline(series) setup = setup_pipeline_context(series, _step_id, force_rerun, with_episode_manager=True) - try: # pylint: disable=too-many-try-statements + try: step = pipeline.get_step(_step_id) deps = step.dependency_ids @@ -109,18 +89,15 @@ def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> Non f"Run it first or use --force-rerun.", ) - setup.logger.info(f"🔧 Running: {_step_id}") - setup.logger.info(f"📝 {step.description}") + source_path = PathResolver.get_input_base() / series - StepClass = step.load_class() - instance = StepClass(step.config) - - input_base = _get_input_base_path() - source_path = input_base / series - - runner = PipelineRunner(setup.context) - runner.add_step(instance) - runner.run_for_episodes(source_path, setup.episode_manager) + executor = PipelineExecutor(setup.context) + executor.execute_step( + pipeline=pipeline, + step_id=_step_id, + source_path=source_path, + episode_manager=setup.episode_manager, + ) setup.logger.info(f"✅ Step '{_step_id}' completed successfully") except KeyboardInterrupt: @@ -132,9 +109,210 @@ def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> Non return step_command -_cli_pipeline = build_pipeline("ranczo") +@cli.command(name="search") +@click.option("--series", required=True, help="Series name (e.g., ranczo, kiepscy)") +@click.option("--text", type=str, help="Full-text search po transkrypcjach") +@click.option("--text-semantic", type=str, 
help="Semantic search po text embeddings") +@click.option("--text-to-video", type=str, help="Cross-modal search: text query w video embeddings") +@click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search po video embeddings") +@click.option("--hash", "phash", type=str, help="Szukaj po perceptual hash (podaj hash string lub sciezke do obrazka)") +@click.option("--character", type=str, help="Szukaj po postaci") +@click.option("--emotion", type=str, help="Szukaj po emocji (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)") +@click.option("--object", "object_query", type=str, help="Szukaj po wykrytych obiektach (np. 'dog', 'person:5+', 'chair:2-4')") +@click.option("--episode-name", type=str, help="Fuzzy search po nazwach odcinkow") +@click.option("--episode-name-semantic", type=str, help="Semantic search po nazwach odcinkow") +@click.option("--list-characters", "list_chars_flag", is_flag=True, help="Lista wszystkich postaci") +@click.option("--list-objects", "list_objects_flag", is_flag=True, help="Lista wszystkich klas obiektow") +@click.option("--season", type=int, help="Filtruj po sezonie") +@click.option("--episode", type=int, help="Filtruj po odcinku") +@click.option("--limit", type=int, default=20, help="Limit wynikow") +@click.option("--stats", is_flag=True, help="Pokaz statystyki indeksow") +@click.option("--json-output", is_flag=True, help="Output w formacie JSON") +@click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") +def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements + series: str, + text: str, + text_semantic: str, + text_to_video: str, + image: Path, + phash: str, + character: str, + emotion: str, + object_query: str, + episode_name: str, + episode_name_semantic: str, + list_chars_flag: bool, + list_objects_flag: bool, + season: int, + episode: int, + limit: int, + stats: bool, + json_output: bool, + host: str, +) -> None: + 
"""Search tool - comprehensive Elasticsearch search""" + import asyncio # pylint: disable=import-outside-toplevel + import json # pylint: disable=import-outside-toplevel + import sys # pylint: disable=import-outside-toplevel + + from elasticsearch import AsyncElasticsearch # pylint: disable=import-outside-toplevel + + from preprocessor.modules.search.clients.elasticsearch_queries import ElasticsearchQueries # pylint: disable=import-outside-toplevel + from preprocessor.modules.search.clients.embedding_service import EmbeddingService # pylint: disable=import-outside-toplevel + from preprocessor.modules.search.clients.hash_service import HashService # pylint: disable=import-outside-toplevel + from preprocessor.modules.search.clients.result_formatters import ResultFormatter # pylint: disable=import-outside-toplevel + + if not any([ + text, text_semantic, text_to_video, image, phash, character, emotion, + object_query, episode_name, episode_name_semantic, list_chars_flag, list_objects_flag, stats, + ]): + click.echo("Podaj przynajmniej jedna opcje wyszukiwania. 
Uzyj --help", err=True) + sys.exit(1) + + series_config = SeriesConfig.load(series) + index_base = series_config.indexing.elasticsearch.index_name + + hash_value = None + if phash: + phash_path = Path(phash) + if phash_path.exists() and phash_path.is_file(): + click.echo(f"Computing perceptual hash from image: {phash}", err=True) + hash_svc = HashService() + hash_value = hash_svc.get_perceptual_hash(str(phash_path)) + if hash_value: + click.echo(f"Computed hash: {hash_value}", err=True) + else: + click.echo("Failed to compute hash from image", err=True) + sys.exit(1) + hash_svc.cleanup() + else: + hash_value = phash + + async def run() -> None: # pylint: disable=too-many-branches,too-many-statements + es_client = AsyncElasticsearch(hosts=[host], verify_certs=False) + + try: + await es_client.ping() + except Exception: # pylint: disable=broad-except + click.echo(f"✗ Cannot connect to Elasticsearch at {host}", err=True) + click.echo("Make sure Elasticsearch is running:", err=True) + click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) + sys.exit(1) + + embedding_svc = EmbeddingService() + queries = ElasticsearchQueries(embedding_svc, index_base) + + try: + if stats: + result = await queries.get_stats(es_client) + if json_output: + click.echo(json.dumps(result, indent=2)) + else: + click.echo("\nStatystyki:") + click.echo(f" Segments: {result['segments']:,}") + click.echo(f" Text Embeddings: {result['text_embeddings']:,}") + click.echo(f" Video Embeddings: {result['video_embeddings']:,}") + click.echo(f" Episode Names: {result['episode_names']:,}") + + elif list_chars_flag: + chars = await queries.list_characters(es_client) + if json_output: + click.echo(json.dumps(chars, indent=2)) + else: + click.echo(f"\nZnaleziono {len(chars)} postaci:") + for char_name, count in sorted(chars, key=lambda x: -x[1]): + click.echo(f" {char_name}: {count:,} wystapien") + + elif list_objects_flag: + objects = await queries.list_objects(es_client) + if json_output: + 
click.echo(json.dumps(objects, indent=2)) + else: + click.echo(f"\nZnaleziono {len(objects)} klas obiektow:") + for obj_name, count in sorted(objects, key=lambda x: -x[1]): + click.echo(f" {obj_name}: {count:,} wystapien") + + elif text: + result = await queries.search_text_query(es_client, text, season, episode, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "text") + + elif text_semantic: + result = await queries.search_text_semantic(es_client, text_semantic, season, episode, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "text_semantic") + + elif text_to_video: + result = await queries.search_text_to_video(es_client, text_to_video, season, episode, character, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "video") + + elif image: + result = await queries.search_video_semantic(es_client, str(image), season, episode, character, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "video") + + elif emotion: + result = await queries.search_by_emotion(es_client, emotion, season, episode, character, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "video") + + elif character: + result = await queries.search_by_character(es_client, character, season, episode, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "video") + + elif object_query: + result = await queries.search_by_object(es_client, object_query, season, episode, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "video") + + elif hash_value: + result = await 
queries.search_perceptual_hash(es_client, hash_value, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "video") + + elif episode_name: + result = await queries.search_episode_name(es_client, episode_name, season, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "episode_name") + + elif episode_name_semantic: + result = await queries.search_episode_name_semantic(es_client, episode_name_semantic, season, limit) + if json_output: + click.echo(json.dumps(result["hits"], indent=2)) + else: + ResultFormatter.print_results(result, "episode_name") + + finally: + embedding_svc.cleanup() + await es_client.close() + + asyncio.run(run()) + + +_CLI_TEMPLATE_SERIES = "ranczo" +_cli_pipeline = build_pipeline(_CLI_TEMPLATE_SERIES) -for _step_id, _step in _cli_pipeline._steps.items(): +for _step_id, _step in _cli_pipeline.get_all_steps().items(): command_func = _create_step_command(_step_id, _step.description) cli.add_command(command_func) diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index 09aeb5ff0..1652a7fd5 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -1,44 +1,14 @@ from dataclasses import dataclass import logging -import os from pathlib import Path from typing import Optional from preprocessor.core.context import ExecutionContext +from preprocessor.core.path_resolver import PathResolver from preprocessor.core.state_manager import StateManager from preprocessor.lib.core.logging import ErrorHandlingLogger from preprocessor.lib.episodes.episode_manager import EpisodeManager -from preprocessor.lib.ui.console import console - - -def create_cli_logger(command_name: str, loglevel: int=logging.INFO) -> ErrorHandlingLogger: - return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) - -def create_state_manager(name: str, no_state: bool) -> 
Optional[StateManager]: - if no_state or not name: - return None - state_manager: StateManager = StateManager(series_name=name, working_dir=Path('.')) - state_manager.register_interrupt_handler() - state_manager.load_or_create_state() - resume_info: Optional[str] = state_manager.get_resume_info() - if resume_info: - console.print(f'[cyan]{resume_info}[/cyan]') - return state_manager - -def create_execution_context( - name: str, - logger: ErrorHandlingLogger, - no_state: bool = False, - force_rerun: bool = False, -) -> ExecutionContext: - state_manager: Optional[StateManager] = create_state_manager(name, no_state) - return ExecutionContext( - series_name=name, - base_output_dir=Path('preprocessor/output_data'), - state_manager=state_manager, - force_rerun=force_rerun, - logger=logger, - ) + @dataclass class PipelineSetup: @@ -47,40 +17,77 @@ class PipelineSetup: context: ExecutionContext episode_manager: Optional[EpisodeManager] = None + +class PipelineContextFactory: + @staticmethod + def _create_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: + return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) + + @staticmethod + def _create_state_manager(series_name: str, working_dir: Path) -> StateManager: + state_manager = StateManager(series_name=series_name, working_dir=working_dir) + state_manager.load_or_create_state() + return state_manager + + @staticmethod + def _create_episode_manager( + series: str, input_base: Path, logger: ErrorHandlingLogger, + ) -> Optional[EpisodeManager]: + episodes_json: Optional[Path] = input_base / series / 'episodes.json' + if not episodes_json.exists(): + episodes_json = None + return EpisodeManager(episodes_json, series, logger) + + @staticmethod + def _ensure_output_dir(base_dir: Path, series: str) -> Path: + series_output_dir = base_dir / series + series_output_dir.mkdir(parents=True, exist_ok=True) + return series_output_dir + + @staticmethod + def build( + series: 
str, + logger_name: str, + force_rerun: bool = False, + with_episode_manager: bool = True, + ) -> PipelineSetup: + logger = PipelineContextFactory._create_logger(logger_name) + base_dir = PathResolver.get_output_base() + series_output_dir = PipelineContextFactory._ensure_output_dir(base_dir, series) + + state_manager = PipelineContextFactory._create_state_manager(series, series_output_dir) + + context = ExecutionContext( + series_name=series, + base_output_dir=base_dir, + logger=logger, + state_manager=state_manager, + force_rerun=force_rerun, + ) + + episode_manager = None + if with_episode_manager: + input_base = PathResolver.get_input_base() + episode_manager = PipelineContextFactory._create_episode_manager( + series, input_base, logger, + ) + + return PipelineSetup( + logger=logger, + state_manager=state_manager, + context=context, + episode_manager=episode_manager, + ) + + def setup_pipeline_context( series: str, logger_name: str, force_rerun: bool = False, with_episode_manager: bool = True, ) -> PipelineSetup: - logger: ErrorHandlingLogger = create_cli_logger(logger_name) - - is_docker: bool = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' - base_dir: Path = Path('/app/output_data') if is_docker else Path('preprocessor/output_data') + return PipelineContextFactory.build(series, logger_name, force_rerun, with_episode_manager) - series_output_dir: Path = base_dir / series - series_output_dir.mkdir(parents=True, exist_ok=True) - state_manager: StateManager = StateManager(series, working_dir=series_output_dir) - state_manager.load_or_create_state() - - context: ExecutionContext = ExecutionContext( - series_name=series, - base_output_dir=base_dir, - logger=logger, - state_manager=state_manager, - force_rerun=force_rerun, - ) - episode_manager: Optional[EpisodeManager] = None - if with_episode_manager: - input_base: Path = Path('/input_data') if is_docker else Path('preprocessor/input_data') - episodes_json: Optional[Path] = input_base / series / 
'episodes.json' - if not episodes_json.exists(): - episodes_json = None - episode_manager = EpisodeManager(episodes_json, series, logger) - return PipelineSetup( - logger=logger, - state_manager=state_manager, - context=context, - episode_manager=episode_manager, - ) +def create_cli_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: + return PipelineContextFactory._create_logger(command_name, loglevel) diff --git a/preprocessor/cli/skip_list_builder.py b/preprocessor/cli/skip_list_builder.py new file mode 100644 index 000000000..960975215 --- /dev/null +++ b/preprocessor/cli/skip_list_builder.py @@ -0,0 +1,21 @@ +from typing import ( + List, + Tuple, +) + +from preprocessor.config.series_config import SeriesConfig +from preprocessor.lib.core.logging import ErrorHandlingLogger + + +class SkipListBuilder: + @staticmethod + def build( + cli_skip: Tuple[str, ...], + series_config: SeriesConfig, + logger: ErrorHandlingLogger, + ) -> List[str]: + skip_list = list(cli_skip) + if series_config.pipeline_mode == "selective" and series_config.skip_steps: + logger.info(f"🔧 Selective mode: auto-skipping {', '.join(series_config.skip_steps)}") + skip_list.extend(series_config.skip_steps) + return list(set(skip_list)) diff --git a/preprocessor/core/path_manager.py b/preprocessor/core/path_manager.py index 1ba450287..a9e48acc4 100644 --- a/preprocessor/core/path_manager.py +++ b/preprocessor/core/path_manager.py @@ -1,21 +1,20 @@ from pathlib import Path from typing import TYPE_CHECKING -from preprocessor.config.config import get_base_output_dir +from preprocessor.core.path_service import PathService if TYPE_CHECKING: from preprocessor.lib.episodes.episode_manager import EpisodeInfo -class PathManager: +class PathManager: def __init__(self, series_name: str) -> None: - self._series_name: str = series_name.lower() + self._service: PathService = PathService(series_name) - def build_filename(self, episode_info: 'EpisodeInfo', extension: str='json', 
suffix: str='') -> str: - base: str = f'{self._series_name}_{episode_info.episode_code()}' - suffix_str: str = f'_{suffix}' if suffix else '' - return f'{base}{suffix_str}.{extension}' + def build_filename( + self, episode_info: 'EpisodeInfo', extension: str = 'json', suffix: str = '', + ) -> str: + return self._service.build_filename(episode_info, extension, suffix) def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: - base_output_dir: Path = get_base_output_dir(self._series_name) - return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_num() + return self._service.get_episode_dir(episode_info, subdir) diff --git a/preprocessor/core/path_resolver.py b/preprocessor/core/path_resolver.py new file mode 100644 index 000000000..606d3ed93 --- /dev/null +++ b/preprocessor/core/path_resolver.py @@ -0,0 +1,17 @@ +from pathlib import Path + +from preprocessor.core.path_service import PathService + + +class PathResolver: + @staticmethod + def _is_docker() -> bool: + return PathService._is_docker() + + @staticmethod + def get_input_base() -> Path: + return PathService.get_input_base() + + @staticmethod + def get_output_base() -> Path: + return PathService.get_output_base() diff --git a/preprocessor/core/path_service.py b/preprocessor/core/path_service.py new file mode 100644 index 000000000..ef6428838 --- /dev/null +++ b/preprocessor/core/path_service.py @@ -0,0 +1,36 @@ +import os +from pathlib import Path +from typing import TYPE_CHECKING + +from preprocessor.config.config import get_base_output_dir + +if TYPE_CHECKING: + from preprocessor.lib.episodes.episode_manager import EpisodeInfo + + +class PathService: + @staticmethod + def _is_docker() -> bool: + return os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' + + @staticmethod + def get_input_base() -> Path: + return Path('/input_data') if PathService._is_docker() else Path('preprocessor/input_data') + + @staticmethod + def get_output_base() -> Path: + return 
Path('/app/output_data') if PathService._is_docker() else Path('preprocessor/output_data') + + def __init__(self, series_name: str) -> None: + self._series_name: str = series_name.lower() + + def build_filename( + self, episode_info: 'EpisodeInfo', extension: str = 'json', suffix: str = '', + ) -> str: + base: str = f'{self._series_name}_{episode_info.episode_code()}' + suffix_str: str = f'_{suffix}' if suffix else '' + return f'{base}{suffix_str}.{extension}' + + def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: + base_output_dir: Path = get_base_output_dir(self._series_name) + return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_num() diff --git a/preprocessor/modules/search/clients/elasticsearch_queries.py b/preprocessor/modules/search/clients/elasticsearch_queries.py index 86fa95638..9bff301bf 100644 --- a/preprocessor/modules/search/clients/elasticsearch_queries.py +++ b/preprocessor/modules/search/clients/elasticsearch_queries.py @@ -13,8 +13,25 @@ class ElasticsearchQueries: - def __init__(self, embedding_service: EmbeddingService) -> None: + def __init__(self, embedding_service: EmbeddingService, index_base: str) -> None: self._embedding_service = embedding_service + self._index_base = index_base + + @property + def _segments_index(self) -> str: + return f'{self._index_base}_text_segments' + + @property + def _text_embeddings_index(self) -> str: + return f'{self._index_base}_text_embeddings' + + @property + def _video_frames_index(self) -> str: + return f'{self._index_base}_video_frames' + + @property + def _episode_names_index(self) -> str: + return f'{self._index_base}_episode_names' @staticmethod def _build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: @@ -39,7 +56,7 @@ async def search_text_query( must_clauses.extend(self._build_episode_filters(season, episode)) query_body = {'bool': {'must': must_clauses}} return await es_client.search( - 
index='ranczo_segments', + index=self._segments_index, query=query_body, size=limit, _source=[ @@ -67,7 +84,7 @@ async def search_text_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index='ranczo_text_embeddings', + index=self._text_embeddings_index, knn=knn_query, size=limit, _source=[ @@ -103,7 +120,7 @@ async def search_video_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, knn=knn_query, size=limit, _source=[ @@ -139,7 +156,7 @@ async def search_text_to_video( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, knn=knn_query, size=limit, _source=[ @@ -148,8 +165,8 @@ async def search_text_to_video( ], ) - @staticmethod async def search_by_character( + self, es_client: AsyncElasticsearch, character: str, season: Optional[int]=None, @@ -162,9 +179,9 @@ async def search_by_character( 'query': {'term': {'character_appearances.name': character}}, }, }] - must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) + must_clauses.extend(self._build_episode_filters(season, episode)) return await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, query={'bool': {'must': must_clauses}}, size=limit, _source=[ @@ -173,8 +190,8 @@ async def search_by_character( ], ) - @staticmethod async def search_by_emotion( + self, es_client: AsyncElasticsearch, emotion: str, season: Optional[int]=None, @@ -186,7 +203,7 @@ async def search_by_emotion( if character: nested_must.append({'term': {'character_appearances.name': character}}) must_clauses = [{'nested': {'path': 'character_appearances', 'query': {'bool': {'must': nested_must}}}}] - must_clauses.extend(ElasticsearchQueries._build_episode_filters(season, episode)) + 
must_clauses.extend(self._build_episode_filters(season, episode)) nested_filter: Dict[str, Any] = {'term': {'character_appearances.emotion.label': emotion}} if character: nested_filter = { @@ -198,7 +215,7 @@ async def search_by_emotion( }, } return await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, query={'bool': {'must': must_clauses}}, sort=[{ 'character_appearances.emotion.confidence': { @@ -214,15 +231,15 @@ async def search_by_emotion( ], ) - @staticmethod async def search_by_object( + self, es_client: AsyncElasticsearch, object_query: str, season: Optional[int]=None, episode: Optional[int]=None, limit: int=20, ) -> Dict[str, Any]: - filter_clauses = ElasticsearchQueries._build_episode_filters(season, episode) + filter_clauses = self._build_episode_filters(season, episode) must_clauses: List[Dict[str, Any]] = [] if ':' in object_query: object_class, count_filter = object_query.split(':', 1) @@ -282,7 +299,7 @@ async def search_by_object( query_body = {'bool': {'must': must_clauses, 'filter': filter_clauses}} object_class = object_query.split(':')[0].strip() if ':' in object_query else object_query.strip() return await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, query=query_body, sort=[{ 'detected_objects.count': { @@ -301,14 +318,14 @@ async def search_by_object( ], ) - @staticmethod async def search_perceptual_hash( + self, es_client: AsyncElasticsearch, phash: str, limit: int=10, ) -> Dict[str, Any]: return await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, query={'term': {'perceptual_hash': phash}}, size=limit, _source=[ @@ -317,10 +334,9 @@ async def search_perceptual_hash( ], ) - @staticmethod - async def list_characters(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: result = await es_client.search( - index='ranczo_video_frames', + 
index=self._video_frames_index, size=0, aggs={ 'characters_nested': { @@ -336,10 +352,9 @@ async def list_characters(es_client: AsyncElasticsearch) -> List[Tuple[str, int] buckets = result['aggregations']['characters_nested']['character_names']['buckets'] return [(b['key'], b['doc_count']) for b in buckets] - @staticmethod - async def list_objects(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + async def list_objects(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: result = await es_client.search( - index='ranczo_video_frames', + index=self._video_frames_index, size=0, aggs={ 'objects_nested': { @@ -355,8 +370,8 @@ async def list_objects(es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: buckets = result['aggregations']['objects_nested']['object_classes']['buckets'] return [(b['key'], b['doc_count']) for b in buckets] - @staticmethod async def search_episode_name( + self, es_client: AsyncElasticsearch, query: str, season: Optional[int]=None, @@ -369,7 +384,7 @@ async def search_episode_name( must_clauses.append({'term': {'episode_metadata.season': season}}) query_body = {'bool': {'must': must_clauses}} return await es_client.search( - index='ranczo_episode_names', + index=self._episode_names_index, query=query_body, size=limit, _source=['episode_id', 'title', 'video_path', 'episode_metadata'], @@ -395,17 +410,16 @@ async def search_episode_name_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index='ranczo_episode_names', + index=self._episode_names_index, knn=knn_query, size=limit, _source=['episode_id', 'title', 'video_path', 'episode_metadata'], ) - @staticmethod - async def get_stats(es_client: AsyncElasticsearch) -> Dict[str, int]: + async def get_stats(self, es_client: AsyncElasticsearch) -> Dict[str, int]: return { - 'segments': (await es_client.count(index='ranczo_segments'))['count'], - 'text_embeddings': (await es_client.count(index='ranczo_text_embeddings'))['count'], - 
'video_embeddings': (await es_client.count(index='ranczo_video_frames'))['count'], - 'episode_names': (await es_client.count(index='ranczo_episode_names'))['count'], + 'segments': (await es_client.count(index=self._segments_index))['count'], + 'text_embeddings': (await es_client.count(index=self._text_embeddings_index))['count'], + 'video_embeddings': (await es_client.count(index=self._video_frames_index))['count'], + 'episode_names': (await es_client.count(index=self._episode_names_index))['count'], } From 1589b50d8ec04e8f43fcd716dd584d52e2f3b5af Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 14:09:17 +0100 Subject: [PATCH 13/89] Switch to Qwen3-VL-Embedding & update descriptions Replace Qwen/Qwen2-VL-8B-Instruct with Qwen/Qwen3-VL-Embedding-8B in defaults and step configs, and translate pipeline step descriptions from Polish to English. Files modified: preprocessor/config/step_configs.py, preprocessor/app/config_defaults.py, preprocessor/app/pipeline_factory.py. No functional changes besides model selection and description text. 
--- preprocessor/app/config_defaults.py | 4 ++-- preprocessor/app/pipeline_factory.py | 36 ++++++++++++++-------------- preprocessor/config/step_configs.py | 4 ++-- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/preprocessor/app/config_defaults.py b/preprocessor/app/config_defaults.py index a8bb016ce..d724d3042 100644 --- a/preprocessor/app/config_defaults.py +++ b/preprocessor/app/config_defaults.py @@ -41,7 +41,7 @@ def get_default_step_configs(series_name: str) -> Dict[str, object]: 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=10), 'export_frames': FrameExportConfig(frames_per_scene=3), 'text_embeddings': TextEmbeddingConfig( - model_name='Qwen/Qwen2-VL-8B-Instruct', + model_name='Qwen/Qwen3-VL-Embedding-8B', batch_size=8, device='cuda', text_sentences_per_chunk=5, @@ -49,7 +49,7 @@ def get_default_step_configs(series_name: str) -> Dict[str, object]: ), 'image_hashing': ImageHashConfig(batch_size=32), 'video_embeddings': VideoEmbeddingConfig( - model_name='Qwen/Qwen2-VL-8B-Instruct', + model_name='Qwen/Qwen3-VL-Embedding-8B', batch_size=8, device='cuda', ), diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index b27f28035..e83eb5b5b 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -89,7 +89,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="transcode", phase=PROCESSING, module="preprocessor.modules.video.transcoding:VideoTranscoderStep", - description=f"Konwersja do {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} z adaptacyjnym bitrate", + description=f"Conversion to {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} with adaptive bitrate", produces=["transcoded_videos/{season}/{episode}.mp4"], needs=[], config=TranscodeConfig( @@ -106,7 +106,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: 
disable=t id="detect_scenes", phase=PROCESSING, module="preprocessor.modules.video.scene_detection:SceneDetectorStep", - description="Wykrywa zmiany scen używając TransNetV2", + description="Detects scene changes using TransNetV2", produces=["scene_detections/{season}/{episode}.json"], needs=[transcoded_videos], config=SceneDetectionConfig( @@ -119,7 +119,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="export_frames", phase=PROCESSING, module="preprocessor.modules.video.frame_export:FrameExporterStep", - description="Eksportuje klatki (PNG) na granicach scen", + description="Exports frames (PNG) at scene boundaries", produces=["frames/{season}/{episode}/*.png"], needs=[scene_data], config=FrameExportConfig(frames_per_scene=series_config.processing.frame_export.frames_per_scene), @@ -129,7 +129,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="transcribe", phase=PROCESSING, module="preprocessor.modules.text.transcription:TranscriptionStep", - description=f"Transkrypcja audio używając {series_config.processing.transcription.mode}", + description=f"Audio transcription using {series_config.processing.transcription.mode}", produces=["transcriptions/{season}/{episode}.json"], needs=[transcoded_videos], config=WhisperTranscriptionConfig( @@ -145,7 +145,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="separate_sounds", phase=PROCESSING, module="preprocessor.modules.audio.separation:SoundSeparationStep", - description="Rozdziela dialogi od efektów dźwiękowych", + description="Separates dialogue from sound effects", produces=["separated_audio/{season}/{episode}/"], needs=[transcription_data], config=SoundSeparationConfig(), @@ -155,7 +155,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="analyze_text", phase=PROCESSING, module="preprocessor.modules.text.analysis:TextAnalysisStep", - description="Analiza statystyk 
tekstu (częstotliwość słów, sentiment)", + description="Analyzes text statistics (word frequency, sentiment)", produces=["text_analysis/{season}/{episode}.json"], needs=[transcription_data], config=TextAnalysisConfig(language=series_config.processing.transcription.language), @@ -165,11 +165,11 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="text_embeddings", phase=PROCESSING, module="preprocessor.modules.text.embeddings:TextEmbeddingStep", - description="Generuje embeddingi tekstowe używając Qwen2-VL", + description="Generates text embeddings using Qwen3-VL-Embedding", produces=["embeddings/text/{season}/{episode}.npy"], needs=[text_stats], config=TextEmbeddingConfig( - model_name="Qwen/Qwen2-VL-8B-Instruct", + model_name="Qwen/Qwen3-VL-Embedding-8B", batch_size=8, device="cuda", text_sentences_per_chunk=5, @@ -181,7 +181,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="image_hashing", phase=PROCESSING, module="preprocessor.modules.vision.image_hashing:ImageHashStep", - description="Perceptual hashing klatek (phash, dhash, wavelet)", + description="Perceptual frame hashing (phash, dhash, wavelet)", produces=["hashes/{season}/{episode}.json"], needs=[exported_frames], config=ImageHashConfig(batch_size=32), @@ -191,11 +191,11 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="video_embeddings", phase=PROCESSING, module="preprocessor.modules.vision.embeddings:VideoEmbeddingStep", - description="Embeddingi wizualne używając Qwen2-VL", + description="Visual embeddings using Qwen3-VL-Embedding", produces=["embeddings/vision/{season}/{episode}.npy"], needs=[exported_frames, image_hashes], config=VideoEmbeddingConfig( - model_name="Qwen/Qwen2-VL-8B-Instruct", + model_name="Qwen/Qwen3-VL-Embedding-8B", batch_size=8, device="cuda", ), @@ -205,7 +205,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="detect_characters", 
phase=PROCESSING, module="preprocessor.modules.vision.character_detection:CharacterDetectorStep", - description="Rozpoznaje postacie na klatkach używając InsightFace", + description="Recognizes characters in frames using InsightFace", produces=["detections/characters/{season}/{episode}.json"], needs=[exported_frames], config=CharacterDetectionConfig(threshold=0.7), @@ -215,7 +215,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="detect_emotions", phase=PROCESSING, module="preprocessor.modules.vision.emotion_detection:EmotionDetectionStep", - description="Detekcja emocji na twarzach używając EmoNet", + description="Detects emotions on faces using EmoNet", produces=["detections/emotions/{season}/{episode}.json"], needs=[exported_frames], config=EmotionDetectionConfig(), @@ -225,7 +225,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="cluster_faces", phase=PROCESSING, module="preprocessor.modules.vision.face_clustering:FaceClusteringStep", - description="Klasteryzacja twarzy używając HDBSCAN", + description="Face clustering using HDBSCAN", produces=["clusters/faces/{season}/{episode}.json"], needs=[exported_frames], config=FaceClusteringConfig(), @@ -235,7 +235,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="detect_objects", phase=PROCESSING, module="preprocessor.modules.vision.object_detection:ObjectDetectionStep", - description="Detekcja obiektów ogólnych używając D-FINE", + description="General object detection using D-FINE", produces=["detections/objects/{season}/{episode}.json"], needs=[exported_frames], config=ObjectDetectionConfig(), @@ -245,7 +245,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="generate_elastic_docs", phase=INDEXING, module="preprocessor.modules.search.document_generation:DocumentGeneratorStep", - description="Łączy wszystkie dane w dokumenty Elasticsearch", + description="Combines all 
data into Elasticsearch documents", produces=["elastic_documents/{season}/{episode}.ndjson"], needs=[ text_embeddings, @@ -262,7 +262,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="generate_archives", phase=INDEXING, module="preprocessor.modules.packaging.archives:ArchiveGenerationStep", - description="Tworzy archiwa ZIP per odcinek (wszystkie artefakty)", + description="Creates ZIP archives per episode (all artifacts)", produces=["archives/{season}/{episode}.zip"], needs=[elastic_documents], config=ArchiveConfig(), @@ -272,7 +272,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="index_to_elasticsearch", phase=INDEXING, module="preprocessor.modules.search.indexing:ElasticsearchIndexerStep", - description="Wrzuca dokumenty do Elasticsearch", + description="Indexes documents into Elasticsearch", produces=[""], needs=[elastic_documents], config=ElasticsearchConfig( diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 68341cf7b..a81b0a990 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -60,14 +60,14 @@ class TextAnalysisConfig(BaseModel): language: str = 'pl' class TextEmbeddingConfig(BaseModel): - model_name: str = 'Qwen/Qwen2-VL-8B-Instruct' + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' batch_size: int = Field(default=8, ge=1) device: str = 'cuda' text_sentences_per_chunk: int = Field(default=5, ge=1) text_chunk_overlap: int = Field(default=1, ge=0) class VideoEmbeddingConfig(BaseModel): - model_name: str = 'Qwen/Qwen2-VL-8B-Instruct' + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' batch_size: int = Field(default=8, ge=1) device: str = 'cuda' From de75fc8acd48b673f75597cf3c56eed21abc8b51 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 14:14:58 +0100 Subject: [PATCH 14/89] Remove redundant docstrings and comments MIME-Version: 1.0 
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove non-functional docstrings and a TODO comment across several preprocessor modules (cli/cli_main.py, config/prompts/common_schemas.py, lib/io/metadata.py, modules/scraping/base_scraper.py, modules/search/clients/result_formatters.py, modules/search/indexing.py, modules/text/embeddings.py, modules/vision/embeddings.py). Cosmetic cleanup only — no functional changes. --- preprocessor/cli/cli_main.py | 1 - preprocessor/config/prompts/common_schemas.py | 4 ---- preprocessor/lib/io/metadata.py | 1 - preprocessor/modules/scraping/base_scraper.py | 1 - preprocessor/modules/search/clients/result_formatters.py | 4 ---- preprocessor/modules/search/indexing.py | 2 -- preprocessor/modules/text/embeddings.py | 1 - preprocessor/modules/vision/embeddings.py | 1 - 8 files changed, 15 deletions(-) diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index dc0c5a117..b9ab69dd4 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -150,7 +150,6 @@ def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-state json_output: bool, host: str, ) -> None: - """Search tool - comprehensive Elasticsearch search""" import asyncio # pylint: disable=import-outside-toplevel import json # pylint: disable=import-outside-toplevel import sys # pylint: disable=import-outside-toplevel diff --git a/preprocessor/config/prompts/common_schemas.py b/preprocessor/config/prompts/common_schemas.py index 6ec402adc..163e299ee 100644 --- a/preprocessor/config/prompts/common_schemas.py +++ b/preprocessor/config/prompts/common_schemas.py @@ -1,8 +1,4 @@ -"""Common JSON schemas used across prompts.""" - - def episode_metadata_schema() -> str: - """Returns JSON schema for episode metadata.""" return ( '{\n' ' "title": str,\n' diff --git a/preprocessor/lib/io/metadata.py b/preprocessor/lib/io/metadata.py index 2823d89e9..6ee162575 100644 --- a/preprocessor/lib/io/metadata.py +++ 
b/preprocessor/lib/io/metadata.py @@ -40,7 +40,6 @@ def create_embedding_collection( embedding_count: int, embedding_type: str, ) -> EmbeddingCollection: - """Helper to create EmbeddingCollection with standard parameters.""" return EmbeddingCollection( episode_id=episode_id, episode_info=episode_info, diff --git a/preprocessor/modules/scraping/base_scraper.py b/preprocessor/modules/scraping/base_scraper.py index b0cb106a6..5b92bdf04 100644 --- a/preprocessor/modules/scraping/base_scraper.py +++ b/preprocessor/modules/scraping/base_scraper.py @@ -87,7 +87,6 @@ def _save_result(self, result: Dict[str, Any]) -> None: json.dump(result, f, indent=2, ensure_ascii=False) def get_output_subdir(self) -> str: - """Scrapery używają bezpośrednio output_file zamiast subdirektoriów per-episode.""" return "" @abstractmethod diff --git a/preprocessor/modules/search/clients/result_formatters.py b/preprocessor/modules/search/clients/result_formatters.py index a44c13a2f..0c88ecaf2 100644 --- a/preprocessor/modules/search/clients/result_formatters.py +++ b/preprocessor/modules/search/clients/result_formatters.py @@ -31,7 +31,6 @@ def _format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: @staticmethod def __format_character_appearances(appearances: list) -> str: - """Format character appearances with emotions.""" chars_strs = [] for char in appearances: char_str = char.get('name', 'Unknown') @@ -44,12 +43,10 @@ def __format_character_appearances(appearances: list) -> str: @staticmethod def __format_detected_objects(objects: list) -> str: - """Format detected objects list.""" return ', '.join([f"{obj['class']}:{obj['count']}" for obj in objects]) @staticmethod def __print_text_result(source: Dict[str, Any], scene_ctx: str) -> None: - """Print text search result.""" click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") start_time = ResultFormatter.format_timestamp(source['start_time']) end_time = ResultFormatter.format_timestamp(source['end_time']) @@ -59,7 +56,6 @@ 
def __print_text_result(source: Dict[str, Any], scene_ctx: str) -> None: @staticmethod def __print_video_result(source: Dict[str, Any], scene_ctx: str) -> None: - """Print video/frame search result.""" timestamp = ResultFormatter.format_timestamp(source['timestamp']) click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}") if 'frame_type' in source: diff --git a/preprocessor/modules/search/indexing.py b/preprocessor/modules/search/indexing.py index c96780ae8..88035dbf7 100644 --- a/preprocessor/modules/search/indexing.py +++ b/preprocessor/modules/search/indexing.py @@ -106,8 +106,6 @@ async def _execute_async( def _get_mapping_for_type( doc_type: str, # pylint: disable=unused-argument ) -> Optional[Dict[str, Any]]: - """Get Elasticsearch mapping for document type.""" - # TODO: Load mappings from config or separate file # pylint: disable=fixme return None def cleanup(self) -> None: diff --git a/preprocessor/modules/text/embeddings.py b/preprocessor/modules/text/embeddings.py index fdd84ae16..d0de6ec8c 100644 --- a/preprocessor/modules/text/embeddings.py +++ b/preprocessor/modules/text/embeddings.py @@ -38,7 +38,6 @@ def _create_embedding_collection( # pylint: disable=duplicate-code output_path: Path, embedding_count: int, ) -> EmbeddingCollection: - """Create EmbeddingCollection with standard parameters.""" return MetadataBuilder.create_embedding_collection( episode_id=input_data.episode_id, episode_info=input_data.episode_info, diff --git a/preprocessor/modules/vision/embeddings.py b/preprocessor/modules/vision/embeddings.py index 302c03c6f..6361caa5d 100644 --- a/preprocessor/modules/vision/embeddings.py +++ b/preprocessor/modules/vision/embeddings.py @@ -39,7 +39,6 @@ def _create_embedding_collection( # pylint: disable=duplicate-code output_path: Path, embedding_count: int, ) -> EmbeddingCollection: - """Create EmbeddingCollection with standard parameters.""" return MetadataBuilder.create_embedding_collection( episode_id=input_data.episode_id, 
episode_info=input_data.episode_info, From 289e04d65a316c160ba0f9961fdab25fda0824c4 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 15:44:12 +0100 Subject: [PATCH 15/89] Privatize helper methods & cleanup dead code Rename numerous internal helpers to private (double-underscore) names across the codebase and update call sites accordingly. Remove or trim several unused/legacy routines (state interrupt handling, metadata save, various helper utilities and wrappers), simplify logging wrapper, and add pylint disables where appropriate. These changes are intended to encapsulate implementation details, reduce public surface area and remove dead code without altering high-level behaviors. --- preprocessor/app/config_defaults.py | 2 +- preprocessor/app/pipeline.py | 8 +- preprocessor/app/pipeline_builder.py | 4 +- preprocessor/app/pipeline_factory.py | 2 +- preprocessor/cli/cli_main.py | 14 +- preprocessor/cli/helpers.py | 20 +- preprocessor/config/config.py | 55 +-- preprocessor/config/series_config.py | 10 +- preprocessor/config/step_configs.py | 2 +- preprocessor/core/base_processor.py | 165 +------ preprocessor/core/processing_metadata.py | 22 - preprocessor/core/state_manager.py | 39 +- preprocessor/lib/ai/models.py | 6 +- preprocessor/lib/ai/provider.py | 9 +- .../lib/characters/reference_downloader.py | 4 +- preprocessor/lib/core/logging.py | 12 - preprocessor/lib/episodes/episode_manager.py | 24 +- preprocessor/lib/io/files.py | 8 +- preprocessor/lib/io/hashing.py | 2 +- preprocessor/lib/io/metadata.py | 4 +- preprocessor/lib/media/resolution.py | 4 +- preprocessor/lib/search/embedding_model.py | 2 +- preprocessor/lib/text/text_statistics.py | 8 +- .../processors/sound_separator.py | 6 +- preprocessor/lib/transcription/utils.py | 2 +- preprocessor/lib/ui/console.py | 4 +- preprocessor/lib/ui/progress.py | 2 +- .../lib/validation/file_validators.py | 14 +- preprocessor/lib/video/emotion_utils.py | 18 +- 
preprocessor/lib/video/frame_utils.py | 4 +- preprocessor/lib/video/image_hasher.py | 2 +- preprocessor/modules/audio/separation.py | 42 +- .../modules/scraping/reference_processor.py | 430 +++++++++++------- .../search/clients/elasticsearch_queries.py | 56 +-- .../modules/search/clients/hash_service.py | 4 +- .../search/clients/result_formatters.py | 4 +- .../modules/search/document_generation.py | 12 +- preprocessor/modules/search/indexing.py | 4 +- preprocessor/modules/text/embeddings.py | 14 +- preprocessor/modules/text/import_step.py | 20 +- .../modules/validation/episode_stats.py | 16 +- preprocessor/modules/video/frame_export.py | 24 +- .../modules/vision/character_detection.py | 4 +- preprocessor/modules/vision/embeddings.py | 4 +- preprocessor/modules/vision/image_hashing.py | 8 +- 45 files changed, 495 insertions(+), 625 deletions(-) diff --git a/preprocessor/app/config_defaults.py b/preprocessor/app/config_defaults.py index d724d3042..59e488258 100644 --- a/preprocessor/app/config_defaults.py +++ b/preprocessor/app/config_defaults.py @@ -20,7 +20,7 @@ ) -def get_default_step_configs(series_name: str) -> Dict[str, object]: +def __get_default_step_configs(series_name: str) -> Dict[str, object]: return { 'transcode': TranscodeConfig( video_bitrate_mbps=2.5, diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index b1ece5706..5a2b44e48 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -38,11 +38,11 @@ def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: for step_id, step in self._steps.items(): for dep_id in step.dependency_ids: if dep_id not in self._steps: - self._raise_missing_dependency_error(step_id, dep_id) + self.__raise_missing_dependency_error(step_id, dep_id) self._graph.add_edge(dep_id, step_id) if not nx.is_directed_acyclic_graph(self._graph): - self._raise_cycle_error() + self.__raise_cycle_error() message = ( f"✅ Pipeline '{self.name}' validated successfully:\n" @@ -56,7 
+56,7 @@ def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: else: print(message) - def _raise_missing_dependency_error( + def __raise_missing_dependency_error( self, step_id: str, missing_dep_id: str, ) -> None: raise ValueError( @@ -73,7 +73,7 @@ def _raise_missing_dependency_error( f"\n{'=' * 80}\n", ) - def _raise_cycle_error(self) -> None: + def __raise_cycle_error(self) -> None: cycles: List[List[str]] = list(nx.simple_cycles(self._graph)) cycle_path: str = " → ".join(cycles[0]) + f" → {cycles[0][0]}" diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 0f3974d35..09e9903df 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -24,7 +24,7 @@ def add_step(self, step: PipelineStep) -> "PipelineExecutor": self.steps.append(step) return self - def run_for_episodes( + def __run_for_episodes( # pylint: disable=unused-private-member self, source_path: Path, episode_manager: EpisodeManager, ) -> None: video_files = VideoDiscovery.discover(source_path) @@ -122,7 +122,7 @@ def execute_step( runner = PipelineExecutor(self.context) runner.add_step(instance) - runner.run_for_episodes(source_path, episode_manager) + runner.__run_for_episodes(source_path, episode_manager) self.context.logger.info(f"✅ Step '{step_id}' completed") diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index e83eb5b5b..8de631d83 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -320,6 +320,6 @@ def visualize(series_name: str = "ranczo") -> None: print(pipeline.to_ascii_art()) -def get_step_configs(series_name: str) -> Dict[str, object]: +def __get_step_configs(series_name: str) -> Dict[str, object]: pipeline = build_pipeline(series_name) return {step_id: step.config for step_id, step in pipeline.get_all_steps().items()} diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 
b9ab69dd4..ac1372d16 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -25,7 +25,7 @@ def cli() -> None: @cli.command(name="visualize") @click.option("--series", default="ranczo", help="Series name (e.g., ranczo)") -def visualize_command(series: str) -> None: +def __visualize_command(series: str) -> None: visualize(series) @@ -37,10 +37,10 @@ def visualize_command(series: str) -> None: multiple=True, help="Step IDs to skip (e.g., --skip transcode --skip detect_scenes)", ) -def run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: +def __run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: series_config = SeriesConfig.load(series) pipeline = build_pipeline(series) - setup = setup_pipeline_context(series, "run_all", force_rerun, with_episode_manager=True) + setup = setup_pipeline_context(series, "__run_all", force_rerun, with_episode_manager=True) try: skip_list = SkipListBuilder.build(skip, series_config, setup.logger) @@ -68,11 +68,11 @@ def run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: setup.logger.finalize() -def _create_step_command(step_id: str, step_description: str) -> Callable: +def __create_step_command(step_id: str, step_description: str) -> Callable: @click.command(name=step_id.replace("_", "-"), help=f"{step_description}") @click.option("--series", required=True, help="Series name (e.g., ranczo)") @click.option("--force-rerun", is_flag=True, help="Force rerun even if cached") - def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> None: + def __step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> None: pipeline = build_pipeline(series) setup = setup_pipeline_context(series, _step_id, force_rerun, with_episode_manager=True) @@ -106,7 +106,7 @@ def step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> Non finally: setup.logger.finalize() - return step_command + return __step_command 
@cli.command(name="search") @@ -312,7 +312,7 @@ async def run() -> None: # pylint: disable=too-many-branches,too-many-statement _cli_pipeline = build_pipeline(_CLI_TEMPLATE_SERIES) for _step_id, _step in _cli_pipeline.get_all_steps().items(): - command_func = _create_step_command(_step_id, _step.description) + command_func = __create_step_command(_step_id, _step.description) cli.add_command(command_func) diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index 1652a7fd5..257295b14 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -20,17 +20,17 @@ class PipelineSetup: class PipelineContextFactory: @staticmethod - def _create_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: + def __create_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) @staticmethod - def _create_state_manager(series_name: str, working_dir: Path) -> StateManager: + def __create_state_manager(series_name: str, working_dir: Path) -> StateManager: state_manager = StateManager(series_name=series_name, working_dir=working_dir) state_manager.load_or_create_state() return state_manager @staticmethod - def _create_episode_manager( + def __create_episode_manager( series: str, input_base: Path, logger: ErrorHandlingLogger, ) -> Optional[EpisodeManager]: episodes_json: Optional[Path] = input_base / series / 'episodes.json' @@ -39,7 +39,7 @@ def _create_episode_manager( return EpisodeManager(episodes_json, series, logger) @staticmethod - def _ensure_output_dir(base_dir: Path, series: str) -> Path: + def __ensure_output_dir(base_dir: Path, series: str) -> Path: series_output_dir = base_dir / series series_output_dir.mkdir(parents=True, exist_ok=True) return series_output_dir @@ -51,11 +51,11 @@ def build( force_rerun: bool = False, with_episode_manager: bool = True, ) -> PipelineSetup: - logger = 
PipelineContextFactory._create_logger(logger_name) + logger = PipelineContextFactory.__create_logger(logger_name) base_dir = PathResolver.get_output_base() - series_output_dir = PipelineContextFactory._ensure_output_dir(base_dir, series) + series_output_dir = PipelineContextFactory.__ensure_output_dir(base_dir, series) - state_manager = PipelineContextFactory._create_state_manager(series, series_output_dir) + state_manager = PipelineContextFactory.__create_state_manager(series, series_output_dir) context = ExecutionContext( series_name=series, @@ -68,7 +68,7 @@ def build( episode_manager = None if with_episode_manager: input_base = PathResolver.get_input_base() - episode_manager = PipelineContextFactory._create_episode_manager( + episode_manager = PipelineContextFactory.__create_episode_manager( series, input_base, logger, ) @@ -89,5 +89,5 @@ def setup_pipeline_context( return PipelineContextFactory.build(series, logger_name, force_rerun, with_episode_manager) -def create_cli_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: - return PipelineContextFactory._create_logger(command_name, loglevel) +def __create_cli_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: + return PipelineContextFactory.__create_logger(command_name, loglevel) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index b93c7659c..f08d9c9ee 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -84,21 +84,12 @@ class TranscodeSettings: def get_output_dir(series_name: str) -> Path: return get_base_output_dir(series_name) / 'transcoded_videos' - def calculate_video_bitrate_mbps(self) -> float: + def __calculate_video_bitrate_mbps(self) -> float: # pylint: disable=unused-private-member total_bitrate_mbps = self.target_file_size_mb * 8 / self.target_duration_seconds audio_bitrate_mbps = self.audio_bitrate_kbps / 1000.0 video_bitrate_mbps = total_bitrate_mbps - audio_bitrate_mbps return 
round(video_bitrate_mbps, 2) - def calculate_minrate_mbps(self, percent: float=0.5) -> float: - return round(self.calculate_video_bitrate_mbps() * percent, 2) - - def calculate_maxrate_mbps(self, percent: float=1.75) -> float: - return round(self.calculate_video_bitrate_mbps() * percent, 2) - - def calculate_bufsize_mbps(self, multiplier: float=2.0) -> float: - return round(self.calculate_video_bitrate_mbps() * multiplier, 2) - @dataclass class SceneDetectionSettings: threshold: float = 0.5 @@ -140,7 +131,7 @@ class WhisperSettings: model: str = 'large-v3-turbo' @classmethod - def _from_env(cls) -> 'WhisperSettings': + def __from_env(cls) -> 'WhisperSettings': # pylint: disable=unused-private-member return cls(model=os.getenv('WHISPER_MODEL', 'large-v3-turbo')) @dataclass @@ -158,7 +149,7 @@ class ElevenLabsSettings(BaseAPISettings): max_attempts: int = 60 @classmethod - def _from_env(cls) -> 'ElevenLabsSettings': + def __from_env(cls) -> 'ElevenLabsSettings': # pylint: disable=unused-private-member api_key = None if os.getenv('ELEVEN_API_KEY'): api_key = SecretStr(os.getenv('ELEVEN_API_KEY', '')) @@ -209,7 +200,7 @@ class EmotionDetectionSettings: model_name: str = 'enet_b2_8' @classmethod - def _from_env(cls) -> 'EmotionDetectionSettings': + def __from_env(cls) -> 'EmotionDetectionSettings': # pylint: disable=unused-private-member model_name = os.getenv('EMOTION_MODEL_NAME', 'enet_b2_8') return cls(model_name=model_name) @@ -225,18 +216,6 @@ class CharacterSettings: def get_output_dir(series_name: str) -> Path: return get_base_output_dir(series_name) / 'characters' - @staticmethod - def get_characters_list_file(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'characters.json' - - @staticmethod - def get_detections_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'character_detections' - - @staticmethod - def get_processed_references_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 
'character_references_processed' - @dataclass class ObjectDetectionSettings: model_name: str = 'ustc-community/dfine-xlarge-obj2coco' @@ -246,10 +225,6 @@ class ObjectDetectionSettings: def get_output_dir(series_name: str) -> Path: return get_base_output_dir(series_name) / 'object_detections' - @staticmethod - def get_visualized_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'object_detections' / 'visualizations' - @dataclass class ImageHashSettings: @@ -269,7 +244,7 @@ class ImageScraperSettings(BaseAPISettings): page_navigation_timeout: int = 30000 @classmethod - def _from_env(cls) -> 'ImageScraperSettings': + def __from_env(cls) -> 'ImageScraperSettings': # pylint: disable=unused-private-member api_key = None if os.getenv('SERPAPI_API_KEY'): api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) @@ -293,14 +268,14 @@ class ElasticsearchSettings: password: str = '' @classmethod - def _from_env(cls) -> 'ElasticsearchSettings': + def __from_env(cls) -> 'ElasticsearchSettings': # pylint: disable=unused-private-member return cls(host=os.getenv('ES_HOST', ''), user=os.getenv('ES_USER', ''), password=os.getenv('ES_PASS', '')) @dataclass class GeminiSettings(BaseAPISettings): @classmethod - def _from_env(cls) -> 'GeminiSettings': + def __from_env(cls) -> 'GeminiSettings': # pylint: disable=unused-private-member api_key = None if os.getenv('GEMINI_API_KEY'): api_key = SecretStr(os.getenv('GEMINI_API_KEY', '')) @@ -331,10 +306,10 @@ class Settings: # pylint: disable=too-many-instance-attributes transcription: TranscriptionSettings @classmethod - def _from_env(cls) -> 'Settings': + def __from_env(cls) -> 'Settings': # pylint: disable=unused-private-member return cls( output_subdirs=OutputSubdirs(), - whisper=WhisperSettings._from_env(), + whisper=WhisperSettings.__from_env(), text_chunking=TextChunkingSettings(), embedding_model=EmbeddingModelSettings(), embedding=EmbeddingSettings(), @@ -347,11 +322,11 @@ def _from_env(cls) -> 'Settings': 
object_detection=ObjectDetectionSettings(), face_recognition=FaceRecognitionSettings(), face_clustering=FaceClusteringSettings(), - emotion_detection=EmotionDetectionSettings._from_env(), - image_scraper=ImageScraperSettings._from_env(), - elevenlabs=ElevenLabsSettings._from_env(), - elasticsearch=ElasticsearchSettings._from_env(), - gemini=GeminiSettings._from_env(), + emotion_detection=EmotionDetectionSettings.__from_env(), + image_scraper=ImageScraperSettings.__from_env(), + elevenlabs=ElevenLabsSettings.__from_env(), + elasticsearch=ElasticsearchSettings.__from_env(), + gemini=GeminiSettings.__from_env(), transcode=TranscodeSettings(), transcription=TranscriptionSettings(), ) @@ -417,4 +392,4 @@ class IndexConfig: def to_dict(self) -> Dict[str, Any]: return {'name': self.name, 'transcription_jsons': str(self.transcription_jsons), 'dry_run': self.dry_run, 'append': self.append} -settings = Settings._from_env() +settings = Settings.__from_env() diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 65acd13d5..ceaaace12 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -8,13 +8,13 @@ ) -def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: +def __deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: result: Dict[str, Any] = base.copy() for key, value in override.items(): if key.startswith('_'): continue if key in result and isinstance(result[key], dict) and isinstance(value, dict): - result[key] = _deep_merge(result[key], value) + result[key] = __deep_merge(result[key], value) else: result[key] = value return result @@ -173,7 +173,7 @@ def __load_defaults() -> Dict[str, Any]: return {k: v for k, v in data.items() if not k.startswith('_')} @staticmethod - def _load_from_file(config_path: Path) -> 'SeriesConfig': + def __load_from_file(config_path: Path) -> 'SeriesConfig': if not config_path.exists(): raise FileNotFoundError( 
f"Series config not found: {config_path}\n" @@ -190,7 +190,7 @@ def _load_from_file(config_path: Path) -> 'SeriesConfig': if not k.startswith('_') } - merged_config: Dict[str, Any] = _deep_merge(defaults, series_filtered) + merged_config: Dict[str, Any] = __deep_merge(defaults, series_filtered) return SeriesConfig.__load_from_dict(merged_config) @@ -199,4 +199,4 @@ def load(series_name: str) -> 'SeriesConfig': config_dir: Path = Path('preprocessor/series_configs') config_path: Path = config_dir / f'{series_name}.json' - return SeriesConfig._load_from_file(config_path) + return SeriesConfig.__load_from_file(config_path) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index a81b0a990..52f33fc94 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -27,7 +27,7 @@ class Config: arbitrary_types_allowed = True @model_validator(mode='after') - def maxrate_must_be_greater_than_bitrate(self) -> Self: + def __maxrate_must_be_greater_than_bitrate(self) -> Self: # pylint: disable=unused-private-member if self.maxrate_mbps < self.video_bitrate_mbps: raise ValueError('maxrate must be >= video_bitrate') return self diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index f7d9927fe..6d52b4e75 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -14,10 +14,7 @@ Tuple, ) -from preprocessor.config.constants import ( - FILE_SUFFIXES, - SUPPORTED_VIDEO_EXTENSIONS, -) +from preprocessor.config.constants import SUPPORTED_VIDEO_EXTENSIONS from preprocessor.core.path_manager import PathManager from preprocessor.core.state_manager import StateManager from preprocessor.lib.core.logging import ErrorHandlingLogger @@ -55,10 +52,6 @@ def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, self.path_manager: PathManager = args.get('path_manager', PathManager(self.series_name)) self.progress = 
args.get('progress_tracker', ProgressTracker()) - @classmethod - def get_video_glob_patterns(cls) -> List[str]: - return [f'*{ext}' for ext in cls.SUPPORTED_VIDEO_EXTENSIONS] - @abstractmethod def _validate_args(self, args: Dict[str, Any]) -> None: pass @@ -82,34 +75,9 @@ def cleanup(self) -> None: def _load_resources(self) -> bool: return True - def _get_processing_info(self) -> List[str]: + def __get_processing_info(self) -> List[str]: return [] - @staticmethod - def _get_episode_processing_items_from_metadata( - metadata_pattern: str, - base_dir: Path, - episode_manager: 'EpisodeManager', - ) -> List[ProcessingItem]: - all_metadata_files = list(base_dir.glob(metadata_pattern)) - items = [] - for metadata_file in all_metadata_files: - episode_info = episode_manager.parse_filename(metadata_file) - if not episode_info: - continue - episode_id = episode_manager.get_episode_id_for_state(episode_info) - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=metadata_file, - metadata={ - 'episode_info': episode_info, - 'series_name': episode_manager.series_name, - }, - ), - ) - return items - def _get_processing_items(self) -> List[ProcessingItem]: raise NotImplementedError( f'{self.__class__.__name__} must implement _get_processing_items() ' @@ -145,7 +113,7 @@ def __to_snake_case(name: str) -> str: name = re.sub('(.)([A-Z][a-z]+)', '\\1_\\2', name) return re.sub('([a-z0-9])([A-Z])', '\\1_\\2', name).lower() - def _should_skip_item( + def __should_skip_item( self, item: ProcessingItem, ) -> Tuple[bool, List[OutputSpec], str]: expected_outputs = self._get_expected_outputs(item) @@ -187,7 +155,7 @@ def _execute(self) -> None: skipped_count = 0 skip_messages = [] for item in all_items: - should_skip, missing_outputs, skip_message = self._should_skip_item(item) + should_skip, missing_outputs, skip_message = self.__should_skip_item(item) if should_skip: if skip_message: skip_messages.append(skip_message) @@ -213,7 +181,7 @@ def __execute_processing(self, 
items: List[ProcessingItem]) -> None: if not items: console.print('[yellow]No items to process, skipping resource loading[/yellow]') return - for info_line in self._get_processing_info(): + for info_line in self.__get_processing_info(): console.print(info_line) if not self._load_resources(): return @@ -224,7 +192,7 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: for item in items: try: if self.state_manager: - temp_files = self._get_temp_files(item) + temp_files = self.__get_temp_files(item) self.state_manager.mark_step_started(step_name, item.episode_id, temp_files) missing_outputs = item.metadata.get('missing_outputs', []) self._process_item(item, missing_outputs) @@ -238,127 +206,8 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: console.print('\n[yellow]Processing interrupted[/yellow]') raise - def _get_temp_files(self, item: ProcessingItem) -> List[str]: # pylint: disable=unused-argument + def __get_temp_files(self, item: ProcessingItem) -> List[str]: # pylint: disable=unused-argument return [] def _get_progress_description(self) -> str: return f'Processing {self.__class__.__name__}' - - def _create_video_processing_items( - self, - source_path: Path, - extensions: List[str], - episode_manager: 'EpisodeManager', - skip_unparseable: bool = True, - subdirectory_filter: Optional[str] = None, - ) -> List[ProcessingItem]: - series_name = self.series_name - if not source_path.is_file(): - if source_path.name != series_name: - source_path = source_path / series_name - if not source_path.exists(): - raise FileNotFoundError( - f'Input directory does not exist: {source_path}\n' - f'Expected structure: /input_data/{series_name}/S01/, ' - f'/input_data/{series_name}/S02/, etc.\n\n' - f'Migration guide:\n' - f' mkdir -p /input_data/{series_name}\n' - f' mv /input_data/S* /input_data/{series_name}/', - ) - video_files = [] - if source_path.is_file(): - video_files = [source_path] - else: - for ext in extensions: - if 
subdirectory_filter: - pattern = f'**/{subdirectory_filter}/{ext}' - else: - pattern = f'**/{ext}' - video_files.extend(source_path.glob(pattern)) - items = [] - for video_file in sorted(video_files): - episode_info = episode_manager.parse_filename(video_file) - if not episode_info: - if skip_unparseable: - self.logger.error( - f'Cannot parse episode info from {video_file.name}', - ) - continue - episode_id = video_file.stem - else: - from preprocessor.lib.episodes import EpisodeManager # pylint: disable=import-outside-toplevel - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - items.append( - ProcessingItem( - episode_id=episode_id, - input_path=video_file, - metadata={'episode_info': episode_info}, - ), - ) - return items - - def _create_transcription_processing_item( - self, transcription_file: Path, - ) -> ProcessingItem: - base_name = ( - transcription_file.stem - .replace(FILE_SUFFIXES['segmented'], '') - .replace(FILE_SUFFIXES['simple'], '') - ) - episode_info = ( - self.episode_manager.parse_filename(transcription_file) - if hasattr(self, 'episode_manager') - else None - ) - if episode_info: - from preprocessor.lib.episodes import EpisodeManager # pylint: disable=import-outside-toplevel - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - else: - episode_id = base_name - return ProcessingItem( - episode_id=episode_id, - input_path=transcription_file, - metadata={'base_name': base_name}, - ) - - def _build_output_path( - self, episode_info, filename: str, subdir: Optional[str] = None, - ) -> Path: - target_subdir = subdir if subdir is not None else self.get_output_subdir() - return self.path_manager.build_path(episode_info, target_subdir, filename) - - def _build_output_paths( - self, episode_info, filenames: List[str], subdir: Optional[str] = None, - ) -> List[Path]: - return [ - self._build_output_path(episode_info, filename, subdir) - for filename in filenames - ] - - def _build_season_path( - self, episode_info, 
filename: str, subdir: Optional[str] = None, - ) -> Path: - target_subdir = subdir if subdir is not None else self.get_output_subdir() - return self.path_manager.build_season_path(episode_info, target_subdir, filename) - - def _build_filename( - self, episode_info, extension: str = 'json', suffix: Optional[str] = None, - ) -> str: - return self.path_manager.build_filename( - episode_info, extension=extension, suffix=suffix, - ) - - def _build_single_output( - self, - item: ProcessingItem, - suffix: str, - extension: str = 'json', - subdir: Optional[str] = None, - required: bool = True, - ) -> List[OutputSpec]: - episode_info = item.metadata.get('episode_info') - if not episode_info: - return [] - filename = self._build_filename(episode_info, extension=extension, suffix=suffix) - path = self._build_output_path(episode_info, filename, subdir=subdir) - return [OutputSpec(path=path, required=required)] diff --git a/preprocessor/core/processing_metadata.py b/preprocessor/core/processing_metadata.py index 761726a5e..e8cca6716 100644 --- a/preprocessor/core/processing_metadata.py +++ b/preprocessor/core/processing_metadata.py @@ -3,7 +3,6 @@ field, ) from datetime import datetime -import json from pathlib import Path from typing import ( Any, @@ -28,13 +27,6 @@ def start(self): self.start_time = datetime.now() self.status = 'running' - def finish(self, exit_code: int): - self.end_time = datetime.now() - self.exit_code = exit_code - if self.start_time: - self.duration_seconds = (self.end_time - self.start_time).total_seconds() - self.status = 'success' if exit_code == 0 else 'failed' - def skip(self): self.status = 'skipped' @@ -82,15 +74,6 @@ def add_step(self, name: str, step_num: str) -> StepMetadata: self.steps.append(step) return step - def finish_processing( - self, final_exit_code: int, additional_stats: Optional[Dict[str, Any]] = None, - ): - self.end_time = datetime.now() - self.total_duration_seconds = (self.end_time - self.start_time).total_seconds() - 
self.final_status = 'success' if final_exit_code == 0 else 'failed' - if additional_stats: - self.params['additional_statistics'] = additional_stats - def __get_statistics(self) -> Dict[str, Any]: completed_steps = [s for s in self.steps if s.status == 'success'] failed_steps = [s for s in self.steps if s.status == 'failed'] @@ -125,8 +108,3 @@ def to_dict(self) -> Dict[str, Any]: 'steps': [step.to_dict() for step in self.steps], 'statistics': self.__get_statistics(), } - - def save_to_file(self, output_path: Path): - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(self.to_dict(), f, indent=2, ensure_ascii=False) diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 81f1d37e0..2e74427e8 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -6,8 +6,6 @@ from datetime import datetime import json from pathlib import Path -import signal -import sys from typing import ( Any, Dict, @@ -49,7 +47,7 @@ def to_dict(self) -> Dict[str, Any]: } @classmethod - def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': + def __from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': # pylint: disable=unused-private-member completed_steps = [ StepCheckpoint(**step) for step in data.get('completed_steps', []) ] @@ -73,15 +71,13 @@ def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: state_filename: str = self.STATE_FILE_TEMPLATE.format(series=series_name) self.__state_file: Path = working_dir / state_filename self.__state: Optional[ProcessingState] = None - self.__cleanup_registered: bool = False - self.__interrupted: bool = False def load_or_create_state(self) -> ProcessingState: if self.__state_file.exists(): console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') with open(self.__state_file, 'r', encoding='utf-8') as f: data = json.load(f) - self.__state = 
ProcessingState._from_dict(data) + self.__state = ProcessingState.__from_dict(data) console.print(f'[green]Loaded state for series: {self.__state.series_name}[/green]') console.print(f'[green]Completed steps: {len(self.__state.completed_steps)}[/green]') return self.__state @@ -138,7 +134,7 @@ def is_step_completed(self, step: str, episode: str) -> bool: for s in self.__state.completed_steps ) - def __rollback_in_progress(self) -> None: + def __rollback_in_progress(self) -> None: # pylint: disable=unused-private-member if self.__state is None or self.__state.in_progress is None: return console.print( @@ -160,32 +156,3 @@ def cleanup(self) -> None: if self.__state_file.exists(): console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') self.__state_file.unlink() - - def register_interrupt_handler(self) -> None: - if self.__cleanup_registered: - return - - def _signal_handler(_sig: int, _frame: Any) -> None: - if self.__interrupted: - console.print('\n[red]Force quit! Not cleaning up.[/red]') - sys.exit(1) - self.__interrupted = True - console.print('\n[yellow]Interrupt received (Ctrl+C)...[/yellow]') - console.print('[yellow]Rolling back incomplete work...[/yellow]') - self.__rollback_in_progress() - console.print('[green]Cleanup complete. 
You can resume later.[/green]') - console.print('[blue]To resume: run the same command again[/blue]') - sys.exit(0) - signal.signal(signal.SIGINT, _signal_handler) - signal.signal(signal.SIGTERM, _signal_handler) - self.__cleanup_registered = True - console.print('[blue]Interrupt handler registered (Ctrl+C to safely stop)[/blue]') - - def get_resume_info(self) -> Optional[str]: - if self.__state is None or not self.__state.completed_steps: - return None - last_step = self.__state.completed_steps[-1] - return ( - f'Resuming from: {last_step.step} ({last_step.episode}) ' - f'at {last_step.completed_at}' - ) diff --git a/preprocessor/lib/ai/models.py b/preprocessor/lib/ai/models.py index c0a8f5ec8..4e9788a96 100644 --- a/preprocessor/lib/ai/models.py +++ b/preprocessor/lib/ai/models.py @@ -20,7 +20,8 @@ class EpisodeInfo(BaseModel): @field_validator('viewership', mode='before') @classmethod @staticmethod - def _convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: + def __convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: + # pylint: disable=unused-private-member if v is None: return None if isinstance(v, int): @@ -35,7 +36,8 @@ class SeasonMetadata(BaseModel): @model_validator(mode='before') @classmethod @staticmethod - def _convert_old_format(cls, data: dict) -> dict: + def __convert_old_format(cls, data: dict) -> dict: + # pylint: disable=unused-private-member # pylint: disable=unused-private-member if isinstance(data, dict) and 'episodes' in data: for idx, episode in enumerate(data['episodes'], start=1): if isinstance(episode, dict) and 'episode_number' in episode and ('episode_in_season' not in episode): diff --git a/preprocessor/lib/ai/provider.py b/preprocessor/lib/ai/provider.py index 945a9d5c8..0362921ee 100644 --- a/preprocessor/lib/ai/provider.py +++ b/preprocessor/lib/ai/provider.py @@ -55,7 +55,8 @@ def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[Parse else: self.__client = 
VLLMClient(model_name=model_name) - def extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: + def __extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: + # pylint: disable=unused-private-member return self.__process_llm_request( system_prompt=extract_season_system.get(), user_prompt=extract_season_user.get().format(url=url, page_text=page_text), @@ -63,7 +64,8 @@ def extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMe error_context=f'extraction failed for {url}', ) - def extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: + def __extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: + # pylint: disable=unused-private-member return self.__process_llm_request( system_prompt=extract_episode_metadata_system.get(), user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), @@ -71,7 +73,8 @@ def extract_episode_metadata(self, page_text: str, url: str) -> Optional[Episode error_context=f'extraction failed for {url}', ) - def merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: + def __merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: + # pylint: disable=unused-private-member if not metadata_list: raise ValueError('No metadata to merge') if len(metadata_list) == 1: diff --git a/preprocessor/lib/characters/reference_downloader.py b/preprocessor/lib/characters/reference_downloader.py index 1abc2edcb..824fc76e1 100644 --- a/preprocessor/lib/characters/reference_downloader.py +++ b/preprocessor/lib/characters/reference_downloader.py @@ -118,7 +118,7 @@ def __count_faces(self, img) -> int: return len(faces) @staticmethod - def _validate_and_decode_image( + def __validate_and_decode_image( img_bytes: bytes, img_url: str, logger, ) -> np.ndarray | None: if not img_bytes: @@ -148,7 +148,7 @@ def __download_image_with_browser( if 'image' 
not in content_type: return None img_bytes = response.body() - return self._validate_and_decode_image(img_bytes, img_url, self.logger) + return self.__validate_and_decode_image(img_bytes, img_url, self.logger) except TimeoutError: self.logger.debug(f'Timeout downloading image {img_url}') return None diff --git a/preprocessor/lib/core/logging.py b/preprocessor/lib/core/logging.py index 4bfed151e..82f903989 100644 --- a/preprocessor/lib/core/logging.py +++ b/preprocessor/lib/core/logging.py @@ -46,18 +46,6 @@ def __setup_logger(self, level: int) -> None: ) self.__logger: logging.Logger = logging.getLogger(self.__class_name) - def log(self, level: int, message: str) -> None: - if level == logging.ERROR: - self.__logger.error(message) - elif level == logging.INFO: - self.__logger.info(message) - elif level == logging.WARNING: - self.__logger.warning(message) - elif level == logging.DEBUG: - self.__logger.debug(message) - else: - raise RuntimeError(f'Logging level {level} is not supported.') - def info(self, message: str) -> None: self.__logger.info(message) diff --git a/preprocessor/lib/episodes/episode_manager.py b/preprocessor/lib/episodes/episode_manager.py index 9e56471e8..08958db7c 100644 --- a/preprocessor/lib/episodes/episode_manager.py +++ b/preprocessor/lib/episodes/episode_manager.py @@ -37,7 +37,7 @@ def season_code(self) -> str: def episode_num(self) -> str: return f'E{self.relative_episode:02d}' - def is_special(self) -> bool: + def __is_special(self) -> bool: # pylint: disable=unused-private-member return self.season == 0 class EpisodeManager: @@ -51,7 +51,7 @@ def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: with open(episodes_info_json, 'r', encoding='utf-8') as f: self.episodes_data = json.load(f) - def _create_episode_info( + def __create_episode_info( self, season: int, relative_episode: int, @@ -86,13 +86,13 @@ def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: def 
get_episode_by_season_and_relative(self, season: int, relative_episode: int) -> EpisodeInfo: if not self.episodes_data: - return self._create_episode_info(season, relative_episode) + return self.__create_episode_info(season, relative_episode) for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): if season_data.get(EpisodesDataKeys.SEASON_NUMBER) == season: episodes = sorted(season_data.get(EpisodesDataKeys.EPISODES, []), key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0)) if 0 < relative_episode <= len(episodes): ep_data = episodes[relative_episode - 1] - return self._create_episode_info( + return self.__create_episode_info( season=season, relative_episode=relative_episode, title=ep_data.get(EpisodeMetadataKeys.TITLE), @@ -105,10 +105,10 @@ def get_episode_by_season_and_relative(self, season: int, relative_episode: int) f'Processing S{season:02d}E{relative_episode:02d} with filename-only metadata. ' f'Scrape episode info for season {season} to get title, premiere date, etc.', ) - return self._create_episode_info(season, relative_episode) + return self.__create_episode_info(season, relative_episode) @staticmethod - def find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: + def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: # pylint: disable=unused-private-member if not search_dir.exists(): return None if search_dir.is_file(): @@ -125,7 +125,7 @@ def find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Pat return video_file return None - def find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]: + def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]: # pylint: disable=unused-private-member if not search_dir.exists(): return None season_dir_name = episode_info.season_code() @@ -142,7 +142,7 @@ def 
find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, p return None @staticmethod - def find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: + def __find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: if not search_dir.exists(): return None episode_code = episode_info.episode_code() @@ -152,14 +152,14 @@ def find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> O return None @staticmethod - def load_scene_timestamps( + def __load_scene_timestamps( # pylint: disable=unused-private-member episode_info: EpisodeInfo, search_dir: Optional[Path], _logger: Optional[ErrorHandlingLogger]=None, ) -> Optional[List[Dict[str, Any]]]: if not search_dir: return None - scene_file = EpisodeManager.find_scene_timestamps_file(episode_info, search_dir) + scene_file = EpisodeManager.__find_scene_timestamps_file(episode_info, search_dir) if not scene_file: return None try: @@ -184,7 +184,7 @@ def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]: def get_episode_id_for_state(episode_info: EpisodeInfo) -> str: return episode_info.episode_code() - def list_all_episodes(self) -> List[EpisodeInfo]: + def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-private-member episodes: List[EpisodeInfo] = [] if not self.episodes_data: return episodes @@ -193,7 +193,7 @@ def list_all_episodes(self) -> List[EpisodeInfo]: season_episodes = sorted(season_data.get(EpisodesDataKeys.EPISODES, []), key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0)) for idx, ep_data in enumerate(season_episodes): episodes.append( - self._create_episode_info( + self.__create_episode_info( season=season_num, relative_episode=idx + 1, title=ep_data.get(EpisodeMetadataKeys.TITLE), diff --git a/preprocessor/lib/io/files.py b/preprocessor/lib/io/files.py index bc0bf9ed2..e3efbd026 100644 --- a/preprocessor/lib/io/files.py +++ b/preprocessor/lib/io/files.py @@ -10,7 +10,7 
@@ class FileOperations: @staticmethod - def _atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: + def __atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: temp_path = path.with_suffix(path.suffix + '.tmp') try: write_func(temp_path) @@ -26,7 +26,7 @@ def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None: def __write(temp: Path) -> None: with open(temp, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=indent) - FileOperations._atomic_write(path, __write) + FileOperations.__atomic_write(path, __write) @staticmethod def load_json(path: Path) -> Dict[str, Any]: @@ -34,12 +34,12 @@ def load_json(path: Path) -> Dict[str, Any]: return json.load(f) @staticmethod - def atomic_write_text(path: Path, content: str) -> None: + def __atomic_write_text(path: Path, content: str) -> None: # pylint: disable=unused-private-member def __write(temp: Path) -> None: with open(temp, 'w', encoding='utf-8') as f: f.write(content) - FileOperations._atomic_write(path, __write) + FileOperations.__atomic_write(path, __write) def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None: FileOperations.atomic_write_json(path, data, indent) diff --git a/preprocessor/lib/io/hashing.py b/preprocessor/lib/io/hashing.py index a53f0642b..cc040a9ad 100644 --- a/preprocessor/lib/io/hashing.py +++ b/preprocessor/lib/io/hashing.py @@ -15,7 +15,7 @@ class HashStorage: @staticmethod - def save_image_hashes_to_json( + def __save_image_hashes_to_json( # pylint: disable=unused-private-member episode_info: EpisodeInfo, hash_results: List[Dict[str, Any]], series_name: str, diff --git a/preprocessor/lib/io/metadata.py b/preprocessor/lib/io/metadata.py index 6ee162575..d1ef87124 100644 --- a/preprocessor/lib/io/metadata.py +++ b/preprocessor/lib/io/metadata.py @@ -12,7 +12,7 @@ class MetadataBuilder: @staticmethod - def create_minimal_episode_info(episode_info) -> Dict[str, Any]: + def 
__create_minimal_episode_info(episode_info) -> Dict[str, Any]: return {'season': episode_info.season, 'episode_number': episode_info.relative_episode} @staticmethod @@ -25,7 +25,7 @@ def create_processing_metadata( ) -> Dict[str, Any]: return { 'generated_at': datetime.now().isoformat(), - 'episode_info': MetadataBuilder.create_minimal_episode_info(episode_info), + 'episode_info': MetadataBuilder.__create_minimal_episode_info(episode_info), 'processing_parameters': processing_params, 'statistics': statistics, results_key: results_data, diff --git a/preprocessor/lib/media/resolution.py b/preprocessor/lib/media/resolution.py index c489a6169..5930f39c3 100644 --- a/preprocessor/lib/media/resolution.py +++ b/preprocessor/lib/media/resolution.py @@ -26,7 +26,7 @@ def __str__(self): return f'{self.height}p' @classmethod - def from_str(cls: Type[T], init: str) -> T: + def __from_str(cls: Type[T], init: str) -> T: # pylint: disable=unused-private-member init = init.strip() if not init[0].isalpha(): init = 'R' + init.upper() @@ -35,5 +35,5 @@ def from_str(cls: Type[T], init: str) -> T: return cls[init] @classmethod - def get_all_choices(cls) -> List[str]: + def __get_all_choices(cls) -> List[str]: # pylint: disable=unused-private-member return [str(r) for r in cls] diff --git a/preprocessor/lib/search/embedding_model.py b/preprocessor/lib/search/embedding_model.py index 671c162f8..da0a7b000 100644 --- a/preprocessor/lib/search/embedding_model.py +++ b/preprocessor/lib/search/embedding_model.py @@ -19,5 +19,5 @@ def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[Li return self._service.get_text_embedding(text) return [self._service.get_text_embedding(t) for t in text] - def encode_image(self, image_path: str) -> List[float]: + def __encode_image(self, image_path: str) -> List[float]: # pylint: disable=unused-private-member return self._service.get_image_embedding(image_path) diff --git a/preprocessor/lib/text/text_statistics.py 
b/preprocessor/lib/text/text_statistics.py index 5c8702eb1..da972bfee 100644 --- a/preprocessor/lib/text/text_statistics.py +++ b/preprocessor/lib/text/text_statistics.py @@ -51,16 +51,16 @@ def from_file(cls, file_path: Path, language: str='pl') -> 'TextStatistics': with open(file_path, 'r', encoding='utf-8') as f: text = f.read() stats = cls(text=text, language=language) - stats.calculate() + stats.__calculate() return stats @classmethod - def from_text(cls, text: str, language: str='pl') -> 'TextStatistics': + def __from_text(cls, text: str, language: str='pl') -> 'TextStatistics': # pylint: disable=unused-private-member stats = cls(text=text, language=language) - stats.calculate() + stats.__calculate() return stats - def calculate(self) -> None: + def __calculate(self) -> None: # pylint: disable=unused-private-member self.__calculate_basic_stats() self.__calculate_character_stats() self.__calculate_word_stats() diff --git a/preprocessor/lib/transcription/processors/sound_separator.py b/preprocessor/lib/transcription/processors/sound_separator.py index 13ec57d69..ad53d49b4 100644 --- a/preprocessor/lib/transcription/processors/sound_separator.py +++ b/preprocessor/lib/transcription/processors/sound_separator.py @@ -245,7 +245,7 @@ def format_timestamp(seconds: float) -> str: millis = int(seconds % 1 * 1000) return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' - def _write_srt(segments: List[Dict], output_path: Path) -> None: + def __write_srt(segments: List[Dict], output_path: Path) -> None: with open(output_path, 'w', encoding='utf-8') as f: for idx, seg in enumerate(segments, start=1): words = seg.get('words', []) @@ -260,8 +260,8 @@ def _write_srt(segments: List[Dict], output_path: Path) -> None: f.write(f'{idx}\n') f.write(f'{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n') f.write(f'{text}\n\n') - _write_srt(dialogue_segments, clean_srt) - _write_srt(sound_segments, sound_srt) + __write_srt(dialogue_segments, clean_srt) + 
__write_srt(sound_segments, sound_srt) def _get_progress_description(self) -> str: return 'Separating sound events from dialogues' diff --git a/preprocessor/lib/transcription/utils.py b/preprocessor/lib/transcription/utils.py index 4cbb01f35..994d66158 100644 --- a/preprocessor/lib/transcription/utils.py +++ b/preprocessor/lib/transcription/utils.py @@ -10,7 +10,7 @@ class TranscriptionUtils: @staticmethod - def fix_unicode(file_path: Path) -> None: + def __fix_unicode(file_path: Path) -> None: # pylint: disable=unused-private-member if not file_path.exists(): return with open(file_path, 'r', encoding='utf-8') as f: diff --git a/preprocessor/lib/ui/console.py b/preprocessor/lib/ui/console.py index 4f7d631e0..3be9d1fd0 100644 --- a/preprocessor/lib/ui/console.py +++ b/preprocessor/lib/ui/console.py @@ -8,7 +8,7 @@ _console_instance = None -def _get_console() -> Console: +def __get_console() -> Console: global _console_instance # pylint: disable=global-statement if _console_instance is None: in_docker = ( @@ -91,4 +91,4 @@ def __exit__(self, exc_type, exc_val, exc_tb): def create_progress() -> SimpleProgress: return SimpleProgress() -console = _get_console() +console = __get_console() diff --git a/preprocessor/lib/ui/progress.py b/preprocessor/lib/ui/progress.py index bb61b86ea..58be57e8c 100644 --- a/preprocessor/lib/ui/progress.py +++ b/preprocessor/lib/ui/progress.py @@ -13,7 +13,7 @@ def __init__(self): self.start_time: Optional[float] = None @contextmanager - def track_operation(self, operation_name: str, total: int): + def __track_operation(self, operation_name: str, total: int): # pylint: disable=unused-private-member self.current_operation = operation_name self.start_time = time.time() console.print(f' [cyan]{operation_name} (total: {total})...[/cyan]') diff --git a/preprocessor/lib/validation/file_validators.py b/preprocessor/lib/validation/file_validators.py index 4ff1dfbb0..0c8f3a537 100644 --- a/preprocessor/lib/validation/file_validators.py +++ 
b/preprocessor/lib/validation/file_validators.py @@ -28,14 +28,14 @@ class ValidationResult: class FileValidator: @staticmethod - def _check_file_exists(path: Path) -> Optional[ValidationResult]: + def __check_file_exists(path: Path) -> Optional[ValidationResult]: if not path.exists(): return ValidationResult(is_valid=False, error_message=f'File does not exist: {path}') return None @staticmethod def validate_json_file(path: Path) -> ValidationResult: - if (error := FileValidator._check_file_exists(path)): + if (error := FileValidator.__check_file_exists(path)): return error try: with open(path, 'r', encoding='utf-8') as f: @@ -51,7 +51,7 @@ def validate_json_file(path: Path) -> ValidationResult: @staticmethod def validate_jsonl_file(path: Path) -> ValidationResult: - if (error := FileValidator._check_file_exists(path)): + if (error := FileValidator.__check_file_exists(path)): return error try: line_count = 0 @@ -80,7 +80,7 @@ def validate_jsonl_file(path: Path) -> ValidationResult: @staticmethod def validate_image_file(path: Path) -> ValidationResult: - if (error := FileValidator._check_file_exists(path)): + if (error := FileValidator.__check_file_exists(path)): return error try: with Image.open(path) as img: @@ -103,7 +103,7 @@ def validate_image_file(path: Path) -> ValidationResult: @staticmethod def validate_video_file(path: Path) -> ValidationResult: - if (error := FileValidator._check_file_exists(path)): + if (error := FileValidator.__check_file_exists(path)): return error try: result = subprocess.run( @@ -141,8 +141,8 @@ def validate_video_file(path: Path) -> ValidationResult: return ValidationResult(is_valid=False, error_message=f'Error validating video: {e}') @staticmethod - def validate_archive_file(path: Path) -> ValidationResult: - if (error := FileValidator._check_file_exists(path)): + def __validate_archive_file(path: Path) -> ValidationResult: # pylint: disable=unused-private-member + if (error := FileValidator.__check_file_exists(path)): return error 
try: with zipfile.ZipFile(path, 'r') as zip_ref: diff --git a/preprocessor/lib/video/emotion_utils.py b/preprocessor/lib/video/emotion_utils.py index 6ef99be2d..832571f2c 100644 --- a/preprocessor/lib/video/emotion_utils.py +++ b/preprocessor/lib/video/emotion_utils.py @@ -16,7 +16,7 @@ class EmotionDetector: @staticmethod - def init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: + def __init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: # pylint: disable=unused-private-member model_name = settings.emotion_detection.model_name if logger: logger.info(f'Loading HSEmotion model: {model_name}...') @@ -29,7 +29,7 @@ def init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognize raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e @staticmethod - def _process_emotion_result( + def __process_emotion_result( emotion: str, scores: np.ndarray, ) -> Tuple[str, float, Dict[str, float]]: @@ -48,12 +48,12 @@ def detect( ) -> Tuple[str, float, Dict[str, float]]: try: emotion, scores = model.predict_emotions(face_image, logits=False) - return EmotionDetector._process_emotion_result(emotion, scores) + return EmotionDetector.__process_emotion_result(emotion, scores) except Exception as e: raise RuntimeError(f'Emotion detection failed: {e}') from e @staticmethod - def _clip_bbox( + def __clip_bbox( x1: int, y1: int, x2: int, @@ -68,11 +68,11 @@ def _clip_bbox( return (x1, y1, x2, y2) @staticmethod - def crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: + def __crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: # pylint: disable=unused-private-member try: x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) height, width = frame.shape[:2] - x1, y1, x2, y2 = EmotionDetector._clip_bbox(x1, y1, x2, y2, width, height) + x1, y1, x2, y2 = EmotionDetector.__clip_bbox(x1, y1, x2, y2, width, height) if x2 <= x1 or y2 <= y1: return 
None face_crop = frame[y1:y2, x1:x2] @@ -81,7 +81,7 @@ def crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: return None @staticmethod - def detect_batch( + def __detect_batch( # pylint: disable=unused-private-member face_images: List[np.ndarray], model: HSEmotionRecognizer, batch_size: int = 32, @@ -101,12 +101,12 @@ def detect_batch( try: batch_results = model.predict_multi_emotions(batch, logits=False) for emotion, scores in batch_results: - results.append(EmotionDetector._process_emotion_result(emotion, scores)) + results.append(EmotionDetector.__process_emotion_result(emotion, scores)) except Exception: for face_img in batch: try: emotion, scores = model.predict_emotions(face_img, logits=False) - results.append(EmotionDetector._process_emotion_result(emotion, scores)) + results.append(EmotionDetector.__process_emotion_result(emotion, scores)) except Exception: results.append(None) return results diff --git a/preprocessor/lib/video/frame_utils.py b/preprocessor/lib/video/frame_utils.py index 616cbea15..d45523646 100644 --- a/preprocessor/lib/video/frame_utils.py +++ b/preprocessor/lib/video/frame_utils.py @@ -12,7 +12,7 @@ class FrameLoader: @staticmethod - def _load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: + def __load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: if 'frame_path' in request: frame_path = frames_dir / request['frame_path'] else: @@ -28,5 +28,5 @@ def _load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) - @staticmethod def load_from_requests(frames_dir: Path, frame_requests: List[Dict[str, Any]], convert_rgb: bool=False, num_workers: int=4) -> List[Image.Image]: with ThreadPoolExecutor(max_workers=num_workers) as executor: - images = list(executor.map(lambda req: FrameLoader._load_single(frames_dir, req, convert_rgb), frame_requests)) + images = list(executor.map(lambda req: FrameLoader.__load_single(frames_dir, req, 
convert_rgb), frame_requests)) return images diff --git a/preprocessor/lib/video/image_hasher.py b/preprocessor/lib/video/image_hasher.py index b84248a25..f121ffdb4 100644 --- a/preprocessor/lib/video/image_hasher.py +++ b/preprocessor/lib/video/image_hasher.py @@ -16,7 +16,7 @@ def __init__(self) -> None: if torch.cuda.is_available(): self.model = self.model.cuda() - def compute_hash(self, image_tensor: torch.Tensor) -> int: + def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable=unused-private-member if self.model is None: raise RuntimeError('Model not initialized or already cleaned up') with torch.no_grad(): diff --git a/preprocessor/modules/audio/separation.py b/preprocessor/modules/audio/separation.py index 95cdb4bd6..10d382b87 100644 --- a/preprocessor/modules/audio/separation.py +++ b/preprocessor/modules/audio/separation.py @@ -73,18 +73,18 @@ def execute( # pylint: disable=too-many-locals for segment in segments: classification = classify_segment(segment) if classification == 'dialogue': - cleaned = self._clean_segment_text(segment) + cleaned = self.__clean_segment_text(segment) dialogue_segments.append(cleaned) elif classification == 'sound_event': - cleaned = self._clean_segment_text(segment) + cleaned = self.__clean_segment_text(segment) cleaned['sound_type'] = 'sound' sound_segments.append(cleaned) elif classification == 'mixed': - dialogue_parts, sound_parts = self._split_mixed_segment(segment) + dialogue_parts, sound_parts = self.__split_mixed_segment(segment) dialogue_segments.extend(dialogue_parts) sound_segments.extend(sound_parts) - dialogue_segments = self._renumber_segments(dialogue_segments) - sound_segments = self._renumber_segments(sound_segments) + dialogue_segments = self.__renumber_segments(dialogue_segments) + sound_segments = self.__renumber_segments(sound_segments) clean_data = {'episode_info': episode_info_dict, 'segments': dialogue_segments} sound_data = {'episode_info': episode_info_dict, 'segments': 
sound_segments} atomic_write_json(clean_json, clean_data) @@ -103,10 +103,10 @@ def execute( # pylint: disable=too-many-locals sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}" clean_srt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}" sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}" - self._generate_txt_file(clean_json, clean_txt) - self._generate_txt_file(sound_json, sound_txt) - self._generate_srt_file(dialogue_segments, clean_srt) - self._generate_srt_file(sound_segments, sound_srt) + self.__generate_txt_file(clean_json, clean_txt) + self.__generate_txt_file(sound_json, sound_txt) + self.__generate_srt_file(dialogue_segments, clean_srt) + self.__generate_srt_file(sound_segments, sound_srt) context.mark_step_completed(self.name, input_data.episode_id) return TranscriptionData( path=clean_json, @@ -118,10 +118,10 @@ def execute( # pylint: disable=too-many-locals ) @staticmethod - def _is_sound_event_text(text: str) -> bool: + def __is_sound_event_text(text: str) -> bool: # pylint: disable=unused-private-member return bool(re.match(r'^\(.*\)$', text.strip())) - def _split_mixed_segment( + def __split_mixed_segment( self, segment: Dict[str, Any], ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: @@ -139,7 +139,7 @@ def _split_mixed_segment( continue if word_type != current_type: if current_words and current_type: - self._finalize_sequence( + self.__finalize_sequence( current_type, current_words, current_start, @@ -152,7 +152,7 @@ def _split_mixed_segment( else: current_words.append(word) if current_words and current_type: - self._finalize_sequence( + self.__finalize_sequence( current_type, current_words, current_start, @@ -162,7 +162,7 @@ def _split_mixed_segment( return (dialogue_parts, sound_parts) @staticmethod - def _finalize_sequence( + def __finalize_sequence( seq_type: str, words: List[Dict[str, Any]], start: float, @@ -188,7 +188,7 @@ 
def _finalize_sequence( dialogue_parts.append(new_segment) @staticmethod - def _clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: + def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: cleaned = segment.copy() text = cleaned.get('text', '') text = re.sub('\\s+', ' ', text) @@ -202,13 +202,13 @@ def _clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: return cleaned @staticmethod - def _renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + def __renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: for i, seg in enumerate(segments): seg['id'] = i return segments @staticmethod - def _generate_txt_file(json_path: Path, txt_path: Path) -> None: + def __generate_txt_file(json_path: Path, txt_path: Path) -> None: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) segments = data.get('segments', []) @@ -223,20 +223,20 @@ def _generate_txt_file(json_path: Path, txt_path: Path) -> None: f.write(' '.join(text_lines)) @staticmethod - def _generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: + def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: with open(srt_path, 'w', encoding='utf-8') as f: for idx, seg in enumerate(segments, 1): start = seg.get('start', 0) end = seg.get('end', 0) text = seg.get('text', '').strip() - start_time = SoundSeparationStep._format_srt_time(start) - end_time = SoundSeparationStep._format_srt_time(end) + start_time = SoundSeparationStep.__format_srt_time(start) + end_time = SoundSeparationStep.__format_srt_time(end) f.write(f'{idx}\n') f.write(f'{start_time} --> {end_time}\n') f.write(f'{text}\n\n') @staticmethod - def _format_srt_time(seconds: float) -> str: + def __format_srt_time(seconds: float) -> str: hours = int(seconds // 3600) minutes = int(seconds % 3600 // 60) secs = int(seconds % 60) diff --git a/preprocessor/modules/scraping/reference_processor.py 
b/preprocessor/modules/scraping/reference_processor.py index cc6be995a..3814045bb 100644 --- a/preprocessor/modules/scraping/reference_processor.py +++ b/preprocessor/modules/scraping/reference_processor.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from datetime import datetime import json import logging @@ -7,6 +8,7 @@ Dict, List, Optional, + Tuple, ) import warnings @@ -59,8 +61,8 @@ def __safe_resize(img: np.ndarray, target_size: tuple) -> Optional[np.ndarray]: if img.shape[0] == 0 or img.shape[1] == 0: return None try: - return cv2.resize(img, target_size) - except cv2.error as e: + return cv2.resize(img, target_size) # pylint: disable=no-member + except cv2.error as e: # pylint: disable=catching-non-exception logging.error(f'OpenCV resize error: {e}') return None @@ -104,7 +106,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[FaceData]]: all_faces = [] for idx, img_path in enumerate(image_paths): - img = cv2.imread(str(img_path)) + img = cv2.imread(str(img_path)) # pylint: disable=no-member if img is None: console.print(f'[yellow]Warning: Could not read {img_path}[/yellow]') all_faces.append([]) @@ -299,8 +301,10 @@ def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: # p for col_idx in range(num_refs): label = f'Ref {col_idx + 1}' x = padding + col_idx * (face_size + padding) - cv2.putText( grid, label, (x + 10, 20), - cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1, + cv2.putText( # pylint: disable=no-member + grid, label, (x + 10, 20), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + 0.5, (0, 0, 0), 1, ) for cand_idx, candidate in enumerate(candidates): y_base = label_height + padding + cand_idx * (face_size + label_height + padding) @@ -311,8 +315,10 @@ def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: # p if face_resized is not None: grid[y:y + face_size, x:x + face_size] = 
face_resized label = f'Candidate {cand_idx + 1}' - cv2.putText( grid, label, (5, y_base + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1, ) + cv2.putText( # pylint: disable=no-member + grid, label, (5, y_base + face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1, # pylint: disable=no-member + ) else: faces_data = data num_faces = len(faces_data) @@ -332,11 +338,11 @@ def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: # p if face_resized is not None: grid[y:y + face_size, x:x + face_size] = face_resized label = str(idx + 1) - cv2.putText( + cv2.putText( # pylint: disable=no-member grid, label, (x + 5, y + 20), - cv2.FONT_HERSHEY_SIMPLEX, + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member 0.7, (0, 0, 255), 2, @@ -344,7 +350,7 @@ def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: # p selection_grids_dir = self.output_dir.parent / 'character_selection_grids' selection_grids_dir.mkdir(parents=True, exist_ok=True) output_path = selection_grids_dir / f"{char_name.replace(' ', '_').lower()}_selection.jpg" - cv2.imwrite(str(output_path), grid) + cv2.imwrite(str(output_path), grid) # pylint: disable=no-member return output_path def __save_processed_references( # pylint: disable=too-many-locals @@ -362,7 +368,7 @@ def __save_processed_references( # pylint: disable=too-many-locals self.logger.warning(f'Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)') continue face_output_path = char_output_dir / f'face_{idx:02d}.jpg' - cv2.imwrite(str(face_output_path), face_normalized) + cv2.imwrite(str(face_output_path), face_normalized) # pylint: disable=no-member face_vectors.append(face_data.face_vector) mean_vector = np.mean(face_vectors, axis=0) vector_path = char_output_dir / 'face_vector.npy' @@ -400,224 +406,326 @@ def __save_processed_references( # pylint: disable=too-many-locals def _get_progress_description(self) -> str: return 'Processing character references' - def 
generate_validation_grid(self) -> None: # pylint: disable=too-many-locals,too-many-statements - output_path = self.output_dir / 'validation_grid.png' - if output_path.exists(): - console.print(f'[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]') - return - console.print('\n[blue]Generating validation grid...[/blue]') - if not self.output_dir.exists(): - console.print('[yellow]No processed references found, skipping validation grid[/yellow]') - return - processed_chars = sorted([d for d in self.output_dir.iterdir() if d.is_dir()]) - if not processed_chars: - console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') - return - face_size = 280 - padding = 15 - row_height = face_size + padding * 2 - header_height = 180 - footer_height = 80 - label_col_width = 350 - stats_col_width = 200 - face_col_width = face_size + padding - faces_per_char = 3 - grid_width = label_col_width + stats_col_width + faces_per_char * face_col_width + padding * 2 - grid_height = header_height + len(processed_chars) * row_height + footer_height - bg_color = (250, 252, 255) - grid = np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) + def _execute(self) -> None: + super()._execute() + self.generate_validation_grid() + + @dataclass + class _GridDimensions: + face_size: int = 280 + padding: int = 15 + header_height: int = 180 + footer_height: int = 80 + label_col_width: int = 350 + stats_col_width: int = 200 + header_row_height: int = 40 + faces_per_char: int = 3 + + @property + def row_height(self) -> int: + return self.face_size + self.padding * 2 + + @property + def face_col_width(self) -> int: + return self.face_size + self.padding + + def total_width(self) -> int: + return ( + self.label_col_width + + self.stats_col_width + + self.faces_per_char * self.face_col_width + + self.padding * 2 + ) + + def total_height(self, num_chars: int) -> int: + return self.header_height + num_chars * self.row_height + self.footer_height + + 
@staticmethod + def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: + metadata_all = [] + for char_dir in processed_chars: + metadata_file = char_dir / 'metadata.json' + if metadata_file.exists(): + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata_all.append(json.load(f)) + return metadata_all + + @staticmethod + def __render_header( + grid: np.ndarray, + dims: _GridDimensions, + total_chars: int, + avg_similarity: float, + threshold: float, + ) -> None: header_bg_color = (45, 55, 72) - cv2.rectangle(grid, (0, 0), (grid_width, header_height), header_bg_color, -1) + cv2.rectangle(grid, (0, 0), (dims.total_width(), dims.header_height), header_bg_color, -1) # pylint: disable=no-member + title_text = 'FACIAL REFERENCE VALIDATION REPORT' - cv2.putText( + cv2.putText( # pylint: disable=no-member grid, title_text, - (padding * 3, 50), - cv2.FONT_HERSHEY_DUPLEX, + (dims.padding * 3, 50), + cv2.FONT_HERSHEY_DUPLEX, # pylint: disable=no-member 1.1, (255, 255, 255), 2, - cv2.LINE_AA, + cv2.LINE_AA, # pylint: disable=no-member ) + subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' - cv2.putText( + cv2.putText( # pylint: disable=no-member grid, subtitle, - (padding * 3, 85), - cv2.FONT_HERSHEY_SIMPLEX, + (dims.padding * 3, 85), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member 0.55, (200, 210, 220), 1, - cv2.LINE_AA, + cv2.LINE_AA, # pylint: disable=no-member ) - metadata_all = [] - for char_dir in processed_chars: - metadata_file = char_dir / 'metadata.json' - if metadata_file.exists(): - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata_all.append(json.load(f)) - total_chars = len(processed_chars) - avg_similarity = np.mean([m.get('average_similarity', 0) for m in metadata_all]) if metadata_all else 0 - threshold = self.similarity_threshold + stats_y = 115 - stats_items = [f'Total Subjects: {total_chars}', f'Avg Similarity: {avg_similarity:.4f}', f'Threshold: {threshold:.2f}'] 
+ stats_items = [ + f'Total Subjects: {total_chars}', + f'Avg Similarity: {avg_similarity:.4f}', + f'Threshold: {threshold:.2f}', + ] for idx, stat in enumerate(stats_items): - x_pos = padding * 3 + idx * 280 - cv2.putText( + x_pos = dims.padding * 3 + idx * 280 + cv2.putText( # pylint: disable=no-member grid, stat, (x_pos, stats_y), - cv2.FONT_HERSHEY_SIMPLEX, + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member 0.5, (180, 200, 220), 1, - cv2.LINE_AA, + cv2.LINE_AA, # pylint: disable=no-member ) - table_header_y = header_height + 1 - cv2.line(grid, (0, table_header_y), (grid_width, table_header_y), (180, 190, 200), 2) + + @staticmethod + def __render_table_headers(grid: np.ndarray, dims: _GridDimensions) -> None: + table_header_y = dims.header_height + 1 + cv2.line(grid, (0, table_header_y), (dims.total_width(), table_header_y), (180, 190, 200), 2) # pylint: disable=no-member + col_headers = [ - ('CHARACTER NAME', label_col_width // 2, 0), - ('STATISTICS', label_col_width + stats_col_width // 2, 0), - ('REFERENCE IMAGE 1', label_col_width + stats_col_width + face_col_width // 2, 0), - ('REFERENCE IMAGE 2', label_col_width + stats_col_width + face_col_width * 3 // 2, 0), - ('REFERENCE IMAGE 3', label_col_width + stats_col_width + face_col_width * 5 // 2, 0), + ('CHARACTER NAME', dims.label_col_width // 2, 0), + ('STATISTICS', dims.label_col_width + dims.stats_col_width // 2, 0), + ('REFERENCE IMAGE 1', dims.label_col_width + dims.stats_col_width + dims.face_col_width // 2, 0), + ('REFERENCE IMAGE 2', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 3 // 2, 0), + ('REFERENCE IMAGE 3', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 5 // 2, 0), ] - header_row_height = 40 + for text, x_center, _ in col_headers: - text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] + text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] # pylint: disable=no-member text_x = x_center - text_size[0] // 2 - 
cv2.putText( + cv2.putText( # pylint: disable=no-member grid, text, (text_x, table_header_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member 0.42, (60, 70, 85), 1, - cv2.LINE_AA, + cv2.LINE_AA, # pylint: disable=no-member ) - cv2.line( + + cv2.line( # pylint: disable=no-member grid, - (0, table_header_y + header_row_height), - (grid_width, table_header_y + header_row_height), + (0, table_header_y + dims.header_row_height), + (dims.total_width(), table_header_y + dims.header_row_height), (200, 210, 220), 1, ) - y_offset = header_height + header_row_height + padding - for idx, char_dir in enumerate(processed_chars): - char_name = char_dir.name.replace('_', ' ').title() - metadata_file = char_dir / 'metadata.json' - if idx % 2 == 0: - row_bg = (245, 248, 252) - else: - row_bg = bg_color - cv2.rectangle( grid, (0, y_offset - padding), - (grid_width, y_offset + face_size + padding), row_bg, -1, + + def __render_character_row( # pylint: disable=too-many-locals + self, + grid: np.ndarray, + dims: _GridDimensions, + char_dir: Path, + row_idx: int, + y_offset: int, + bg_color: Tuple[int, int, int], + ) -> None: + char_name = char_dir.name.replace('_', ' ').title() + metadata_file = char_dir / 'metadata.json' + + row_bg = (245, 248, 252) if row_idx % 2 == 0 else bg_color + cv2.rectangle( # pylint: disable=no-member + grid, + (0, y_offset - dims.padding), + (dims.total_width(), y_offset + dims.face_size + dims.padding), + row_bg, + -1, + ) + + cv2.putText( # pylint: disable=no-member + grid, + char_name, + (dims.padding * 2, y_offset + dims.face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + 0.55, + (30, 40, 50), + 1, + cv2.LINE_AA, # pylint: disable=no-member + ) + + if metadata_file.exists(): + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) + + similarity = metadata.get('average_similarity', 0.0) + method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') 
+ faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) + + stats_x = dims.label_col_width + dims.padding + stats_y_base = y_offset + dims.face_size // 2 - 30 + + sim_color = (0, 150, 0) if similarity >= self.similarity_threshold else (180, 100, 0) + cv2.putText( # pylint: disable=no-member + grid, + f'Similarity: {similarity:.4f}', + (stats_x, stats_y_base), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + 0.45, + sim_color, + 1, + cv2.LINE_AA, # pylint: disable=no-member ) - cv2.putText( grid, char_name, (padding * 2, y_offset + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, 0.55, (30, 40, 50), 1, cv2.LINE_AA, ) - if metadata_file.exists(): - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata = json.load(f) - similarity = metadata.get('average_similarity', 0.0) - method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') - faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) - stats_x = label_col_width + padding - stats_y_base = y_offset + face_size // 2 - 30 - sim_color = (0, 150, 0) if similarity >= threshold else (180, 100, 0) - cv2.putText( - grid, - f'Similarity: {similarity:.4f}', - (stats_x, stats_y_base), - cv2.FONT_HERSHEY_SIMPLEX, - 0.45, - sim_color, - 1, - cv2.LINE_AA, - ) - method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) - cv2.putText( - grid, - f'Method: {method}', - (stats_x, stats_y_base + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.42, - method_color, - 1, - cv2.LINE_AA, - ) - faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' - cv2.putText( - grid, - f'Detected: {faces_str}', - (stats_x, stats_y_base + 50), - cv2.FONT_HERSHEY_SIMPLEX, - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, - ) - face_files = sorted(char_dir.glob('face_*.jpg')) - for face_idx, face_file in enumerate(face_files[:faces_per_char]): - face_img = cv2.imread(str(face_file)) - if face_img is None: - 
continue - face_resized = CharacterReferenceProcessor.__safe_resize(face_img, (face_size, face_size)) - if face_resized is None: - continue - x = label_col_width + stats_col_width + face_idx * face_col_width + padding - y = y_offset - grid[y:y + face_size, x:x + face_size] = face_resized - border_color = (180, 190, 200) - cv2.rectangle( - grid, - (x - 1, y - 1), - (x + face_size + 1, y + face_size + 1), - border_color, - 1, - ) - y_offset += row_height - footer_y = grid_height - footer_height + 20 - cv2.line(grid, (0, footer_y - 20), (grid_width, footer_y - 20), (200, 210, 220), 1) + + method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) + cv2.putText( # pylint: disable=no-member + grid, + f'Method: {method}', + (stats_x, stats_y_base + 25), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + 0.42, + method_color, + 1, + cv2.LINE_AA, # pylint: disable=no-member + ) + + faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' + cv2.putText( # pylint: disable=no-member + grid, + f'Detected: {faces_str}', + (stats_x, stats_y_base + 50), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + 0.38, + (100, 110, 120), + 1, + cv2.LINE_AA, # pylint: disable=no-member + ) + + face_files = sorted(char_dir.glob('face_*.jpg')) + for face_idx, face_file in enumerate(face_files[:dims.faces_per_char]): + face_img = cv2.imread(str(face_file)) # pylint: disable=no-member + if face_img is None: + continue + + face_resized = self.__safe_resize(face_img, (dims.face_size, dims.face_size)) + if face_resized is None: + continue + + x = dims.label_col_width + dims.stats_col_width + face_idx * dims.face_col_width + dims.padding + y = y_offset + grid[y:y + dims.face_size, x:x + dims.face_size] = face_resized + + border_color = (180, 190, 200) + cv2.rectangle( # pylint: disable=no-member + grid, + (x - 1, y - 1), + (x + dims.face_size + 1, y + dims.face_size + 1), + border_color, + 1, + ) + + @staticmethod + def 
__render_footer(grid: np.ndarray, dims: _GridDimensions, grid_height: int) -> None: + footer_y = grid_height - dims.footer_height + 20 + cv2.line(grid, (0, footer_y - 20), (dims.total_width(), footer_y - 20), (200, 210, 220), 1) # pylint: disable=no-member + footer_text = ( f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " f"Model: {settings.face_recognition.model_name} | " f"Normalized Size: {settings.character.normalized_face_size[0]}x" f"{settings.character.normalized_face_size[1]}px" ) - cv2.putText( + cv2.putText( # pylint: disable=no-member grid, footer_text, - (padding * 3, footer_y), - cv2.FONT_HERSHEY_SIMPLEX, + (dims.padding * 3, footer_y), + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member 0.4, (120, 130, 140), 1, - cv2.LINE_AA, + cv2.LINE_AA, # pylint: disable=no-member ) + legend_y = footer_y + 30 legend_items = [ ('Automatic: Face found on all references', (50, 120, 200)), ('Manual: User-selected reference', (180, 100, 50)), ] for idx, (text, color) in enumerate(legend_items): - x_pos = padding * 3 + idx * 380 - cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) - cv2.putText( + x_pos = dims.padding * 3 + idx * 380 + cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) # pylint: disable=no-member + cv2.putText( # pylint: disable=no-member grid, text, (x_pos + 15, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, + cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member 0.38, (100, 110, 120), 1, - cv2.LINE_AA, + cv2.LINE_AA, # pylint: disable=no-member ) - cv2.imwrite( + + def generate_validation_grid(self) -> None: + output_path = self.output_dir / 'validation_grid.png' + if output_path.exists(): + console.print(f'[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]') + return + + console.print('\n[blue]Generating validation grid...[/blue]') + + if not self.output_dir.exists(): + console.print('[yellow]No processed references found, skipping validation grid[/yellow]') + return + + processed_chars = sorted([d for d in 
self.output_dir.iterdir() if d.is_dir()]) + if not processed_chars: + console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') + return + + dims = self._GridDimensions() + grid_width = dims.total_width() + grid_height = dims.total_height(len(processed_chars)) + bg_color = (250, 252, 255) + grid = np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) + + metadata_all = self.__load_all_metadata(processed_chars) + avg_similarity = ( + np.mean([m.get('average_similarity', 0) for m in metadata_all]) if metadata_all else 0 + ) + + self.__render_header(grid, dims, len(processed_chars), avg_similarity, self.similarity_threshold) + self.__render_table_headers(grid, dims) + + y_offset = dims.header_height + dims.header_row_height + dims.padding + for idx, char_dir in enumerate(processed_chars): + self.__render_character_row(grid, dims, char_dir, idx, y_offset, bg_color) + y_offset += dims.row_height + + self.__render_footer(grid, dims, grid_height) + + cv2.imwrite( # pylint: disable=no-member str(output_path), grid, - [cv2.IMWRITE_PNG_COMPRESSION, 6], + [cv2.IMWRITE_PNG_COMPRESSION, 6], # pylint: disable=no-member ) + console.print(f'[green]✓ Validation grid saved to: {output_path}[/green]') console.print(f'[green] Grid size: {grid_width}x{grid_height}px[/green]') console.print(f'[green] Characters: {len(processed_chars)}[/green]') diff --git a/preprocessor/modules/search/clients/elasticsearch_queries.py b/preprocessor/modules/search/clients/elasticsearch_queries.py index 9bff301bf..bbea17e6d 100644 --- a/preprocessor/modules/search/clients/elasticsearch_queries.py +++ b/preprocessor/modules/search/clients/elasticsearch_queries.py @@ -18,23 +18,23 @@ def __init__(self, embedding_service: EmbeddingService, index_base: str) -> None self._index_base = index_base @property - def _segments_index(self) -> str: + def __segments_index(self) -> str: return f'{self._index_base}_text_segments' @property - def _text_embeddings_index(self) -> 
str: + def __text_embeddings_index(self) -> str: return f'{self._index_base}_text_embeddings' @property - def _video_frames_index(self) -> str: + def __video_frames_index(self) -> str: return f'{self._index_base}_video_frames' @property - def _episode_names_index(self) -> str: + def __episode_names_index(self) -> str: return f'{self._index_base}_episode_names' @staticmethod - def _build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: + def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: filters = [] if season is not None: filters.append({'term': {'episode_metadata.season': season}}) @@ -53,10 +53,10 @@ async def search_text_query( must_clauses = [ {'multi_match': {'query': query, 'fields': ['text^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, ] - must_clauses.extend(self._build_episode_filters(season, episode)) + must_clauses.extend(self.__build_episode_filters(season, episode)) query_body = {'bool': {'must': must_clauses}} return await es_client.search( - index=self._segments_index, + index=self.__segments_index, query=query_body, size=limit, _source=[ @@ -74,7 +74,7 @@ async def search_text_semantic( limit: int=10, ) -> Dict[str, Any]: embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = self._build_episode_filters(season, episode) + filter_clauses = self.__build_episode_filters(season, episode) knn_query: Dict[str, Any] = { 'field': 'text_embedding', 'query_vector': embedding, @@ -84,7 +84,7 @@ async def search_text_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index=self._text_embeddings_index, + index=self.__text_embeddings_index, knn=knn_query, size=limit, _source=[ @@ -103,7 +103,7 @@ async def search_video_semantic( limit: int=10, ) -> Dict[str, Any]: embedding = self._embedding_service.get_image_embedding(image_path) - filter_clauses = self._build_episode_filters(season, episode) + 
filter_clauses = self.__build_episode_filters(season, episode) if character: filter_clauses.append({ 'nested': { @@ -120,7 +120,7 @@ async def search_video_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, knn=knn_query, size=limit, _source=[ @@ -139,7 +139,7 @@ async def search_text_to_video( limit: int=10, ) -> Dict[str, Any]: embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = self._build_episode_filters(season, episode) + filter_clauses = self.__build_episode_filters(season, episode) if character: filter_clauses.append({ 'nested': { @@ -156,7 +156,7 @@ async def search_text_to_video( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, knn=knn_query, size=limit, _source=[ @@ -179,9 +179,9 @@ async def search_by_character( 'query': {'term': {'character_appearances.name': character}}, }, }] - must_clauses.extend(self._build_episode_filters(season, episode)) + must_clauses.extend(self.__build_episode_filters(season, episode)) return await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, query={'bool': {'must': must_clauses}}, size=limit, _source=[ @@ -203,7 +203,7 @@ async def search_by_emotion( if character: nested_must.append({'term': {'character_appearances.name': character}}) must_clauses = [{'nested': {'path': 'character_appearances', 'query': {'bool': {'must': nested_must}}}}] - must_clauses.extend(self._build_episode_filters(season, episode)) + must_clauses.extend(self.__build_episode_filters(season, episode)) nested_filter: Dict[str, Any] = {'term': {'character_appearances.emotion.label': emotion}} if character: nested_filter = { @@ -215,7 +215,7 @@ async def search_by_emotion( }, } return await es_client.search( - index=self._video_frames_index, + 
index=self.__video_frames_index, query={'bool': {'must': must_clauses}}, sort=[{ 'character_appearances.emotion.confidence': { @@ -239,7 +239,7 @@ async def search_by_object( episode: Optional[int]=None, limit: int=20, ) -> Dict[str, Any]: - filter_clauses = self._build_episode_filters(season, episode) + filter_clauses = self.__build_episode_filters(season, episode) must_clauses: List[Dict[str, Any]] = [] if ':' in object_query: object_class, count_filter = object_query.split(':', 1) @@ -299,7 +299,7 @@ async def search_by_object( query_body = {'bool': {'must': must_clauses, 'filter': filter_clauses}} object_class = object_query.split(':')[0].strip() if ':' in object_query else object_query.strip() return await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, query=query_body, sort=[{ 'detected_objects.count': { @@ -325,7 +325,7 @@ async def search_perceptual_hash( limit: int=10, ) -> Dict[str, Any]: return await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, query={'term': {'perceptual_hash': phash}}, size=limit, _source=[ @@ -336,7 +336,7 @@ async def search_perceptual_hash( async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: result = await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, size=0, aggs={ 'characters_nested': { @@ -354,7 +354,7 @@ async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str async def list_objects(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: result = await es_client.search( - index=self._video_frames_index, + index=self.__video_frames_index, size=0, aggs={ 'objects_nested': { @@ -384,7 +384,7 @@ async def search_episode_name( must_clauses.append({'term': {'episode_metadata.season': season}}) query_body = {'bool': {'must': must_clauses}} return await es_client.search( - index=self._episode_names_index, + index=self.__episode_names_index, 
query=query_body, size=limit, _source=['episode_id', 'title', 'video_path', 'episode_metadata'], @@ -410,7 +410,7 @@ async def search_episode_name_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await es_client.search( - index=self._episode_names_index, + index=self.__episode_names_index, knn=knn_query, size=limit, _source=['episode_id', 'title', 'video_path', 'episode_metadata'], @@ -418,8 +418,8 @@ async def search_episode_name_semantic( async def get_stats(self, es_client: AsyncElasticsearch) -> Dict[str, int]: return { - 'segments': (await es_client.count(index=self._segments_index))['count'], - 'text_embeddings': (await es_client.count(index=self._text_embeddings_index))['count'], - 'video_embeddings': (await es_client.count(index=self._video_frames_index))['count'], - 'episode_names': (await es_client.count(index=self._episode_names_index))['count'], + 'segments': (await es_client.count(index=self.__segments_index))['count'], + 'text_embeddings': (await es_client.count(index=self.__text_embeddings_index))['count'], + 'video_embeddings': (await es_client.count(index=self.__video_frames_index))['count'], + 'episode_names': (await es_client.count(index=self.__episode_names_index))['count'], } diff --git a/preprocessor/modules/search/clients/hash_service.py b/preprocessor/modules/search/clients/hash_service.py index c2b03af81..6c927019b 100644 --- a/preprocessor/modules/search/clients/hash_service.py +++ b/preprocessor/modules/search/clients/hash_service.py @@ -16,7 +16,7 @@ class HashService: def __init__(self) -> None: self._hasher: Optional[PerceptualHasher] = None - def _load_hasher(self) -> PerceptualHasher: + def __load_hasher(self) -> PerceptualHasher: if self._hasher is not None: return self._hasher click.echo('Loading perceptual hasher...', err=True) @@ -27,7 +27,7 @@ def _load_hasher(self) -> PerceptualHasher: return self._hasher def get_perceptual_hash(self, image_path: Union[str, Path]) -> Optional[str]: - hasher = 
self._load_hasher() + hasher = self.__load_hasher() image = Image.open(image_path).convert('RGB') hashes = hasher.compute_phash_batch([image]) # pylint: disable=no-member return hashes[0] if hashes else None diff --git a/preprocessor/modules/search/clients/result_formatters.py b/preprocessor/modules/search/clients/result_formatters.py index 0c88ecaf2..440a847d7 100644 --- a/preprocessor/modules/search/clients/result_formatters.py +++ b/preprocessor/modules/search/clients/result_formatters.py @@ -22,7 +22,7 @@ def format_timestamp(seconds: float) -> str: return f'{minutes}m {secs:.1f}s' @staticmethod - def _format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: + def __format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: if not scene_info: return '' start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0)) @@ -81,7 +81,7 @@ def print_results(result: Dict[str, Any], result_type: str='text') -> None: source = hit[ElasticsearchKeys.SOURCE] score = hit[ElasticsearchKeys.SCORE] meta = source[EpisodeMetadataKeys.EPISODE_METADATA] - scene_ctx = ResultFormatter._format_scene_context(source.get('scene_info')) + scene_ctx = ResultFormatter.__format_scene_context(source.get('scene_info')) click.echo(f'\n[{i}] Score: {score:.2f}') season_code = 'S00' if meta['season'] == 0 else f"S{meta['season']:02d}" click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") diff --git a/preprocessor/modules/search/document_generation.py b/preprocessor/modules/search/document_generation.py index f15310e6a..d472ad13f 100644 --- a/preprocessor/modules/search/document_generation.py +++ b/preprocessor/modules/search/document_generation.py @@ -32,18 +32,18 @@ def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDoc return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=0) context.logger.info(f'Generating Elasticsearch documents for 
{episode_id}') context.mark_step_started(self.name, episode_id) - data = self._gather_input_data(episode_info, context) + data = self.__gather_input_data(episode_info, context) generated_files = [] total_docs = 0 if self.config.generate_segments and 'transcription' in data: - path, count = self._generate_segments_jsonl(data, episode_info, context) + path, count = self.__generate_segments_jsonl(data, episode_info, context) generated_files.append(path) total_docs += count context.mark_step_completed(self.name, episode_id) return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=total_docs) @staticmethod - def _gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: + def __gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: data = {} clean_filename = f'{context.series_name}_{episode_info.episode_code()}_clean_transcription.json' clean_path = context.get_output_path(episode_info, 'transcriptions/clean', clean_filename) @@ -59,11 +59,11 @@ def _gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str data['scenes'] = load_json(scene_path) return data - def _generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext) -> tuple[Path, int]: + def __generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext) -> tuple[Path, int]: output_filename = f'{context.series_name}_{episode_info.episode_code()}_text_segments.jsonl' output_path = context.get_output_path(episode_info, 'elastic_documents/text_segments', output_filename) segments = data['transcription'].get('segments', []) - episode_metadata = self._build_episode_metadata(episode_info, context) + episode_metadata = self.__build_episode_metadata(episode_info, context) filename = f'{context.series_name}_{episode_info.episode_code()}.mp4' video_bot_path = 
f'bot/{context.series_name.upper()}-WIDEO/{episode_info.season_code()}/{filename}' count = 0 @@ -84,5 +84,5 @@ def _generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, cont return (output_path, count) @staticmethod - def _build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: + def __build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: return {'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name} diff --git a/preprocessor/modules/search/indexing.py b/preprocessor/modules/search/indexing.py index 88035dbf7..741277ba8 100644 --- a/preprocessor/modules/search/indexing.py +++ b/preprocessor/modules/search/indexing.py @@ -69,7 +69,7 @@ async def _execute_async( if not self.config.append: await self._es.delete_index() - mapping: Optional[Dict[str, Any]] = self._get_mapping_for_type(doc_type) + mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type(doc_type) if mapping: await self._es.create_index(mapping) @@ -103,7 +103,7 @@ async def _execute_async( ) @staticmethod - def _get_mapping_for_type( + def __get_mapping_for_type( doc_type: str, # pylint: disable=unused-argument ) -> Optional[Dict[str, Any]]: return None diff --git a/preprocessor/modules/text/embeddings.py b/preprocessor/modules/text/embeddings.py index d0de6ec8c..c7462c0c6 100644 --- a/preprocessor/modules/text/embeddings.py +++ b/preprocessor/modules/text/embeddings.py @@ -70,7 +70,7 @@ def execute( # pylint: disable=too-many-locals output_path, len(emb_data.get('results', [])), ) - transcription: Dict[str, Any] = self._load_clean_transcription(input_data, context) + transcription: Dict[str, Any] = self.__load_clean_transcription(input_data, context) segments: List[Dict[str, Any]] = transcription.get('segments', []) if not segments: context.logger.warning(f'No text segments for embedding in {input_data.episode_id}') @@ -84,7 +84,7 @@ def execute( # 
pylint: disable=too-many-locals context.logger.info(f'Generating text embeddings for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) full_text: str = ' '.join([seg.get('text', '') for seg in segments]) - sentences: List[str] = self._split_into_sentences(full_text) + sentences: List[str] = self.__split_into_sentences(full_text) text_chunks: List[str] = [] chunk_metadata: List[Dict[str, Any]] = [] step: int = self.config.text_sentences_per_chunk - self.config.text_chunk_overlap @@ -97,8 +97,8 @@ def execute( # pylint: disable=too-many-locals continue char_start: int = sum((len(s) + 1 for s in sentences[:i])) char_end: int = char_start + len(chunk_text) - start_seg_id: int = self._find_segment_at_position(segments, char_start) - end_seg_id: int = self._find_segment_at_position(segments, char_end) + start_seg_id: int = self.__find_segment_at_position(segments, char_start) + end_seg_id: int = self.__find_segment_at_position(segments, char_end) text_chunks.append(chunk_text) chunk_metadata.append({'segment_range': [start_seg_id, end_seg_id], 'text': chunk_text}) results: List[Dict[str, Any]] = [] @@ -123,7 +123,7 @@ def execute( # pylint: disable=too-many-locals return self._create_embedding_collection(input_data, output_path, len(results)) @staticmethod - def _load_clean_transcription( + def __load_clean_transcription( input_data: TranscriptionData, context: ExecutionContext, # pylint: disable=unused-argument ) -> Dict[str, Any]: @@ -137,7 +137,7 @@ def _load_clean_transcription( return load_json(raw_path) @staticmethod - def _split_into_sentences(text: str) -> List[str]: + def __split_into_sentences(text: str) -> List[str]: normalized_text: str = re.sub('\\.{2,}', '.', text) sentences: List[str] = re.split('([.!?]+(?:\\s+|$))', normalized_text) result: List[str] = [] @@ -150,7 +150,7 @@ def _split_into_sentences(text: str) -> List[str]: return result @staticmethod - def _find_segment_at_position(segments: List[Dict[str, Any]], 
char_pos: int) -> int: + def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: cumulative_length: int = 0 for idx, seg in enumerate(segments): seg_length: int = len(seg.get('text', '')) + 1 diff --git a/preprocessor/modules/text/import_step.py b/preprocessor/modules/text/import_step.py index 8b1ea8ba0..45aee8fcc 100644 --- a/preprocessor/modules/text/import_step.py +++ b/preprocessor/modules/text/import_step.py @@ -32,7 +32,7 @@ def name(self) -> str: def execute(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: if self._episode_manager is None: self._episode_manager = EpisodeManager(None, context.series_name, context.logger) - json_files: List[Path] = self._find_transcription_files() + json_files: List[Path] = self.__find_transcription_files() if not json_files: context.logger.warning(f'No transcription files found in {self.config.source_dir}') return [] @@ -40,24 +40,24 @@ def execute(self, input_data: None, context: ExecutionContext) -> List[Transcrip results: List[TranscriptionData] = [] for json_file in json_files: try: - artifact: Optional[TranscriptionData] = self._import_single_file(json_file, context) + artifact: Optional[TranscriptionData] = self.__import_single_file(json_file, context) if artifact: results.append(artifact) except Exception as e: context.logger.error(f'Failed to import {json_file.name}: {e}') return results - def _find_transcription_files(self) -> List[Path]: + def __find_transcription_files(self) -> List[Path]: pattern: str = '*.json' if self.config.format_type == '11labs_segmented': pattern = '*_segmented.json' files: List[Path] = sorted(self.config.source_dir.rglob(pattern)) return [f for f in files if not f.name.startswith('.')] - def _import_single_file(self, json_file: Path, context: ExecutionContext) -> Optional[TranscriptionData]: + def __import_single_file(self, json_file: Path, context: ExecutionContext) -> Optional[TranscriptionData]: episode_info: 
Optional['EpisodeInfo'] = self._episode_manager.parse_filename(json_file) if not episode_info: - season_num, episode_num = self._extract_season_episode_fallback(json_file) + season_num, episode_num = self.__extract_season_episode_fallback(json_file) episode_info = self._episode_manager.get_episode_by_season_and_relative(season_num, episode_num) if not episode_info: context.logger.warning(f'Could not determine episode for {json_file}') @@ -75,9 +75,9 @@ def _import_single_file(self, json_file: Path, context: ExecutionContext) -> Opt with open(json_file, 'r', encoding='utf-8') as f: source_data: Dict[str, Any] = json.load(f) if self.config.format_type == '11labs_segmented': - converted_data: Dict[str, Any] = self._convert_11labs_segmented(source_data, json_file) + converted_data: Dict[str, Any] = self.__convert_11labs_segmented(source_data, json_file) elif self.config.format_type == '11labs': - converted_data = self._convert_11labs_full(source_data, json_file) + converted_data = self.__convert_11labs_full(source_data, json_file) else: raise ValueError(f'Unknown format type: {self.config.format_type}') converted_data['episode_info'] = EpisodeManager.get_metadata(episode_info) @@ -95,7 +95,7 @@ def _import_single_file(self, json_file: Path, context: ExecutionContext) -> Opt ) @staticmethod - def _convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: segments: List[Dict[str, Any]] = [] for i, segment in enumerate(data.get('segments', [])): converted_segment: Dict[str, Any] = { @@ -113,7 +113,7 @@ def _convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[s } @staticmethod - def _convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: segments: List[Dict[str, Any]] = [] words: List[Dict[str, Any]] = data.get('words', 
[]) current_segment: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} @@ -142,7 +142,7 @@ def _convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, A } @staticmethod - def _extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: + def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: match: Optional[re.Match] = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) if match: return (int(match.group(1)), int(match.group(2))) diff --git a/preprocessor/modules/validation/episode_stats.py b/preprocessor/modules/validation/episode_stats.py index 1feb96694..59fef7be9 100644 --- a/preprocessor/modules/validation/episode_stats.py +++ b/preprocessor/modules/validation/episode_stats.py @@ -103,7 +103,7 @@ def __validate_raw_transcription(self, transcription_files: Dict[str, Path]): self.__extract_transcription_stats(raw_transcription) def __extract_transcription_stats(self, raw_transcription: Path): - data = self._load_json_safely(raw_transcription) + data = self.__load_json_safely(raw_transcription) if not data: self.errors.append(f'Error reading transcription: {raw_transcription}') return @@ -196,7 +196,7 @@ def __validate_scenes(self): if not result.is_valid: self.errors.append(f'Invalid scenes JSON: {result.error_message}') return - data = self._load_json_safely(scenes_file) + data = self.__load_json_safely(scenes_file) if not data: self.errors.append(f'Error reading scenes: {scenes_file}') return @@ -215,7 +215,7 @@ def __validate_json_directory( check_anomalies: bool = True, ): dir_path = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) - count, sizes, errors = self._validate_json_files_in_directory(dir_path, exclude_pattern) + count, sizes, errors = self.__validate_json_files_in_directory(dir_path, exclude_pattern) if not dir_path.exists(): self.warnings.append(f'Missing {subdir} directory') return @@ -233,7 +233,7 @@ def 
__validate_image_hashes(self): def __validate_visualizations(self, subdir: str, count_attr: str, context_name: str): viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) - total_count, invalid_count, errors = self._validate_images_in_directory(viz_dir) + total_count, invalid_count, errors = self.__validate_images_in_directory(viz_dir) if total_count == 0 and viz_dir.exists(): self.warnings.append(f'No visualization images in {subdir}/') return @@ -259,7 +259,7 @@ def __validate_face_clusters(self): if not result.is_valid: self.errors.append(f'Invalid face clustering metadata: {result.error_message}') return - data = self._load_json_safely(metadata_file) + data = self.__load_json_safely(metadata_file) if not data: self.errors.append(f'Error reading face clustering metadata: {metadata_file}') return @@ -420,7 +420,7 @@ def to_dict(self) -> Dict[str, Any]: } @staticmethod - def _validate_images_in_directory( + def __validate_images_in_directory( directory: Path, extensions: Tuple[str, ...] 
= ('*.jpg', '*.png'), ) -> Tuple[int, int, List[str]]: @@ -441,7 +441,7 @@ def _validate_images_in_directory( return (len(image_files), invalid_count, errors) @staticmethod - def _validate_json_files_in_directory( + def __validate_json_files_in_directory( directory: Path, exclude_pattern: Optional[str] = None, ) -> Tuple[int, List[int], List[str]]: if not directory.exists(): @@ -463,7 +463,7 @@ def _validate_json_files_in_directory( return (len(json_files), sizes, errors) @staticmethod - def _load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: try: with open(file_path, 'r', encoding='utf-8') as f: return json.load(f) diff --git a/preprocessor/modules/video/frame_export.py b/preprocessor/modules/video/frame_export.py index 655f0ec3a..95d224d3c 100644 --- a/preprocessor/modules/video/frame_export.py +++ b/preprocessor/modules/video/frame_export.py @@ -72,8 +72,8 @@ def execute(self, input_data: SceneCollection, context: ExecutionContext) -> Fra context.logger.info(f'Extracting {len(frame_requests)} keyframes from {video_path.name}') context.mark_step_started(self.name, input_data.episode_id) try: - self._extract_frames(video_path, frame_requests, episode_dir, input_data.episode_info, context) - self._write_metadata(frame_requests, input_data.episode_info, video_path, context, metadata_file) + self.__extract_frames(video_path, frame_requests, episode_dir, input_data.episode_info, context) + self.__write_metadata(frame_requests, input_data.episode_info, video_path, context, metadata_file) except Exception as e: context.logger.error(f'Failed to extract frames from {video_path}: {e}') shutil.rmtree(episode_dir, ignore_errors=True) @@ -87,7 +87,7 @@ def execute(self, input_data: SceneCollection, context: ExecutionContext) -> Fra metadata_path=metadata_file, ) - def _extract_frames( + def __extract_frames( self, video_file: Path, frame_requests: List[FrameRequest], @@ -95,15 +95,15 @@ def 
_extract_frames( episode_info, context: ExecutionContext, ) -> None: - video_metadata = self._get_video_metadata(video_file) - dar = self._calculate_display_aspect_ratio(video_metadata) + video_metadata = self.__get_video_metadata(video_file) + dar = self.__calculate_display_aspect_ratio(video_metadata) vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) for req in frame_requests: frame_num = req['frame_number'] - self._extract_and_save_frame(vr, frame_num, episode_dir, episode_info, dar, context.series_name) + self.__extract_and_save_frame(vr, frame_num, episode_dir, episode_info, dar, context.series_name) del vr - def _extract_and_save_frame( + def __extract_and_save_frame( self, vr, frame_num: int, @@ -114,13 +114,13 @@ def _extract_and_save_frame( ) -> None: frame_np = vr[frame_num].asnumpy() frame_pil = Image.fromarray(frame_np) - resized = self._resize_frame(frame_pil, dar) + resized = self.__resize_frame(frame_pil, dar) base_filename = f'{series_name}_{episode_info.episode_code()}' filename = f'{base_filename}_frame_{frame_num:06d}.jpg' resized.save(episode_dir / filename, quality=90) @staticmethod - def _get_video_metadata(video_path: Path) -> Dict[str, Any]: + def __get_video_metadata(video_path: Path) -> Dict[str, Any]: cmd = [ 'ffprobe', '-v', 'error', '-select_streams', 'v:0', '-show_entries', 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', @@ -134,7 +134,7 @@ def _get_video_metadata(video_path: Path) -> Dict[str, Any]: return streams[0] @staticmethod - def _calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: + def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: width = metadata.get('width', 0) height = metadata.get('height', 0) if width == 0 or height == 0: @@ -149,7 +149,7 @@ def _calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: sar = 1.0 return width / height * sar - def _resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: + def 
__resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: target_width = self.config.resolution.width target_height = self.config.resolution.height target_aspect = target_width / target_height @@ -170,7 +170,7 @@ def _resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Imag result.paste(resized, (0, y_offset)) return result - def _write_metadata( + def __write_metadata( self, frame_requests: List[FrameRequest], episode_info, diff --git a/preprocessor/modules/vision/character_detection.py b/preprocessor/modules/vision/character_detection.py index a18d41bc9..8f5eaa89b 100644 --- a/preprocessor/modules/vision/character_detection.py +++ b/preprocessor/modules/vision/character_detection.py @@ -90,7 +90,7 @@ def execute( 'statistics': { 'total_frames_processed': len(frame_files), 'frames_with_detections': len(results), - 'character_counts': self._count_characters(results), + 'character_counts': self.__count_characters(results), }, 'detections': results, } @@ -105,7 +105,7 @@ def execute( ) @staticmethod - def _count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: + def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: counts: Dict[str, int] = {} for res in results: for face in res.get('faces', []): diff --git a/preprocessor/modules/vision/embeddings.py b/preprocessor/modules/vision/embeddings.py index 6361caa5d..705d988b6 100644 --- a/preprocessor/modules/vision/embeddings.py +++ b/preprocessor/modules/vision/embeddings.py @@ -68,7 +68,7 @@ def execute( # pylint: disable=too-many-locals if not frame_requests: context.logger.warning(f'No frames for embedding in {input_data.episode_id}') return self._create_embedding_collection(input_data, output_path, 0) - image_hashes: Dict[int, str] = self._load_image_hashes(input_data, context) + image_hashes: Dict[int, str] = self.__load_image_hashes(input_data, context) if self._model is None: self._model = EmbeddingModelWrapper(self.config.model_name, 
self.config.device) self._model.load_model() # pylint: disable=no-member @@ -107,7 +107,7 @@ def execute( # pylint: disable=too-many-locals return self._create_embedding_collection(input_data, output_path, len(results)) @staticmethod - def _load_image_hashes( + def __load_image_hashes( input_data: FrameCollection, context: ExecutionContext, ) -> Dict[int, str]: filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' diff --git a/preprocessor/modules/vision/image_hashing.py b/preprocessor/modules/vision/image_hashing.py index dbbe8201c..4cce10dcb 100644 --- a/preprocessor/modules/vision/image_hashing.py +++ b/preprocessor/modules/vision/image_hashing.py @@ -81,7 +81,7 @@ def execute( # pylint: disable=too-many-locals hash_results.append(result) del pil_images if i % (batch_size * 5) == 0: - self._cleanup_memory() + self.__cleanup_memory() output_data: Dict[str, Any] = { 'episode_id': input_data.episode_id, 'series_name': context.series_name, @@ -94,7 +94,7 @@ def execute( # pylint: disable=too-many-locals } atomic_write_json(output_path, output_data) context.mark_step_completed(self.name, input_data.episode_id) - self._cleanup_memory() + self.__cleanup_memory() return ImageHashCollection( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -104,10 +104,10 @@ def execute( # pylint: disable=too-many-locals def cleanup(self) -> None: self._hasher = None - self._cleanup_memory() + self.__cleanup_memory() @staticmethod - def _cleanup_memory() -> None: + def __cleanup_memory() -> None: gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() From a7b619325d6fc2c203dca0636432e02e6211d2b9 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 18:07:07 +0100 Subject: [PATCH 16/89] Add dataclass fixer; refactor pipeline and configs Add tools to enforce/fix dataclass field ordering and perform a broad refactor of pipeline/executor and configuration dataclasses. 
Changes include: - Add check_dataclasses.py and fix_dataclasses.py: utilities to detect and automatically reorder dataclass fields so fields without defaults come before those with defaults. - Pipeline changes: reorganize PipelineDefinition methods (get_all_steps, register, validate, cycle/missing-dependency errors) and improve error messages; add executor helpers in PipelineExecutor/PipelineBuilder (cleanup, execute_step(s), state-marking helpers) to consolidate step lifecycle handling. - StepBuilder/CLI updates: reorder StepBuilder dataclass fields and relocate validation logic; adjust PipelineContextFactory ordering and helper methods. - Config refactor: reorder and add fields across many config dataclasses (OutputSubdirs, TranscodeSettings, Whisper/ElevenLabs/Embedding settings, ImageScraper, Elasticsearch, Gemini, Settings, TranscodeConfig, TranscriptionConfig, etc.), rename private env loaders from __from_env to _from_env, add SeriesConfig.load and defaults-loading helpers, and add small behavioral/property changes (e.g. image scraper serpapi_key property). Overall purpose: ensure consistent dataclass definitions, improve maintainability of pipeline execution and lifecycle, and normalize configuration structures and env-loading conventions. 
--- preprocessor/app/pipeline.py | 140 +-- preprocessor/app/pipeline_builder.py | 98 +- preprocessor/app/step_builder.py | 40 +- preprocessor/cli/helpers.py | 50 +- preprocessor/config/config.py | 188 ++-- preprocessor/config/constants.py | 16 +- preprocessor/config/enums.py | 6 +- preprocessor/config/series_config.py | 78 +- preprocessor/config/step_configs.py | 56 +- preprocessor/config/types/clip.py | 4 +- preprocessor/config/types/detection.py | 10 +- preprocessor/config/types/episode.py | 8 +- preprocessor/config/types/frame.py | 2 +- preprocessor/config/types/keys.py | 140 +-- preprocessor/config/types/scene.py | 10 +- preprocessor/config/types/search.py | 20 +- preprocessor/config/types/transcription.py | 30 +- preprocessor/config/types/video.py | 14 +- preprocessor/core/artifacts.py | 26 +- preprocessor/core/base_processor.py | 168 ++-- preprocessor/core/base_step.py | 14 +- preprocessor/core/context.py | 24 +- preprocessor/core/path_resolver.py | 6 +- preprocessor/core/path_service.py | 22 +- preprocessor/core/processing_metadata.py | 60 +- preprocessor/core/state_manager.py | 68 +- preprocessor/lib/ai/models.py | 10 +- preprocessor/lib/ai/provider.py | 142 +-- preprocessor/lib/characters/face_detection.py | 76 +- .../characters/image_search/image_search.py | 6 +- preprocessor/lib/characters/models.py | 6 +- .../lib/characters/reference_downloader.py | 224 ++--- preprocessor/lib/core/logging.py | 48 +- preprocessor/lib/episodes/episode_manager.py | 174 ++-- preprocessor/lib/io/files.py | 22 +- preprocessor/lib/io/metadata.py | 36 +- preprocessor/lib/media/ffmpeg.py | 234 ++--- preprocessor/lib/media/resolution.py | 14 +- preprocessor/lib/media/scene_detection.py | 40 +- preprocessor/lib/search/elasticsearch.py | 46 +- preprocessor/lib/text/language_config.py | 2 +- preprocessor/lib/text/text_statistics.py | 132 +-- preprocessor/lib/transcription/elevenlabs.py | 28 +- .../lib/transcription/engines/base_engine.py | 4 +- .../engines/elevenlabs_engine.py | 84 
+- .../transcription/engines/whisper_engine.py | 24 +- .../generators/base_generator.py | 4 +- .../generators/json_generator.py | 32 +- .../generators/multi_format_generator.py | 54 +- .../transcription/generators/srt_generator.py | 12 +- .../transcription/generators/txt_generator.py | 12 +- .../processors/audio_normalizer.py | 26 +- .../processors/episode_info_processor.py | 28 +- .../processors/normalized_audio_processor.py | 18 +- .../processors/sound_separator.py | 184 ++-- .../transcription/processors/unicode_fixer.py | 12 +- preprocessor/lib/transcription/utils.py | 58 +- preprocessor/lib/transcription/whisper.py | 32 +- preprocessor/lib/ui/console.py | 12 +- preprocessor/lib/validation/base_result.py | 2 +- .../lib/validation/file_validators.py | 56 +- preprocessor/lib/video/emotion_utils.py | 52 +- preprocessor/lib/video/frame_utils.py | 12 +- preprocessor/lib/video/image_hasher.py | 14 +- preprocessor/modules/audio/extraction.py | 8 +- preprocessor/modules/audio/separation.py | 158 ++-- preprocessor/modules/packaging/archives.py | 8 +- preprocessor/modules/scraping/base_scraper.py | 34 +- .../modules/scraping/base_scraper_step.py | 75 ++ .../scraping/character_scraper_step.py | 56 +- .../modules/scraping/episode_scraper.py | 52 +- .../modules/scraping/episode_scraper_step.py | 63 +- .../modules/scraping/reference_processor.py | 885 +++++++++--------- .../scraping/reference_processor_step.py | 8 +- .../search/clients/elasticsearch_queries.py | 390 ++++---- .../search/clients/embedding_service.py | 56 +- .../modules/search/clients/hash_service.py | 26 +- .../search/clients/result_formatters.py | 64 +- .../modules/search/document_generation.py | 16 +- preprocessor/modules/search/indexing.py | 16 +- preprocessor/modules/text/analysis.py | 8 +- preprocessor/modules/text/embeddings.py | 64 +- preprocessor/modules/text/import_step.py | 128 +-- preprocessor/modules/text/transcription.py | 14 +- .../modules/validation/episode_stats.py | 504 +++++----- 
.../modules/validation/global_validator.py | 46 +- .../modules/validation/season_comparator.py | 64 +- preprocessor/modules/video/frame_export.py | 70 +- preprocessor/modules/video/scene_detection.py | 14 +- preprocessor/modules/video/transcoding.py | 8 +- .../modules/vision/character_detection.py | 14 +- preprocessor/modules/vision/embeddings.py | 46 +- .../modules/vision/emotion_detection.py | 8 +- .../modules/vision/face_clustering.py | 8 +- preprocessor/modules/vision/image_hashing.py | 12 +- .../modules/vision/object_detection.py | 8 +- 96 files changed, 3096 insertions(+), 3105 deletions(-) create mode 100644 preprocessor/modules/scraping/base_scraper_step.py diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index 5a2b44e48..e903b66c1 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -20,74 +20,8 @@ def __init__(self, name: str) -> None: self._steps: Dict[str, StepBuilder] = {} self._graph: Optional[nx.DiGraph] = None - def register(self, step: StepBuilder) -> None: - if step.id in self._steps: - raise ValueError( - f"❌ DUPLICATE STEP:\n" - f" Step '{step.id}' is already registered in the pipeline!\n" - f" Check build_pipeline() in pipeline_factory.py", - ) - self._steps[step.id] = step - - def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: - self._graph = nx.DiGraph() - - for step_id, step in self._steps.items(): - self._graph.add_node(step_id, step=step) - - for step_id, step in self._steps.items(): - for dep_id in step.dependency_ids: - if dep_id not in self._steps: - self.__raise_missing_dependency_error(step_id, dep_id) - self._graph.add_edge(dep_id, step_id) - - if not nx.is_directed_acyclic_graph(self._graph): - self.__raise_cycle_error() - - message = ( - f"✅ Pipeline '{self.name}' validated successfully:\n" - f" - {len(self._steps)} steps registered\n" - f" - DAG structure confirmed\n" - f" - No cyclic dependencies" - ) - - if logger: - logger.info(message) - else: - 
print(message) - - def __raise_missing_dependency_error( - self, step_id: str, missing_dep_id: str, - ) -> None: - raise ValueError( - f"\n{'=' * 80}\n" - f"❌ PIPELINE DEPENDENCY ERROR\n" - f"{'=' * 80}\n\n" - f"Step: '{step_id}'\n" - f"Needs: '{missing_dep_id}'\n" - f"Issue: Step '{missing_dep_id}' is not registered!\n\n" - f"Solution:\n" - f" 1. Check build_pipeline() in preprocessor/app/pipeline_factory.py\n" - f" 2. Ensure '{missing_dep_id}' is added via pipeline.register()\n" - f" 3. Or remove '{missing_dep_id}' from needs=[...] in definition of '{step_id}'\n" - f"\n{'=' * 80}\n", - ) - - def __raise_cycle_error(self) -> None: - cycles: List[List[str]] = list(nx.simple_cycles(self._graph)) - cycle_path: str = " → ".join(cycles[0]) + f" → {cycles[0][0]}" - - raise ValueError( - f"\n{'=' * 80}\n" - f"❌ PIPELINE DEPENDENCY CYCLE DETECTED\n" - f"{'=' * 80}\n\n" - f"Cyclic dependency detected:\n" - f" {cycle_path}\n\n" - f"Steps in cycle: {', '.join(cycles[0])}\n\n" - f"Pipeline must be a DAG (Directed Acyclic Graph).\n" - f"Remove one of the dependencies to break the cycle.\n" - f"\n{'=' * 80}\n", - ) + def get_all_steps(self) -> Dict[str, StepBuilder]: + return dict(self._steps) def get_execution_order( self, targets: Optional[List[str]] = None, skip: Optional[List[str]] = None, @@ -120,6 +54,15 @@ def get_step(self, step_id: str) -> StepBuilder: ) return self._steps[step_id] + def register(self, step: StepBuilder) -> None: + if step.id in self._steps: + raise ValueError( + f"❌ DUPLICATE STEP:\n" + f" Step '{step.id}' is already registered in the pipeline!\n" + f" Check build_pipeline() in pipeline_factory.py", + ) + self._steps[step.id] = step + def to_ascii_art(self) -> str: if not self._graph: self.validate() @@ -158,8 +101,65 @@ def to_ascii_art(self) -> str: lines.append("=" * 80) return "\n".join(lines) - def get_all_steps(self) -> Dict[str, StepBuilder]: - return dict(self._steps) + def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> 
None: + self._graph = nx.DiGraph() + + for step_id, step in self._steps.items(): + self._graph.add_node(step_id, step=step) + + for step_id, step in self._steps.items(): + for dep_id in step.dependency_ids: + if dep_id not in self._steps: + self.__raise_missing_dependency_error(step_id, dep_id) + self._graph.add_edge(dep_id, step_id) + + if not nx.is_directed_acyclic_graph(self._graph): + self.__raise_cycle_error() + + message = ( + f"✅ Pipeline '{self.name}' validated successfully:\n" + f" - {len(self._steps)} steps registered\n" + f" - DAG structure confirmed\n" + f" - No cyclic dependencies" + ) + + if logger: + logger.info(message) + else: + print(message) def __repr__(self) -> str: return f"PipelineDefinition(name='{self.name}', steps={len(self._steps)})" + + def __raise_cycle_error(self) -> None: + cycles: List[List[str]] = list(nx.simple_cycles(self._graph)) + cycle_path: str = " → ".join(cycles[0]) + f" → {cycles[0][0]}" + + raise ValueError( + f"\n{'=' * 80}\n" + f"❌ PIPELINE DEPENDENCY CYCLE DETECTED\n" + f"{'=' * 80}\n\n" + f"Cyclic dependency detected:\n" + f" {cycle_path}\n\n" + f"Steps in cycle: {', '.join(cycles[0])}\n\n" + f"Pipeline must be a DAG (Directed Acyclic Graph).\n" + f"Remove one of the dependencies to break the cycle.\n" + f"\n{'=' * 80}\n", + ) + + def __raise_missing_dependency_error( + self, step_id: str, missing_dep_id: str, + ) -> None: + raise ValueError( + f"\n{'=' * 80}\n" + f"❌ PIPELINE DEPENDENCY ERROR\n" + f"{'=' * 80}\n\n" + f"Step: '{step_id}'\n" + f"Needs: '{missing_dep_id}'\n" + f"Issue: Step '{missing_dep_id}' is not registered!\n\n" + f"Solution:\n" + f" 1. Check build_pipeline() in preprocessor/app/pipeline_factory.py\n" + f" 2. Ensure '{missing_dep_id}' is added via pipeline.register()\n" + f" 3. Or remove '{missing_dep_id}' from needs=[...] 
in definition of '{step_id}'\n" + f"\n{'=' * 80}\n", + ) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 09e9903df..07fc39bca 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -24,6 +24,55 @@ def add_step(self, step: PipelineStep) -> "PipelineExecutor": self.steps.append(step) return self + def cleanup(self) -> None: + for step in self.steps: + if hasattr(step, "cleanup"): + try: + step.cleanup() + except Exception as e: + self.context.logger.error(f"Cleanup failed for step {step.name}: {e}") + + def execute_step( + self, + pipeline: "PipelineDefinition", + step_id: str, + source_path: Path, + episode_manager: EpisodeManager, + ) -> None: + step = pipeline.get_step(step_id) + self.context.logger.info(f"🔧 Step: {step_id}") + self.context.logger.info(f"📝 {step.description}") + + StepClass = step.load_class() + instance = StepClass(step.config) + + runner = PipelineExecutor(self.context) + runner.add_step(instance) + runner.__run_for_episodes(source_path, episode_manager) + + self.context.logger.info(f"✅ Step '{step_id}' completed") + + def execute_steps( + self, + pipeline: "PipelineDefinition", + step_ids: List[str], + source_path: Path, + episode_manager: EpisodeManager, + ) -> None: + for step_id in step_ids: + self.context.logger.info(f"{'=' * 80}") + self.execute_step(pipeline, step_id, source_path, episode_manager) + + def __mark_step_completed(self, step_name: str, episode_id: str) -> None: + if self.context.state_manager is None: + return + self.context.state_manager.mark_step_completed(step_name, episode_id) + + def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None: + if self.context.state_manager is None: + return + self.context.state_manager.mark_step_started(step_name, episode_id) + def __run_for_episodes( # pylint: disable=unused-private-member self, source_path: Path, episode_manager: EpisodeManager, ) -> None: @@ -87,52 +136,3 @@ def 
__should_skip_step(self, step_name: str, episode_id: str) -> bool: return False return self.context.state_manager.is_step_completed(step_name, episode_id) - - def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None: - if self.context.state_manager is None: - return - self.context.state_manager.mark_step_started(step_name, episode_id) - - def __mark_step_completed(self, step_name: str, episode_id: str) -> None: - if self.context.state_manager is None: - return - self.context.state_manager.mark_step_completed(step_name, episode_id) - - def cleanup(self) -> None: - for step in self.steps: - if hasattr(step, "cleanup"): - try: - step.cleanup() - except Exception as e: - self.context.logger.error(f"Cleanup failed for step {step.name}: {e}") - - def execute_step( - self, - pipeline: "PipelineDefinition", - step_id: str, - source_path: Path, - episode_manager: EpisodeManager, - ) -> None: - step = pipeline.get_step(step_id) - self.context.logger.info(f"🔧 Step: {step_id}") - self.context.logger.info(f"📝 {step.description}") - - StepClass = step.load_class() - instance = StepClass(step.config) - - runner = PipelineExecutor(self.context) - runner.add_step(instance) - runner.__run_for_episodes(source_path, episode_manager) - - self.context.logger.info(f"✅ Step '{step_id}' completed") - - def execute_steps( - self, - pipeline: "PipelineDefinition", - step_ids: List[str], - source_path: Path, - episode_manager: EpisodeManager, - ) -> None: - for step_id in step_ids: - self.context.logger.info(f"{'=' * 80}") - self.execute_step(pipeline, step_id, source_path, episode_manager) diff --git a/preprocessor/app/step_builder.py b/preprocessor/app/step_builder.py index e1456435a..01380f30f 100644 --- a/preprocessor/app/step_builder.py +++ b/preprocessor/app/step_builder.py @@ -21,23 +21,13 @@ class Phase: @dataclass class StepBuilder: + description: str id: str - phase: Phase module: str - description: str + phase: Phase produces: List[str] - needs: 
List["StepBuilder"] = field(default_factory=list) config: Any = None - - def __post_init__(self) -> None: - if not self.id.replace("_", "").replace("-", "").isalnum(): - raise ValueError( - f"Invalid step_id: '{self.id}'. Use only alphanumeric and underscores.", - ) - if not self.module or ":" not in self.module: - raise ValueError( - f"Invalid module format for '{self.id}'. Expected 'package.module:ClassName'", - ) + needs: List["StepBuilder"] = field(default_factory=list) @property def dependency_ids(self) -> List[str]: @@ -60,14 +50,24 @@ def load_class(self) -> type: f"Class '{class_name}' not found in module '{module_path}' for step '{self.id}': {e}", ) from e - def __repr__(self) -> str: - deps = f", needs={self.dependency_ids}" if self.needs else "" - return f"StepBuilder(id='{self.id}'{deps})" - - def __hash__(self) -> int: - return hash(self.id) - def __eq__(self, other: object) -> bool: if not isinstance(other, StepBuilder): return False return self.id == other.id + + def __hash__(self) -> int: + return hash(self.id) + + def __post_init__(self) -> None: + if not self.id.replace("_", "").replace("-", "").isalnum(): + raise ValueError( + f"Invalid step_id: '{self.id}'. Use only alphanumeric and underscores.", + ) + if not self.module or ":" not in self.module: + raise ValueError( + f"Invalid module format for '{self.id}'. 
Expected 'package.module:ClassName'", + ) + + def __repr__(self) -> str: + deps = f", needs={self.dependency_ids}" if self.needs else "" + return f"StepBuilder(id='{self.id}'{deps})" diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index 257295b14..c0ae6d1b1 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -12,37 +12,13 @@ @dataclass class PipelineSetup: + context: ExecutionContext logger: ErrorHandlingLogger state_manager: StateManager - context: ExecutionContext episode_manager: Optional[EpisodeManager] = None class PipelineContextFactory: - @staticmethod - def __create_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: - return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) - - @staticmethod - def __create_state_manager(series_name: str, working_dir: Path) -> StateManager: - state_manager = StateManager(series_name=series_name, working_dir=working_dir) - state_manager.load_or_create_state() - return state_manager - - @staticmethod - def __create_episode_manager( - series: str, input_base: Path, logger: ErrorHandlingLogger, - ) -> Optional[EpisodeManager]: - episodes_json: Optional[Path] = input_base / series / 'episodes.json' - if not episodes_json.exists(): - episodes_json = None - return EpisodeManager(episodes_json, series, logger) - - @staticmethod - def __ensure_output_dir(base_dir: Path, series: str) -> Path: - series_output_dir = base_dir / series - series_output_dir.mkdir(parents=True, exist_ok=True) - return series_output_dir @staticmethod def build( @@ -79,6 +55,30 @@ def build( episode_manager=episode_manager, ) + @staticmethod + def __create_episode_manager( + series: str, input_base: Path, logger: ErrorHandlingLogger, + ) -> Optional[EpisodeManager]: + episodes_json: Optional[Path] = input_base / series / 'episodes.json' + if not episodes_json.exists(): + episodes_json = None + return EpisodeManager(episodes_json, series, logger) + 
@staticmethod + def __create_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: + return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) + + @staticmethod + def __create_state_manager(series_name: str, working_dir: Path) -> StateManager: + state_manager = StateManager(series_name=series_name, working_dir=working_dir) + state_manager.load_or_create_state() + return state_manager + + @staticmethod + def __ensure_output_dir(base_dir: Path, series: str) -> Path: + series_output_dir = base_dir / series + series_output_dir.mkdir(parents=True, exist_ok=True) + return series_output_dir + def setup_pipeline_context( series: str, diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index f08d9c9ee..28dd192ea 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -30,39 +30,39 @@ def get_output_path(relative_path: str, series_name: Optional[str]=None) -> Path @dataclass class ElasticDocumentSubdirs: - text_segments: str = 'text_segments' - text_embeddings: str = 'text_embeddings' - video_frames: str = 'video_frames' episode_names: str = 'episode_names' - text_statistics: str = 'text_statistics' full_episode_embeddings: str = 'full_episode_embeddings' - sound_events: str = 'sound_events' sound_event_embeddings: str = 'sound_event_embeddings' + sound_events: str = 'sound_events' + text_embeddings: str = 'text_embeddings' + text_segments: str = 'text_segments' + text_statistics: str = 'text_statistics' + video_frames: str = 'video_frames' @dataclass class TranscriptionSubdirs: - raw: str = 'raw' clean: str = 'clean' + raw: str = 'raw' sound_events: str = 'sound_events' @dataclass class OutputSubdirs: # pylint: disable=too-many-instance-attributes - video: str = 'transcoded_videos' - transcriptions: str = 'transcriptions' - transcription_subdirs: TranscriptionSubdirs = field(default_factory=TranscriptionSubdirs) - scenes: str = 'scene_timestamps' - frames: str = 
'exported_frames' - embeddings: str = 'embeddings' - image_hashes: str = 'image_hashes' + archives: str = 'archives' character_detections: str = 'character_detections' character_visualizations: str = 'character_detections/visualizations' + elastic_document_subdirs: ElasticDocumentSubdirs = field(default_factory=ElasticDocumentSubdirs) + elastic_documents: str = 'elastic_documents' + embeddings: str = 'embeddings' face_clusters: str = 'face_clusters' + frames: str = 'exported_frames' + image_hashes: str = 'image_hashes' object_detections: str = 'object_detections' object_visualizations: str = 'object_detections/visualizations' - elastic_documents: str = 'elastic_documents' - archives: str = 'archives' + scenes: str = 'scene_timestamps' + transcription_subdirs: TranscriptionSubdirs = field(default_factory=TranscriptionSubdirs) + transcriptions: str = 'transcriptions' validation_reports: str = 'validation_reports' - elastic_document_subdirs: ElasticDocumentSubdirs = field(default_factory=ElasticDocumentSubdirs) + video: str = 'transcoded_videos' @dataclass class BaseAPISettings: @@ -74,26 +74,20 @@ def api_key(self) -> Optional[str]: @dataclass class TranscodeSettings: - codec: str = 'h264_nvenc' - target_file_size_mb: float = 50.0 - target_duration_seconds: float = 100.0 audio_bitrate_kbps: int = 128 + codec: str = 'h264_nvenc' gop_size: float = 0.5 + target_duration_seconds: float = 100.0 + target_file_size_mb: float = 50.0 @staticmethod def get_output_dir(series_name: str) -> Path: return get_base_output_dir(series_name) / 'transcoded_videos' - def __calculate_video_bitrate_mbps(self) -> float: # pylint: disable=unused-private-member - total_bitrate_mbps = self.target_file_size_mb * 8 / self.target_duration_seconds - audio_bitrate_mbps = self.audio_bitrate_kbps / 1000.0 - video_bitrate_mbps = total_bitrate_mbps - audio_bitrate_mbps - return round(video_bitrate_mbps, 2) - @dataclass class SceneDetectionSettings: - threshold: float = 0.5 min_scene_len: int = 10 + 
threshold: float = 0.5 @staticmethod def get_output_dir(series_name: str) -> Path: @@ -105,8 +99,8 @@ class SceneChangesSettings: @dataclass class KeyframeExtractionSettings: - strategy: str = 'scene_changes' scene_changes: SceneChangesSettings = field(default_factory=SceneChangesSettings) + strategy: str = 'scene_changes' @dataclass class FrameExportSettings: @@ -118,9 +112,9 @@ def get_output_dir(series_name: str) -> Path: @dataclass class TranscriptionSettings: - model: str = 'large-v3-turbo' - language: str = 'Polish' device: str = 'cuda' + language: str = 'Polish' + model: str = 'large-v3-turbo' @staticmethod def get_output_dir(series_name: str) -> Path: @@ -131,25 +125,25 @@ class WhisperSettings: model: str = 'large-v3-turbo' @classmethod - def __from_env(cls) -> 'WhisperSettings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'WhisperSettings': return cls(model=os.getenv('WHISPER_MODEL', 'large-v3-turbo')) @dataclass class TextChunkingSettings: segments_per_embedding: int = 5 - text_sentences_per_chunk: int = 8 text_chunk_overlap: int = 3 + text_sentences_per_chunk: int = 8 @dataclass class ElevenLabsSettings(BaseAPISettings): - model_id: str = 'scribe_v1' - language_code: str = 'pol' diarize: bool = True - polling_interval: int = 20 + language_code: str = 'pol' max_attempts: int = 60 + model_id: str = 'scribe_v1' + polling_interval: int = 20 @classmethod - def __from_env(cls) -> 'ElevenLabsSettings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'ElevenLabsSettings': api_key = None if os.getenv('ELEVEN_API_KEY'): api_key = SecretStr(os.getenv('ELEVEN_API_KEY', '')) @@ -157,24 +151,24 @@ def __from_env(cls) -> 'ElevenLabsSettings': # pylint: disable=unused-private-m @dataclass class EmbeddingModelSettings: - model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' - model_revision: str = 'main' embedding_dim: int = 4096 + enable_chunked_prefill: bool = True + enforce_eager: bool = False gpu_memory_utilization: float = 0.85 - 
tensor_parallel_size: int = 1 - max_model_len: int = 8192 image_placeholder: str = '<|vision_start|><|image_pad|><|vision_end|>' - enable_chunked_prefill: bool = True + max_model_len: int = 8192 max_num_batched_tokens: int = 8192 - enforce_eager: bool = False + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + model_revision: str = 'main' + tensor_parallel_size: int = 1 @dataclass class EmbeddingSettings: batch_size: int = 32 - text_batch_size: int = 64 - progress_sub_batch_size: int = 100 - prefetch_chunks: int = 2 generate_full_episode_embedding: bool = True + prefetch_chunks: int = 2 + progress_sub_batch_size: int = 100 + text_batch_size: int = 64 @staticmethod def get_output_dir(series_name: str) -> Path: @@ -182,8 +176,8 @@ def get_output_dir(series_name: str) -> Path: @dataclass class FaceRecognitionSettings: - model_name: str = 'buffalo_l' detection_size: Tuple[int, int] = (1280, 1280) + model_name: str = 'buffalo_l' @dataclass class FaceClusteringSettings: @@ -200,17 +194,17 @@ class EmotionDetectionSettings: model_name: str = 'enet_b2_8' @classmethod - def __from_env(cls) -> 'EmotionDetectionSettings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'EmotionDetectionSettings': model_name = os.getenv('EMOTION_MODEL_NAME', 'enet_b2_8') return cls(model_name=model_name) @dataclass class CharacterSettings: - reference_images_per_character: int = 3 - normalized_face_size: Tuple[int, int] = (112, 112) face_detection_threshold: float = 0.2 - reference_matching_threshold: float = 0.5 frame_detection_threshold: float = 0.55 + normalized_face_size: Tuple[int, int] = (112, 112) + reference_images_per_character: int = 3 + reference_matching_threshold: float = 0.5 @staticmethod def get_output_dir(series_name: str) -> Path: @@ -218,8 +212,8 @@ def get_output_dir(series_name: str) -> Path: @dataclass class ObjectDetectionSettings: - model_name: str = 'ustc-community/dfine-xlarge-obj2coco' conf_threshold: float = 0.3 + model_name: str = 
'ustc-community/dfine-xlarge-obj2coco' @staticmethod def get_output_dir(series_name: str) -> Path: @@ -235,25 +229,25 @@ def get_output_dir(series_name: str) -> Path: @dataclass class ImageScraperSettings(BaseAPISettings): max_results_to_scrape: int = 50 - min_image_width: int = 800 min_image_height: int = 600 + min_image_width: int = 800 + page_navigation_timeout: int = 30000 + request_delay_max: float = 6.0 + request_delay_min: float = 3.0 retry_attempts: int = 3 retry_delay: float = 3.0 - request_delay_min: float = 3.0 - request_delay_max: float = 6.0 - page_navigation_timeout: int = 30000 + + @property + def serpapi_key(self) -> Optional[str]: + return self.api_key @classmethod - def __from_env(cls) -> 'ImageScraperSettings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'ImageScraperSettings': api_key = None if os.getenv('SERPAPI_API_KEY'): api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) return cls(_api_key=api_key) - @property - def serpapi_key(self) -> Optional[str]: - return self.api_key - @dataclass class ScraperSettings: @@ -264,18 +258,18 @@ def get_output_dir(series_name: str) -> Path: @dataclass class ElasticsearchSettings: host: str = '' - user: str = '' password: str = '' + user: str = '' @classmethod - def __from_env(cls) -> 'ElasticsearchSettings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'ElasticsearchSettings': return cls(host=os.getenv('ES_HOST', ''), user=os.getenv('ES_USER', ''), password=os.getenv('ES_PASS', '')) @dataclass class GeminiSettings(BaseAPISettings): @classmethod - def __from_env(cls) -> 'GeminiSettings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'GeminiSettings': api_key = None if os.getenv('GEMINI_API_KEY'): api_key = SecretStr(os.getenv('GEMINI_API_KEY', '')) @@ -283,33 +277,33 @@ def __from_env(cls) -> 'GeminiSettings': # pylint: disable=unused-private-membe @dataclass class Settings: # pylint: disable=too-many-instance-attributes - output_subdirs: 
OutputSubdirs - whisper: WhisperSettings - text_chunking: TextChunkingSettings - embedding_model: EmbeddingModelSettings + character: CharacterSettings + elasticsearch: ElasticsearchSettings + elevenlabs: ElevenLabsSettings embedding: EmbeddingSettings - scene_detection: SceneDetectionSettings - keyframe_extraction: KeyframeExtractionSettings + embedding_model: EmbeddingModelSettings + emotion_detection: EmotionDetectionSettings + face_clustering: FaceClusteringSettings + face_recognition: FaceRecognitionSettings frame_export: FrameExportSettings + gemini: GeminiSettings image_hash: ImageHashSettings - scraper: ScraperSettings - character: CharacterSettings - object_detection: ObjectDetectionSettings - face_recognition: FaceRecognitionSettings - face_clustering: FaceClusteringSettings - emotion_detection: EmotionDetectionSettings image_scraper: ImageScraperSettings - elevenlabs: ElevenLabsSettings - elasticsearch: ElasticsearchSettings - gemini: GeminiSettings + keyframe_extraction: KeyframeExtractionSettings + object_detection: ObjectDetectionSettings + output_subdirs: OutputSubdirs + scene_detection: SceneDetectionSettings + scraper: ScraperSettings + text_chunking: TextChunkingSettings transcode: TranscodeSettings transcription: TranscriptionSettings + whisper: WhisperSettings @classmethod - def __from_env(cls) -> 'Settings': # pylint: disable=unused-private-member + def _from_env(cls) -> 'Settings': return cls( output_subdirs=OutputSubdirs(), - whisper=WhisperSettings.__from_env(), + whisper=WhisperSettings._from_env(), text_chunking=TextChunkingSettings(), embedding_model=EmbeddingModelSettings(), embedding=EmbeddingSettings(), @@ -322,28 +316,28 @@ def __from_env(cls) -> 'Settings': # pylint: disable=unused-private-member object_detection=ObjectDetectionSettings(), face_recognition=FaceRecognitionSettings(), face_clustering=FaceClusteringSettings(), - emotion_detection=EmotionDetectionSettings.__from_env(), - image_scraper=ImageScraperSettings.__from_env(), - 
elevenlabs=ElevenLabsSettings.__from_env(), - elasticsearch=ElasticsearchSettings.__from_env(), - gemini=GeminiSettings.__from_env(), + emotion_detection=EmotionDetectionSettings._from_env(), + image_scraper=ImageScraperSettings._from_env(), + elevenlabs=ElevenLabsSettings._from_env(), + elasticsearch=ElasticsearchSettings._from_env(), + gemini=GeminiSettings._from_env(), transcode=TranscodeSettings(), transcription=TranscriptionSettings(), ) @dataclass class TranscodeConfig: - videos: Path - transcoded_videos: Path - resolution: Resolution codec: str gop_size: float + resolution: Resolution + transcoded_videos: Path + videos: Path + audio_bitrate_kbps: int = 128 + bufsize_mbps: Optional[float] = None episodes_info_json: Optional[Path] = None - video_bitrate_mbps: Optional[float] = None - minrate_mbps: Optional[float] = None maxrate_mbps: Optional[float] = None - bufsize_mbps: Optional[float] = None - audio_bitrate_kbps: int = 128 + minrate_mbps: Optional[float] = None + video_bitrate_mbps: Optional[float] = None def to_dict(self) -> Dict[str, Any]: return { @@ -362,13 +356,13 @@ def to_dict(self) -> Dict[str, Any]: @dataclass class TranscriptionConfig: - videos: Path + device: str episodes_info_json: Path - transcription_jsons: Path - model: str language: str - device: str + model: str name: str + transcription_jsons: Path + videos: Path extra_json_keys_to_remove: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: @@ -387,9 +381,9 @@ def to_dict(self) -> Dict[str, Any]: class IndexConfig: name: str transcription_jsons: Path - dry_run: bool = False append: bool = False + dry_run: bool = False def to_dict(self) -> Dict[str, Any]: return {'name': self.name, 'transcription_jsons': str(self.transcription_jsons), 'dry_run': self.dry_run, 'append': self.append} -settings = Settings.__from_env() +settings = Settings._from_env() diff --git a/preprocessor/config/constants.py b/preprocessor/config/constants.py index 97df95324..1dd71f4a0 100644 --- 
a/preprocessor/config/constants.py +++ b/preprocessor/config/constants.py @@ -41,26 +41,16 @@ } class EpisodesDataKeys: + EPISODES = 'episodes' SEASONS = 'seasons' SEASON_NUMBER = 'season' - EPISODES = 'episodes' class EpisodeMetadataKeys: EPISODE_NUMBER = 'episode_number' - TITLE = 'title' PREMIERE_DATE = 'premiere_date' + TITLE = 'title' VIEWERSHIP = 'viewership' class FfprobeKeys: - STREAMS = 'streams' - FORMAT = 'format' - -class ValidationMetadataKeys: - SIZE_BYTES = 'size_bytes' - SIZE_MB = 'size_mb' - LINE_COUNT = 'line_count' - WIDTH = 'width' - HEIGHT = 'height' FORMAT = 'format' - CODEC = 'codec' - DURATION = 'duration' + STREAMS = 'streams' diff --git a/preprocessor/config/enums.py b/preprocessor/config/enums.py index 633525a1d..00f3e5343 100644 --- a/preprocessor/config/enums.py +++ b/preprocessor/config/enums.py @@ -5,9 +5,9 @@ class KeyframeStrategy(str, Enum): SCENE_CHANGES = 'scene_changes' class FrameType(str, Enum): + SCENE_END = 'scene_end' SCENE_SINGLE = 'scene_single' SCENE_START = 'scene_start' - SCENE_END = 'scene_end' @staticmethod def scene_mid(index: int) -> str: @@ -22,9 +22,9 @@ class ParserMode(str, Enum): PREMIUM = 'premium' class TranscriptionFormat(str, Enum): - ELEVENLABS_SEGMENTED = '11labs_segmented' ELEVENLABS = '11labs' + ELEVENLABS_SEGMENTED = '11labs_segmented' class Device(str, Enum): - CUDA = 'cuda' CPU = 'cpu' + CUDA = 'cuda' diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index ceaaace12..06c95b672 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -22,53 +22,53 @@ def __deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, An @dataclass class EpisodeScrapingConfig: - urls: List[str] parser_mode: str + urls: List[str] @dataclass class CharacterScrapingConfig: - urls: List[str] parser_mode: str + urls: List[str] @dataclass class CharacterReferencesConfig: - search_engine: str images_per_character: int + search_engine: str 
@dataclass class ScrapingConfig: - episodes: EpisodeScrapingConfig - characters: CharacterScrapingConfig character_references: CharacterReferencesConfig + characters: CharacterScrapingConfig + episodes: EpisodeScrapingConfig @dataclass class TranscriptionProcessingConfig: + device: str + language: str mode: str model: str - language: str - device: str @dataclass class TranscodeProcessingConfig: + bufsize_mbps: float codec: str + force_deinterlace: bool + gop_size: float + maxrate_mbps: float + minrate_mbps: float resolution: str video_bitrate_mbps: float - minrate_mbps: float - maxrate_mbps: float - bufsize_mbps: float - gop_size: float - force_deinterlace: bool @dataclass class SceneDetectionProcessingConfig: - threshold: float min_scene_len: int + threshold: float @dataclass @@ -78,18 +78,18 @@ class FrameExportProcessingConfig: @dataclass class ProcessingConfig: - transcription: TranscriptionProcessingConfig - transcode: TranscodeProcessingConfig - scene_detection: SceneDetectionProcessingConfig frame_export: FrameExportProcessingConfig + scene_detection: SceneDetectionProcessingConfig + transcode: TranscodeProcessingConfig + transcription: TranscriptionProcessingConfig @dataclass class ElasticsearchIndexingConfig: - index_name: str - host: str - dry_run: bool append: bool + dry_run: bool + host: str + index_name: str @dataclass @@ -99,13 +99,29 @@ class IndexingConfig: @dataclass class SeriesConfig: - series_name: str display_name: str + indexing: IndexingConfig pipeline_mode: str - skip_steps: List[str] - scraping: ScrapingConfig processing: ProcessingConfig - indexing: IndexingConfig + scraping: ScrapingConfig + series_name: str + skip_steps: List[str] + + @staticmethod + def load(series_name: str) -> 'SeriesConfig': + config_dir: Path = Path('preprocessor/series_configs') + config_path: Path = config_dir / f'{series_name}.json' + + return SeriesConfig.__load_from_file(config_path) + + @staticmethod + def __load_defaults() -> Dict[str, Any]: + defaults_path: 
Path = Path('preprocessor/series_configs/defaults.json') + if not defaults_path.exists(): + return {} + with open(defaults_path, 'r', encoding='utf-8') as f: + data: Dict[str, Any] = json.load(f) + return {k: v for k, v in data.items() if not k.startswith('_')} @staticmethod def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': @@ -163,15 +179,6 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': ), ) - @staticmethod - def __load_defaults() -> Dict[str, Any]: - defaults_path: Path = Path('preprocessor/series_configs/defaults.json') - if not defaults_path.exists(): - return {} - with open(defaults_path, 'r', encoding='utf-8') as f: - data: Dict[str, Any] = json.load(f) - return {k: v for k, v in data.items() if not k.startswith('_')} - @staticmethod def __load_from_file(config_path: Path) -> 'SeriesConfig': if not config_path.exists(): @@ -193,10 +200,3 @@ def __load_from_file(config_path: Path) -> 'SeriesConfig': merged_config: Dict[str, Any] = __deep_merge(defaults, series_filtered) return SeriesConfig.__load_from_dict(merged_config) - - @staticmethod - def load(series_name: str) -> 'SeriesConfig': - config_dir: Path = Path('preprocessor/series_configs') - config_path: Path = config_dir / f'{series_name}.json' - - return SeriesConfig.__load_from_file(config_path) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 52f33fc94..cea26da42 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -12,16 +12,16 @@ class TranscodeConfig(BaseModel): - resolution: Resolution = Field(default=Resolution.R720P) + audio_bitrate_kbps: int = 128 + bufsize_mbps: float = Field(gt=0) codec: str = Field(default='h264_nvenc') + force_deinterlace: bool = False + gop_size: float = Field(gt=0) + maxrate_mbps: float = Field(gt=0) + minrate_mbps: float = Field(gt=0) preset: str = 'p7' + resolution: Resolution = Field(default=Resolution.R720P) video_bitrate_mbps: float = Field(gt=0) - 
minrate_mbps: float = Field(gt=0) - maxrate_mbps: float = Field(gt=0) - bufsize_mbps: float = Field(gt=0) - audio_bitrate_kbps: int = 128 - gop_size: float = Field(gt=0) - force_deinterlace: bool = False class Config: arbitrary_types_allowed = True @@ -33,43 +33,43 @@ def __maxrate_must_be_greater_than_bitrate(self) -> Self: # pylint: disable=unu return self class SceneDetectionConfig(BaseModel): - threshold: float = Field(default=0.5, ge=0, le=1) min_scene_len: int = Field(default=10, ge=1) + threshold: float = Field(default=0.5, ge=0, le=1) class FrameExportConfig(BaseModel): - resolution: Resolution = Field(default=Resolution.R720P) - keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES frames_per_scene: int = Field(default=3, ge=1) + keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES + resolution: Resolution = Field(default=Resolution.R720P) class Config: arbitrary_types_allowed = True class TranscriptionConfig(BaseModel): - model: str = 'large-v3' language: str = 'pl' + model: str = 'large-v3' output_formats: List[str] = ['json', 'srt', 'txt'] class WhisperTranscriptionConfig(BaseModel): - model: str = 'large-v3-turbo' - language: str = 'pl' - device: str = 'cuda' beam_size: int = Field(default=10, ge=1) + device: str = 'cuda' + language: str = 'pl' + model: str = 'large-v3-turbo' temperature: float = Field(default=0.0, ge=0.0, le=1.0) class TextAnalysisConfig(BaseModel): language: str = 'pl' class TextEmbeddingConfig(BaseModel): - model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' batch_size: int = Field(default=8, ge=1) device: str = 'cuda' - text_sentences_per_chunk: int = Field(default=5, ge=1) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' text_chunk_overlap: int = Field(default=1, ge=0) + text_sentences_per_chunk: int = Field(default=5, ge=1) class VideoEmbeddingConfig(BaseModel): - model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' batch_size: int = Field(default=8, ge=1) device: str = 'cuda' + model_name: str = 
'Qwen/Qwen3-VL-Embedding-8B' class SoundSeparationConfig(BaseModel): pass @@ -81,14 +81,14 @@ class ImageHashConfig(BaseModel): batch_size: int = Field(default=32, ge=1) class TranscriptionImportConfig(BaseModel): - source_dir: str format_type: str = '11labs_segmented' + source_dir: str class ElasticsearchConfig(BaseModel): - index_name: str - host: str = 'localhost:9200' - dry_run: bool = False append: bool = False + dry_run: bool = False + host: str = 'localhost:9200' + index_name: str class AudioExtractionConfig(BaseModel): pass @@ -113,24 +113,24 @@ class ValidationConfig(BaseModel): class EpisodeScraperConfig(BaseModel): - urls: List[str] - output_file: str headless: bool = True merge_sources: bool = True - scraper_method: str = "crawl4ai" + output_file: str parser_mode: str = "normal" + scraper_method: str = "crawl4ai" + urls: List[str] class CharacterScraperConfig(BaseModel): - urls: List[str] - output_file: str headless: bool = True - scraper_method: str = "crawl4ai" + output_file: str parser_mode: str = "normal" + scraper_method: str = "crawl4ai" + urls: List[str] class CharacterReferenceConfig(BaseModel): characters_file: str + images_per_character: int = Field(default=5, ge=1, le=20) output_dir: str search_engine: str = "duckduckgo" - images_per_character: int = Field(default=5, ge=1, le=20) diff --git a/preprocessor/config/types/clip.py b/preprocessor/config/types/clip.py index a1dac0191..f0fde8b36 100644 --- a/preprocessor/config/types/clip.py +++ b/preprocessor/config/types/clip.py @@ -6,6 +6,6 @@ class ClipSegment(TypedDict): - video_path: Union[str, Any] - start_time: float end_time: float + start_time: float + video_path: Union[str, Any] diff --git a/preprocessor/config/types/detection.py b/preprocessor/config/types/detection.py index f6282c2f6..22e003f5e 100644 --- a/preprocessor/config/types/detection.py +++ b/preprocessor/config/types/detection.py @@ -6,20 +6,20 @@ class CharacterDetectionInFrame(TypedDict): - name: str - confidence: float bbox: 
List[int] + confidence: float embedding: NotRequired[List[float]] + name: str class ObjectDetectionInFrame(TypedDict): - class_name: str + bbox: List[int] class_id: int + class_name: str confidence: float - bbox: List[int] class Detection(TypedDict): bbox: List[int] - confidence: float class_id: NotRequired[int] class_name: NotRequired[str] + confidence: float name: NotRequired[str] diff --git a/preprocessor/config/types/episode.py b/preprocessor/config/types/episode.py index 29b356095..446751b89 100644 --- a/preprocessor/config/types/episode.py +++ b/preprocessor/config/types/episode.py @@ -7,17 +7,17 @@ class EpisodeInfo(TypedDict): episode_number: int - title: str premiere_date: str + title: str viewership: Union[str, int, float] class EpisodeMetadata(TypedDict): - season: int episode_number: int - title: str premiere_date: str - viewership: Union[str, int, float] + season: int series_name: str + title: str + viewership: Union[str, int, float] class SeasonInfo(TypedDict): pass diff --git a/preprocessor/config/types/frame.py b/preprocessor/config/types/frame.py index 7d9c59ebe..94832464e 100644 --- a/preprocessor/config/types/frame.py +++ b/preprocessor/config/types/frame.py @@ -6,6 +6,6 @@ class FrameRequest(TypedDict): frame: int + scene_number: NotRequired[int] time: float type: str - scene_number: NotRequired[int] diff --git a/preprocessor/config/types/keys.py b/preprocessor/config/types/keys.py index 64c39a574..80479e7a7 100644 --- a/preprocessor/config/types/keys.py +++ b/preprocessor/config/types/keys.py @@ -1,178 +1,178 @@ class SegmentKeys: - START_TIME = 'start_time' + END = 'end' END_TIME = 'end_time' - TEXT = 'text' - VIDEO_PATH = 'video_path' - SEGMENT_ID = 'segment_id' ID = 'id' + SEGMENT_ID = 'segment_id' START = 'start' - END = 'end' + START_TIME = 'start_time' + TEXT = 'text' + VIDEO_PATH = 'video_path' class EpisodeMetadataKeys: - EPISODE_METADATA = 'episode_metadata' EPISODE_INFO = 'episode_info' - SEASON = 'season' + EPISODE_METADATA = 
'episode_metadata' EPISODE_NUMBER = 'episode_number' + PREMIERE_DATE = 'premiere_date' + SEASON = 'season' SERIES_NAME = 'series_name' TITLE = 'title' - PREMIERE_DATE = 'premiere_date' VIEWERSHIP = 'viewership' class ElasticsearchKeys: - SOURCE = '_source' - SCORE = '_score' - HITS = 'hits' - TOTAL = 'total' AGGREGATIONS = 'aggregations' BUCKETS = 'buckets' + HITS = 'hits' KEY = 'key' + SCORE = '_score' + SOURCE = '_source' + TOTAL = 'total' class ElasticsearchAggregationKeys: - UNIQUE_EPISODES = 'unique_episodes' SEASONS = 'seasons' + UNIQUE_EPISODES = 'unique_episodes' VALUE = 'value' class TranscriptionContextKeys: - TARGET = 'target' CONTEXT = 'context' - OVERALL_START_TIME = 'overall_start_time' OVERALL_END_TIME = 'overall_end_time' + OVERALL_START_TIME = 'overall_start_time' + TARGET = 'target' class ElasticsearchQueryKeys: - QUERY = 'query' - TERM = 'term' - MATCH = 'match' + AGGS = 'aggs' + ASC = 'asc' + AUTO = 'AUTO' BOOL = 'bool' - MUST = 'must' + CARDINALITY = 'cardinality' + DESC = 'desc' + FIELD = 'field' FILTER = 'filter' + FUZZINESS = 'fuzziness' + GT = 'gt' + INCLUDES = 'includes' + KEY = '_key' + LT = 'lt' + MATCH = 'match' + MUST = 'must' + ORDER = 'order' + QUERY = 'query' RANGE = 'range' SIZE = 'size' SORT = 'sort' - ORDER = 'order' - ASC = 'asc' - DESC = 'desc' - FUZZINESS = 'fuzziness' - AUTO = 'AUTO' + SOURCE = '_source' + TERM = 'term' TERMS = 'terms' - FIELD = 'field' - AGGS = 'aggs' - CARDINALITY = 'cardinality' TOP_HITS = 'top_hits' - INCLUDES = 'includes' - LT = 'lt' - GT = 'gt' - SOURCE = '_source' - KEY = '_key' class EpisodesDataKeys: + EPISODES = 'episodes' SEASONS = 'seasons' SEASON_NUMBER = 'season_number' - EPISODES = 'episodes' class FfprobeKeys: - STREAMS = 'streams' FORMAT = 'format' + STREAMS = 'streams' class FfprobeStreamKeys: - R_FRAME_RATE = 'r_frame_rate' BIT_RATE = 'bit_rate' CODEC_NAME = 'codec_name' - WIDTH = 'width' - HEIGHT = 'height' DURATION = 'duration' + HEIGHT = 'height' + R_FRAME_RATE = 'r_frame_rate' + WIDTH = 
'width' class FfprobeFormatKeys: DURATION = 'duration' SIZE = 'size' class DetectionKeys: - DETECTIONS = 'detections' CHARACTERS = 'characters' - FRAME_NUMBER = 'frame_number' + DETECTIONS = 'detections' FRAME = 'frame' - FRAME_NAME = 'frame_name' FRAME_FILE = 'frame_file' + FRAME_NAME = 'frame_name' + FRAME_NUMBER = 'frame_number' class CharacterDetectionKeys: - NAME = 'name' + BBOX = 'bbox' CONFIDENCE = 'confidence' EMOTION = 'emotion' - BBOX = 'bbox' + NAME = 'name' class EmotionKeys: - LABEL = 'label' CONFIDENCE = 'confidence' + LABEL = 'label' class ObjectDetectionKeys: - CLASS_NAME = 'class_name' + BBOX = 'bbox' CLASS_ID = 'class_id' + CLASS_NAME = 'class_name' CONFIDENCE = 'confidence' - BBOX = 'bbox' class SceneKeys: - SCENES = 'scenes' - START = 'start' END = 'end' + SCENES = 'scenes' + SCENE_END_FRAME = 'scene_end_frame' + SCENE_END_TIME = 'scene_end_time' SCENE_NUMBER = 'scene_number' SCENE_START_FRAME = 'scene_start_frame' - SCENE_END_FRAME = 'scene_end_frame' SCENE_START_TIME = 'scene_start_time' - SCENE_END_TIME = 'scene_end_time' + START = 'start' class SceneTimeKeys: - SECONDS = 'seconds' FRAME = 'frame' + SECONDS = 'seconds' class ElasticDocKeys: - SCENE_INFO = 'scene_info' CHARACTER_APPEARANCES = 'character_appearances' DETECTED_OBJECTS = 'detected_objects' PERCEPTUAL_HASH = 'perceptual_hash' PERCEPTUAL_HASH_INT = 'perceptual_hash_int' + SCENE_INFO = 'scene_info' class EmbeddingKeys: + EMBEDDING = 'embedding' EPISODE_ID = 'episode_id' - TITLE = 'title' - TITLE_EMBEDDING = 'title_embedding' EPISODE_METADATA = 'episode_metadata' FRAME_NUMBER = 'frame_number' - PERCEPTUAL_HASH = 'perceptual_hash' FRAME_PATH = 'frame_path' - TIMESTAMP = 'timestamp' - EMBEDDING = 'embedding' + PERCEPTUAL_HASH = 'perceptual_hash' SCENE_NUMBER = 'scene_number' + TIMESTAMP = 'timestamp' + TITLE = 'title' + TITLE_EMBEDDING = 'title_embedding' class ValidationMetadataKeys: - WIDTH = 'width' - HEIGHT = 'height' - FORMAT = 'format' - SIZE_MB = 'size_mb' - SIZE_BYTES = 
'size_bytes' - LINE_COUNT = 'line_count' CODEC = 'codec' DURATION = 'duration' + FORMAT = 'format' + HEIGHT = 'height' + LINE_COUNT = 'line_count' + SIZE_BYTES = 'size_bytes' + SIZE_MB = 'size_mb' + WIDTH = 'width' class WordKeys: - TYPE = 'type' - START = 'start' END = 'end' + START = 'start' + TEXT = 'text' + TYPE = 'type' WORD = 'word' WORDS = 'words' - TEXT = 'text' class WordTypeValues: - SPACING = 'spacing' AUDIO_EVENT = 'audio_event' + SPACING = 'spacing' class GoogleSearchKeys: + API_KEY = 'api_key' ENGINE = 'engine' - Q = 'q' - HL = 'hl' GL = 'gl' - API_KEY = 'api_key' + HL = 'hl' IMAGES_RESULTS = 'images_results' + Q = 'q' class ImageResultKeys: + IMAGE = 'image' ORIGINAL = 'original' THUMBNAIL = 'thumbnail' - IMAGE = 'image' diff --git a/preprocessor/config/types/scene.py b/preprocessor/config/types/scene.py index 7d94c8118..5b8d17999 100644 --- a/preprocessor/config/types/scene.py +++ b/preprocessor/config/types/scene.py @@ -6,23 +6,23 @@ class SceneDict(TypedDict): - scene_number: int - start_frame: int end_frame: int - start_time: float end_time: float fps: float + scene_number: int + start_frame: int + start_time: float class SceneTimestampPoint(TypedDict): frame: int seconds: float class SceneTimestamp(TypedDict): + end: SceneTimestampPoint scene_number: int start: SceneTimestampPoint - end: SceneTimestampPoint class SceneTimestampsData(TypedDict): + fps: NotRequired[float] scenes: List[SceneTimestamp] total_scenes: NotRequired[int] - fps: NotRequired[float] diff --git a/preprocessor/config/types/search.py b/preprocessor/config/types/search.py index 9d963f67d..2b930892e 100644 --- a/preprocessor/config/types/search.py +++ b/preprocessor/config/types/search.py @@ -11,38 +11,38 @@ class SearchSegment(TypedDict): - season: int + end_time: float episode_number: int - title: str + season: int start_time: float - end_time: float + title: str class ElasticsearchHit(TypedDict): - _source: ElasticsearchSegment _score: float + _source: ElasticsearchSegment 
class ElasticsearchHits(TypedDict): hits: List[ElasticsearchHit] - total: Dict[str, Any] max_score: float + total: Dict[str, Any] class ElasticsearchResponse(TypedDict): - hits: ElasticsearchHits aggregations: NotRequired[Dict[str, Any]] - took: int + hits: ElasticsearchHits timed_out: bool + took: int class EpisodeBucket(TypedDict): - key: int doc_count: int episode_metadata: Dict[str, Any] + key: int class SeasonBucket(TypedDict): - key: int doc_count: int + key: int unique_episodes: Dict[str, int] class ElasticsearchAggregations(TypedDict): + buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] seasons: Dict[str, Union[List[SeasonBucket], int]] unique_episodes: Dict[str, Union[List[EpisodeBucket], int]] - buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] diff --git a/preprocessor/config/types/transcription.py b/preprocessor/config/types/transcription.py index 3f5b07b7a..04f52fbc3 100644 --- a/preprocessor/config/types/transcription.py +++ b/preprocessor/config/types/transcription.py @@ -8,37 +8,37 @@ class BaseSegment(TypedDict): + end: float id: int - text: str start: float - end: float + text: str class SegmentWithTimes(TypedDict): - segment_id: int - text: str - start_time: float end_time: float episode_metadata: EpisodeMetadata + segment_id: int + start_time: float + text: str video_path: NotRequired[str] class SegmentWithScore(SegmentWithTimes): _score: float class ElasticsearchSegment(TypedDict): - segment_id: NotRequired[int] - id: NotRequired[int] - text: str - start_time: NotRequired[float] - start: NotRequired[float] - end_time: NotRequired[float] + _score: NotRequired[float] end: NotRequired[float] - episode_metadata: NotRequired[EpisodeMetadata] + end_time: NotRequired[float] episode_info: NotRequired[EpisodeMetadata] + episode_metadata: NotRequired[EpisodeMetadata] + id: NotRequired[int] + segment_id: NotRequired[int] + start: NotRequired[float] + start_time: NotRequired[float] + text: str video_path: NotRequired[str] - 
_score: NotRequired[float] class TranscriptionContext(TypedDict): - target: ElasticsearchSegment context: List[BaseSegment] - overall_start_time: float overall_end_time: float + overall_start_time: float + target: ElasticsearchSegment diff --git a/preprocessor/config/types/video.py b/preprocessor/config/types/video.py index a9120555f..12cd0059f 100644 --- a/preprocessor/config/types/video.py +++ b/preprocessor/config/types/video.py @@ -5,15 +5,15 @@ class HashResult(TypedDict): + file_path: NotRequired[str] frame_number: int - timestamp: float hash: str - file_path: NotRequired[str] + timestamp: float class VideoMetadata(TypedDict): - width: int - height: int - fps: float - duration: float - codec: NotRequired[str] bitrate: NotRequired[int] + codec: NotRequired[str] + duration: float + fps: float + height: int + width: int diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index 889c61300..db08c08d4 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -26,17 +26,17 @@ class SourceVideo(EpisodeArtifact): @dataclass(frozen=True) class TranscodedVideo(EpisodeArtifact): + codec: str path: Path resolution: str - codec: str @dataclass(frozen=True) class SceneCollection(EpisodeArtifact): + min_scene_len: int path: Path - video_path: Path scenes: List[Dict[str, Any]] threshold: float - min_scene_len: int + video_path: Path @dataclass(frozen=True) class FrameCollection(EpisodeArtifact): @@ -46,28 +46,28 @@ class FrameCollection(EpisodeArtifact): @dataclass(frozen=True) class TranscriptionData(EpisodeArtifact): - path: Path + format: str language: str model: str - format: str + path: Path @dataclass(frozen=True) class EmbeddingCollection(EpisodeArtifact): - path: Path - model_name: str embedding_count: int embedding_type: str + model_name: str + path: Path @dataclass(frozen=True) class DetectionResults(EpisodeArtifact): - path: Path - detection_type: str detection_count: int + detection_type: str + path: Path 
@dataclass(frozen=True) class ElasticDocuments(EpisodeArtifact): - path: Path document_count: int + path: Path @dataclass(frozen=True) class TextAnalysisResults(EpisodeArtifact): @@ -77,19 +77,19 @@ class TextAnalysisResults(EpisodeArtifact): @dataclass(frozen=True) class AudioArtifact(EpisodeArtifact): - path: Path format: str + path: Path @dataclass(frozen=True) class IndexingResult(Artifact): - index_name: str document_count: int + index_name: str success: bool @dataclass(frozen=True) class ImageHashCollection(EpisodeArtifact): - path: Path hash_count: int + path: Path @dataclass(frozen=True) class EmotionData(EpisodeArtifact): diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index 6d52b4e75..c8143f61f 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -37,11 +37,11 @@ class OutputSpec: required: bool = True class BaseProcessor(ABC): - SUPPORTED_VIDEO_EXTENSIONS = SUPPORTED_VIDEO_EXTENSIONS - REQUIRES: List[str] = [] - PRODUCES: List[str] = [] - PRIORITY: int = 100 DESCRIPTION: str = '' + PRIORITY: int = 100 + PRODUCES: List[str] = [] + REQUIRES: List[str] = [] + SUPPORTED_VIDEO_EXTENSIONS = SUPPORTED_VIDEO_EXTENSIONS def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, loglevel: int=logging.DEBUG) -> None: self._validate_args(args) @@ -52,8 +52,11 @@ def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, self.path_manager: PathManager = args.get('path_manager', PathManager(self.series_name)) self.progress = args.get('progress_tracker', ProgressTracker()) + def cleanup(self) -> None: + pass + @abstractmethod - def _validate_args(self, args: Dict[str, Any]) -> None: + def get_output_subdir(self) -> str: pass def work(self) -> int: @@ -69,83 +72,6 @@ def work(self) -> int: self.cleanup() return self.logger.finalize() - def cleanup(self) -> None: - pass - - def _load_resources(self) -> bool: - return True - - def 
__get_processing_info(self) -> List[str]: - return [] - - def _get_processing_items(self) -> List[ProcessingItem]: - raise NotImplementedError( - f'{self.__class__.__name__} must implement _get_processing_items() ' - 'or override _execute() directly (legacy mode)', - ) - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - raise NotImplementedError( - f'{self.__class__.__name__} must implement _get_expected_outputs() ' - 'or override _execute() directly (legacy mode)', - ) - - def _process_item( - self, item: ProcessingItem, missing_outputs: List[OutputSpec], - ) -> None: - raise NotImplementedError( - f'{self.__class__.__name__} must implement _process_item() ' - 'or override _execute() directly (legacy mode)', - ) - - @abstractmethod - def get_output_subdir(self) -> str: - pass - - def __get_step_name(self) -> str: - class_name = self.__class__.__name__ - name = class_name.replace('Processor', '').replace('Generator', '').replace('Detector', '') - name = name.replace('Transcoder', '').replace('Importer', '').replace('Indexer', '') - return self.__to_snake_case(name) - - @staticmethod - def __to_snake_case(name: str) -> str: - name = re.sub('(.)([A-Z][a-z]+)', '\\1_\\2', name) - return re.sub('([a-z0-9])([A-Z])', '\\1_\\2', name).lower() - - def __should_skip_item( - self, item: ProcessingItem, - ) -> Tuple[bool, List[OutputSpec], str]: - expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return (False, [], '') - missing_outputs = [ - output for output in expected_outputs - if not output.path.exists() or output.path.stat().st_size == 0 - ] - step_name = self.__get_step_name() - state_completed = ( - self.state_manager - and self.state_manager.is_step_completed(step_name, item.episode_id) - ) - if not missing_outputs and state_completed: - return (True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]') - if not missing_outputs and (not state_completed): - if self.state_manager: - 
self.state_manager.mark_step_completed(step_name, item.episode_id) - return ( - True, - [], - f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]', - ) - if missing_outputs and state_completed: - console.print( - f'[yellow]Warning: State marked complete but outputs missing ' - f'for {item.episode_id}[/yellow]', - ) - return (False, missing_outputs, '') - return (False, missing_outputs, '') - def _execute(self) -> None: all_items = self._get_processing_items() if not all_items: @@ -177,6 +103,36 @@ def _execute(self) -> None: ) self.__execute_processing(items_to_process) + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + raise NotImplementedError( + f'{self.__class__.__name__} must implement _get_expected_outputs() ' + 'or override _execute() directly (legacy mode)', + ) + + def _get_processing_items(self) -> List[ProcessingItem]: + raise NotImplementedError( + f'{self.__class__.__name__} must implement _get_processing_items() ' + 'or override _execute() directly (legacy mode)', + ) + + def _get_progress_description(self) -> str: + return f'Processing {self.__class__.__name__}' + + def _load_resources(self) -> bool: + return True + + def _process_item( + self, item: ProcessingItem, missing_outputs: List[OutputSpec], + ) -> None: + raise NotImplementedError( + f'{self.__class__.__name__} must implement _process_item() ' + 'or override _execute() directly (legacy mode)', + ) + + @abstractmethod + def _validate_args(self, args: Dict[str, Any]) -> None: + pass + def __execute_processing(self, items: List[ProcessingItem]) -> None: if not items: console.print('[yellow]No items to process, skipping resource loading[/yellow]') @@ -206,8 +162,52 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: console.print('\n[yellow]Processing interrupted[/yellow]') raise + def __get_processing_info(self) -> List[str]: + return [] + + def __get_step_name(self) -> str: + class_name = self.__class__.__name__ + name = 
class_name.replace('Processor', '').replace('Generator', '').replace('Detector', '') + name = name.replace('Transcoder', '').replace('Importer', '').replace('Indexer', '') + return self.__to_snake_case(name) + def __get_temp_files(self, item: ProcessingItem) -> List[str]: # pylint: disable=unused-argument return [] - def _get_progress_description(self) -> str: - return f'Processing {self.__class__.__name__}' + def __should_skip_item( + self, item: ProcessingItem, + ) -> Tuple[bool, List[OutputSpec], str]: + expected_outputs = self._get_expected_outputs(item) + if not expected_outputs: + return (False, [], '') + missing_outputs = [ + output for output in expected_outputs + if not output.path.exists() or output.path.stat().st_size == 0 + ] + step_name = self.__get_step_name() + state_completed = ( + self.state_manager + and self.state_manager.is_step_completed(step_name, item.episode_id) + ) + if not missing_outputs and state_completed: + return (True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]') + if not missing_outputs and (not state_completed): + if self.state_manager: + self.state_manager.mark_step_completed(step_name, item.episode_id) + return ( + True, + [], + f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]', + ) + if missing_outputs and state_completed: + console.print( + f'[yellow]Warning: State marked complete but outputs missing ' + f'for {item.episode_id}[/yellow]', + ) + return (False, missing_outputs, '') + return (False, missing_outputs, '') + + @staticmethod + def __to_snake_case(name: str) -> str: + name = re.sub('(.)([A-Z][a-z]+)', '\\1_\\2', name) + return re.sub('([a-z0-9])([A-Z])', '\\1_\\2', name).lower() diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index d46993f12..ecf5d1809 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -22,18 +22,18 @@ class PipelineStep(ABC, Generic[InputT, OutputT, ConfigT]): def __init__(self, config: ConfigT) 
-> None: self._config: ConfigT = config - @abstractmethod - def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: + def cleanup(self) -> None: pass @property + def config(self) -> ConfigT: + return self._config + @abstractmethod - def name(self) -> str: + def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: pass @property - def config(self) -> ConfigT: - return self._config - - def cleanup(self) -> None: + @abstractmethod + def name(self) -> str: pass diff --git a/preprocessor/core/context.py b/preprocessor/core/context.py index 017b7b056..198eedf24 100644 --- a/preprocessor/core/context.py +++ b/preprocessor/core/context.py @@ -27,22 +27,10 @@ def __init__( self._force_rerun: bool = force_rerun self._logger: ErrorHandlingLogger = logger - @property - def series_name(self) -> str: - return self._series_name - @property def force_rerun(self) -> bool: return self._force_rerun - @property - def logger(self) -> ErrorHandlingLogger: - return self._logger - - @property - def state_manager(self) -> Optional['StateManager']: - return self._state_manager - def get_output_path( self, episode_info: 'EpisodeInfo', subdir: str, filename: str, ) -> Path: @@ -67,6 +55,10 @@ def is_step_completed(self, step_name: str, episode_id: str) -> bool: return False return self._state_manager.is_step_completed(step_name, episode_id) + @property + def logger(self) -> ErrorHandlingLogger: + return self._logger + def mark_step_completed(self, step_name: str, episode_id: str) -> None: if self._state_manager: self._state_manager.mark_step_completed(step_name, episode_id) @@ -76,3 +68,11 @@ def mark_step_started( ) -> None: if self._state_manager: self._state_manager.mark_step_started(step_name, episode_id, temp_files) + + @property + def series_name(self) -> str: + return self._series_name + + @property + def state_manager(self) -> Optional['StateManager']: + return self._state_manager diff --git a/preprocessor/core/path_resolver.py 
b/preprocessor/core/path_resolver.py index 606d3ed93..a505531fa 100644 --- a/preprocessor/core/path_resolver.py +++ b/preprocessor/core/path_resolver.py @@ -4,9 +4,6 @@ class PathResolver: - @staticmethod - def _is_docker() -> bool: - return PathService._is_docker() @staticmethod def get_input_base() -> Path: @@ -15,3 +12,6 @@ def get_input_base() -> Path: @staticmethod def get_output_base() -> Path: return PathService.get_output_base() + @staticmethod + def _is_docker() -> bool: + return PathService._is_docker() diff --git a/preprocessor/core/path_service.py b/preprocessor/core/path_service.py index ef6428838..fe34588a7 100644 --- a/preprocessor/core/path_service.py +++ b/preprocessor/core/path_service.py @@ -9,17 +9,6 @@ class PathService: - @staticmethod - def _is_docker() -> bool: - return os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' - - @staticmethod - def get_input_base() -> Path: - return Path('/input_data') if PathService._is_docker() else Path('preprocessor/input_data') - - @staticmethod - def get_output_base() -> Path: - return Path('/app/output_data') if PathService._is_docker() else Path('preprocessor/output_data') def __init__(self, series_name: str) -> None: self._series_name: str = series_name.lower() @@ -34,3 +23,14 @@ def build_filename( def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: base_output_dir: Path = get_base_output_dir(self._series_name) return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_num() + + @staticmethod + def get_input_base() -> Path: + return Path('/input_data') if PathService._is_docker() else Path('preprocessor/input_data') + + @staticmethod + def get_output_base() -> Path: + return Path('/app/output_data') if PathService._is_docker() else Path('preprocessor/output_data') + @staticmethod + def _is_docker() -> bool: + return os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' diff --git a/preprocessor/core/processing_metadata.py 
b/preprocessor/core/processing_metadata.py index e8cca6716..0ef038fb0 100644 --- a/preprocessor/core/processing_metadata.py +++ b/preprocessor/core/processing_metadata.py @@ -16,20 +16,20 @@ class StepMetadata: name: str step_num: str - start_time: Optional[datetime] = None - end_time: Optional[datetime] = None duration_seconds: Optional[float] = None - status: str = 'pending' + end_time: Optional[datetime] = None exit_code: Optional[int] = None extra_info: Dict[str, Any] = field(default_factory=dict) + start_time: Optional[datetime] = None + status: str = 'pending' + + def skip(self): + self.status = 'skipped' def start(self): self.start_time = datetime.now() self.status = 'running' - def skip(self): - self.status = 'skipped' - def to_dict(self) -> Dict[str, Any]: return { 'name': self.name, @@ -55,25 +55,22 @@ def __init__(self, series_name: str, params: Dict[str, Any]): self.steps: List[StepMetadata] = [] self.final_status = 'running' - @staticmethod - def __sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: - sanitized = {} - for key, value in params.items(): - if key in set('state_manager'): - continue - if isinstance(value, Path): - sanitized[key] = str(value) - elif isinstance(value, (str, int, float, bool, list, dict, type(None))): - sanitized[key] = value - else: - sanitized[key] = str(value) - return sanitized - def add_step(self, name: str, step_num: str) -> StepMetadata: step = StepMetadata(name=name, step_num=step_num) self.steps.append(step) return step + def to_dict(self) -> Dict[str, Any]: + return { + 'series_name': self.series_name, + 'start_time': self.start_time.isoformat(), + 'end_time': self.end_time.isoformat() if self.end_time else None, + 'final_status': self.final_status, + 'parameters': self.params, + 'steps': [step.to_dict() for step in self.steps], + 'statistics': self.__get_statistics(), + } + def __get_statistics(self) -> Dict[str, Any]: completed_steps = [s for s in self.steps if s.status == 'success'] failed_steps = [s for s 
in self.steps if s.status == 'failed'] @@ -98,13 +95,16 @@ def __get_statistics(self) -> Dict[str, Any]: ), } - def to_dict(self) -> Dict[str, Any]: - return { - 'series_name': self.series_name, - 'start_time': self.start_time.isoformat(), - 'end_time': self.end_time.isoformat() if self.end_time else None, - 'final_status': self.final_status, - 'parameters': self.params, - 'steps': [step.to_dict() for step in self.steps], - 'statistics': self.__get_statistics(), - } + @staticmethod + def __sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: + sanitized = {} + for key, value in params.items(): + if key in set('state_manager'): + continue + if isinstance(value, Path): + sanitized[key] = str(value) + elif isinstance(value, (str, int, float, bool, list, dict, type(None))): + sanitized[key] = value + else: + sanitized[key] = str(value) + return sanitized diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 2e74427e8..c3b4ed44f 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -18,22 +18,22 @@ @dataclass class StepCheckpoint: - step: str - episode: str completed_at: str + episode: str + step: str @dataclass class InProgressStep: - step: str episode: str started_at: str + step: str temp_files: List[str] = field(default_factory=list) @dataclass class ProcessingState: + last_checkpoint: str series_name: str started_at: str - last_checkpoint: str completed_steps: List[StepCheckpoint] = field(default_factory=list) in_progress: Optional[InProgressStep] = None @@ -72,6 +72,19 @@ def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: self.__state_file: Path = working_dir / state_filename self.__state: Optional[ProcessingState] = None + def cleanup(self) -> None: + if self.__state_file.exists(): + console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') + self.__state_file.unlink() + + def is_step_completed(self, step: str, episode: str) -> bool: + if 
self.__state is None: + return False + return any( + (s.step == step and s.episode == episode) + for s in self.__state.completed_steps + ) + def load_or_create_state(self) -> ProcessingState: if self.__state_file.exists(): console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') @@ -92,27 +105,6 @@ def load_or_create_state(self) -> ProcessingState: self.__save_state() return self.__state - def __save_state(self) -> None: - if self.__state is None: - return - self.__state.last_checkpoint = datetime.now().isoformat() - with open(self.__state_file, 'w', encoding='utf-8') as f: - json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) - - def mark_step_started( - self, step: str, episode: str, temp_files: Optional[List[str]] = None, - ) -> None: - if self.__state is None: - raise RuntimeError('State not initialized') - self.__state.in_progress = InProgressStep( - step=step, - episode=episode, - started_at=datetime.now().isoformat(), - temp_files=temp_files or [], - ) - self.__save_state() - console.print(f'[cyan]Started: {step} for {episode}[/cyan]') - def mark_step_completed(self, step: str, episode: str) -> None: if self.__state is None: raise RuntimeError('State not initialized') @@ -126,13 +118,19 @@ def mark_step_completed(self, step: str, episode: str) -> None: self.__save_state() console.print(f'[green]✓ Completed: {step} for {episode}[/green]') - def is_step_completed(self, step: str, episode: str) -> bool: + def mark_step_started( + self, step: str, episode: str, temp_files: Optional[List[str]] = None, + ) -> None: if self.__state is None: - return False - return any( - (s.step == step and s.episode == episode) - for s in self.__state.completed_steps + raise RuntimeError('State not initialized') + self.__state.in_progress = InProgressStep( + step=step, + episode=episode, + started_at=datetime.now().isoformat(), + temp_files=temp_files or [], ) + self.__save_state() + console.print(f'[cyan]Started: {step} for 
{episode}[/cyan]') def __rollback_in_progress(self) -> None: # pylint: disable=unused-private-member if self.__state is None or self.__state.in_progress is None: @@ -152,7 +150,9 @@ def __rollback_in_progress(self) -> None: # pylint: disable=unused-private-membe self.__state.in_progress = None self.__save_state() - def cleanup(self) -> None: - if self.__state_file.exists(): - console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') - self.__state_file.unlink() + def __save_state(self) -> None: + if self.__state is None: + return + self.__state.last_checkpoint = datetime.now().isoformat() + with open(self.__state_file, 'w', encoding='utf-8') as f: + json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) diff --git a/preprocessor/lib/ai/models.py b/preprocessor/lib/ai/models.py index 4e9788a96..ba8e7809b 100644 --- a/preprocessor/lib/ai/models.py +++ b/preprocessor/lib/ai/models.py @@ -13,8 +13,8 @@ class EpisodeInfo(BaseModel): episode_in_season: int overall_episode_number: int - title: str premiere_date: Optional[str] = None + title: str viewership: Optional[str] = None @field_validator('viewership', mode='before') @@ -30,8 +30,8 @@ def __convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: class SeasonMetadata(BaseModel): - season_number: int episodes: List[EpisodeInfo] + season_number: int @model_validator(mode='before') @classmethod @@ -52,11 +52,11 @@ class AllSeasonsMetadata(BaseModel): class EpisodeMetadata(BaseModel): - title: str description: str - summary: str - season: Optional[int] = None episode_number: Optional[int] = None + season: Optional[int] = None + summary: str + title: str class CharacterInfo(BaseModel): diff --git a/preprocessor/lib/ai/provider.py b/preprocessor/lib/ai/provider.py index 0362921ee..7312ae621 100644 --- a/preprocessor/lib/ai/provider.py +++ b/preprocessor/lib/ai/provider.py @@ -38,13 +38,8 @@ class LLMProvider: - __instance: Optional['LLMProvider'] = None __client: 
Optional[BaseLLMClient] = None - - def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> 'LLMProvider': - if cls.__instance is None: - cls.__instance = super().__new__(cls) - return cls.__instance + __instance: Optional['LLMProvider'] = None def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> None: self._parser_mode = parser_mode or ParserMode.NORMAL @@ -55,52 +50,6 @@ def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[Parse else: self.__client = VLLMClient(model_name=model_name) - def __extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: - # pylint: disable=unused-private-member - return self.__process_llm_request( - system_prompt=extract_season_system.get(), - user_prompt=extract_season_user.get().format(url=url, page_text=page_text), - response_model=SeasonMetadata, - error_context=f'extraction failed for {url}', - ) - - def __extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: - # pylint: disable=unused-private-member - return self.__process_llm_request( - system_prompt=extract_episode_metadata_system.get(), - user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), - response_model=EpisodeMetadata, - error_context=f'extraction failed for {url}', - ) - - def __merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: - # pylint: disable=unused-private-member - if not metadata_list: - raise ValueError('No metadata to merge') - if len(metadata_list) == 1: - return metadata_list[0] - - combined_text = '\n\n---\n\n'.join([ - f'Source {i + 1}:\n' - f'Title: {m.title}\n' - f'Description: {m.description}\n' - f'Summary: {m.summary}\n' - f'Season: {m.season}\n' - f'Episode: {m.episode_number}' - for i, m in enumerate(metadata_list) - ]) - - result = self.__process_llm_request( - system_prompt=merge_episode_data_system.get(), - 
user_prompt=merge_episode_data_user.get().format( - num_sources=len(metadata_list), - combined_text=combined_text, - ), - response_model=EpisodeMetadata, - error_context='merge failed', - ) - return result if result else metadata_list[0] - def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[List[SeasonMetadata]]: combined_content = '' for i, page in enumerate(scraped_pages, 1): @@ -142,6 +91,76 @@ def extract_characters( ) return result.characters if result else None + def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> 'LLMProvider': + if cls.__instance is None: + cls.__instance = super().__new__(cls) + return cls.__instance + + def __extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: + # pylint: disable=unused-private-member + return self.__process_llm_request( + system_prompt=extract_episode_metadata_system.get(), + user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), + response_model=EpisodeMetadata, + error_context=f'extraction failed for {url}', + ) + + @staticmethod + def __extract_json(content: str) -> Dict[str, Any]: + try: + if '```json' in content: + start = content.find('```json') + 7 + end = content.find('```', start) + json_str = content[start:end].strip() + elif '```' in content: + start = content.find('```') + 3 + end = content.find('```', start) + json_str = content[start:end].strip() + else: + json_str = content.strip() + return json.loads(json_str) + except json.JSONDecodeError as e: + console.print(f'[red]JSON parse error: {e}[/red]') + console.print(f'[yellow]Raw content:\n{content}[/yellow]') + raise + + def __extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: + # pylint: disable=unused-private-member + return self.__process_llm_request( + system_prompt=extract_season_system.get(), + user_prompt=extract_season_user.get().format(url=url, page_text=page_text), + 
response_model=SeasonMetadata, + error_context=f'extraction failed for {url}', + ) + + def __merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: + # pylint: disable=unused-private-member + if not metadata_list: + raise ValueError('No metadata to merge') + if len(metadata_list) == 1: + return metadata_list[0] + + combined_text = '\n\n---\n\n'.join([ + f'Source {i + 1}:\n' + f'Title: {m.title}\n' + f'Description: {m.description}\n' + f'Summary: {m.summary}\n' + f'Season: {m.season}\n' + f'Episode: {m.episode_number}' + for i, m in enumerate(metadata_list) + ]) + + result = self.__process_llm_request( + system_prompt=merge_episode_data_system.get(), + user_prompt=merge_episode_data_user.get().format( + num_sources=len(metadata_list), + combined_text=combined_text, + ), + response_model=EpisodeMetadata, + error_context='merge failed', + ) + return result if result else metadata_list[0] + def __process_llm_request( self, system_prompt: str, @@ -163,22 +182,3 @@ def __process_llm_request( except Exception as e: console.print(f'[red]LLM {error_context}: {e}[/red]') return None - - @staticmethod - def __extract_json(content: str) -> Dict[str, Any]: - try: - if '```json' in content: - start = content.find('```json') + 7 - end = content.find('```', start) - json_str = content[start:end].strip() - elif '```' in content: - start = content.find('```') + 3 - end = content.find('```', start) - json_str = content[start:end].strip() - else: - json_str = content.strip() - return json.loads(json_str) - except json.JSONDecodeError as e: - console.print(f'[red]JSON parse error: {e}[/red]') - console.print(f'[yellow]Raw content:\n{content}[/yellow]') - raise diff --git a/preprocessor/lib/characters/face_detection.py b/preprocessor/lib/characters/face_detection.py index d2eec0ef1..57bf15d25 100644 --- a/preprocessor/lib/characters/face_detection.py +++ b/preprocessor/lib/characters/face_detection.py @@ -21,6 +21,44 @@ class FaceDetector: + @staticmethod + 
def detect_characters_in_frame( + frame_path: Path, + face_app: FaceAnalysis, + character_vectors: Dict[str, np.ndarray], + threshold: float, + ) -> List[Dict[str, Any]]: + img = cv2.imread(str(frame_path)) + if img is None: + return [] + faces = face_app.get(img) + if not faces: + return [] + detected = [] + for face in faces: + face_embedding = face.normed_embedding + bbox = face.bbox.astype(int) + best_match = None + best_similarity = threshold + for char_name, char_vector in character_vectors.items(): + similarity = np.dot(face_embedding, char_vector) + if similarity > best_similarity: + best_similarity = similarity + best_match = char_name + if best_match is not None: + detected.append({ + 'name': best_match, + 'confidence': float(best_similarity), + 'bbox': { + 'x1': int(bbox[0]), + 'y1': int(bbox[1]), + 'x2': int(bbox[2]), + 'y2': int(bbox[3]), + }, + }) + detected.sort(key=lambda x: x['confidence'], reverse=True) + return detected + @staticmethod def init() -> FaceAnalysis: model_root = os.getenv('INSIGHTFACE_HOME', os.path.expanduser('~/.insightface')) @@ -103,41 +141,3 @@ def __get_face_embedding(img_path: str, face_app: FaceAnalysis) -> Optional[np.n return None faces.sort(key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]), reverse=True) return faces[0].normed_embedding - - @staticmethod - def detect_characters_in_frame( - frame_path: Path, - face_app: FaceAnalysis, - character_vectors: Dict[str, np.ndarray], - threshold: float, - ) -> List[Dict[str, Any]]: - img = cv2.imread(str(frame_path)) - if img is None: - return [] - faces = face_app.get(img) - if not faces: - return [] - detected = [] - for face in faces: - face_embedding = face.normed_embedding - bbox = face.bbox.astype(int) - best_match = None - best_similarity = threshold - for char_name, char_vector in character_vectors.items(): - similarity = np.dot(face_embedding, char_vector) - if similarity > best_similarity: - best_similarity = similarity - best_match = char_name - if 
best_match is not None: - detected.append({ - 'name': best_match, - 'confidence': float(best_similarity), - 'bbox': { - 'x1': int(bbox[0]), - 'y1': int(bbox[1]), - 'x2': int(bbox[2]), - 'y2': int(bbox[3]), - }, - }) - detected.sort(key=lambda x: x['confidence'], reverse=True) - return detected diff --git a/preprocessor/lib/characters/image_search/image_search.py b/preprocessor/lib/characters/image_search/image_search.py index 1437dc780..fdc3305fb 100644 --- a/preprocessor/lib/characters/image_search/image_search.py +++ b/preprocessor/lib/characters/image_search/image_search.py @@ -13,11 +13,11 @@ class BaseImageSearch(ABC): def __init__(self, max_results: int=50): self.max_results = max_results + @property @abstractmethod - def search(self, query: str) -> List[Dict[str, str]]: + def name(self) -> str: pass - @property @abstractmethod - def name(self) -> str: + def search(self, query: str) -> List[Dict[str, str]]: pass diff --git a/preprocessor/lib/characters/models.py b/preprocessor/lib/characters/models.py index 43bff537d..777447e9e 100644 --- a/preprocessor/lib/characters/models.py +++ b/preprocessor/lib/characters/models.py @@ -7,12 +7,12 @@ @dataclass class FaceData: bbox: np.ndarray + face_img: np.ndarray face_vector: np.ndarray - source_image_path: Path source_image_idx: int - face_img: np.ndarray + source_image_path: Path @dataclass class CandidateFace: - faces: list[FaceData] avg_similarity: float + faces: list[FaceData] diff --git a/preprocessor/lib/characters/reference_downloader.py b/preprocessor/lib/characters/reference_downloader.py index 824fc76e1..5997d9f2c 100644 --- a/preprocessor/lib/characters/reference_downloader.py +++ b/preprocessor/lib/characters/reference_downloader.py @@ -52,28 +52,9 @@ def __init__(self, args: Dict[str, Any]): self.face_app: FaceAnalysis = None self.browser_context: Optional[BrowserContext] = None - def __create_search_engine(self) -> BaseImageSearch: - if self.search_mode == 'premium': - serpapi_key = 
settings.image_scraper.serpapi_key - return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) - return DuckDuckGoImageSearch(max_results=self.max_results) - - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'characters_json' not in args: - raise ValueError('characters_json is required') - def get_output_subdir(self, item: Optional['ProcessingItem'] = None) -> str: # pylint: disable=unused-argument return 'character_references' - def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool: - for char in characters: - char_name = char['name'] - output_folder = self.output_dir / char_name.replace(' ', '_').lower() - existing_images = list(output_folder.glob('*.jpg')) - if len(existing_images) < self.images_per_character: - return False - return True - def _execute(self) -> None: if not self.characters_json.exists(): console.print(f'[red]Characters JSON not found: {self.characters_json}[/red]') @@ -113,25 +94,74 @@ def _execute(self) -> None: self.browser_context.close() console.print('[green]✓ Reference download completed[/green]') + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'characters_json' not in args: + raise ValueError('characters_json is required') + + def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool: + for char in characters: + char_name = char['name'] + output_folder = self.output_dir / char_name.replace(' ', '_').lower() + existing_images = list(output_folder.glob('*.jpg')) + if len(existing_images) < self.images_per_character: + return False + return True + + def __check_existing_images( + self, output_folder: Path, char_name: str, progress, + ) -> Optional[int]: + existing_images = list(output_folder.glob('*.jpg')) + if len(existing_images) >= self.images_per_character: + progress.console.print( + f'[green]✓ {char_name}: {len(existing_images)} images ' + f'already exist (skipping)[/green]', + ) + return None + return len(existing_images) + def __count_faces(self, 
img) -> int: faces = self.face_app.get(img) return len(faces) - @staticmethod - def __validate_and_decode_image( - img_bytes: bytes, img_url: str, logger, - ) -> np.ndarray | None: - if not img_bytes: - return None - img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - if img is None or img.size == 0: - logger.debug(f'Failed to decode image from {img_url}') - return None - if len(img.shape) != 3 or img.shape[2] != 3: - logger.debug(f'Image has unexpected shape {img.shape} from {img_url}') - return None - return img + def __create_search_engine(self) -> BaseImageSearch: + if self.search_mode == 'premium': + serpapi_key = settings.image_scraper.serpapi_key + return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) + return DuckDuckGoImageSearch(max_results=self.max_results) + + def __download_character_references(self, char_name: str, progress) -> bool: + output_folder = self.__prepare_output_folder(char_name) + saved_count = self.__check_existing_images(output_folder, char_name, progress) + if saved_count is None: + return False + search_query = f'Serial {self.series_name} {char_name} postać' + progress.console.print( + f'[cyan]Searching [{self.search_engine.name}]: {search_query}[/cyan]', + ) + for attempt in range(settings.image_scraper.retry_attempts): + try: + results = self.search_engine.search(search_query) + saved_count = self.__process_search_results( + results, output_folder, saved_count, + ) + break + except KeyboardInterrupt: + progress.console.print('\n[yellow]Download interrupted[/yellow]') + raise + except Exception as e: + if attempt < settings.image_scraper.retry_attempts - 1: + delay = settings.image_scraper.retry_delay * 2 ** attempt + self.logger.warning( + f'Attempt {attempt + 1} failed for {char_name}, ' + f'retrying in {delay}s: {e}', + ) + time.sleep(delay) + else: + self.logger.error( + f'All retry attempts failed for {char_name}: {e}', + ) + 
self.__print_results(char_name, saved_count, progress) + return True def __download_image_with_browser( self, img_url: str, page: Page, @@ -166,38 +196,23 @@ def __prepare_output_folder(self, char_name: str) -> Path: output_folder.mkdir(parents=True, exist_ok=True) return output_folder - def __check_existing_images( - self, output_folder: Path, char_name: str, progress, - ) -> Optional[int]: - existing_images = list(output_folder.glob('*.jpg')) - if len(existing_images) >= self.images_per_character: + def __print_results( + self, char_name: str, saved_count: int, progress, + ) -> None: + if saved_count >= self.images_per_character: progress.console.print( - f'[green]✓ {char_name}: {len(existing_images)} images ' - f'already exist (skipping)[/green]', + f'[green]✓[/green] {char_name}: ' + f'{saved_count}/{self.images_per_character} images', + ) + elif saved_count > 0: + progress.console.print( + f'[yellow]⚠[/yellow] {char_name}: ' + f'{saved_count}/{self.images_per_character} images (incomplete)', + ) + else: + progress.console.print( + f'[red]✗[/red] {char_name}: No suitable images found', ) - return None - return len(existing_images) - - def __validate_and_save_image( - self, img: np.ndarray, img_url: str, output_folder: Path, saved_count: int, - ) -> bool: - if not isinstance(img, np.ndarray) or img.size == 0: - self.logger.debug(f'Invalid image array from {img_url}') - return False - h, w = img.shape[:2] - if w < self.min_width or h < self.min_height: - return False - try: - face_count = self.__count_faces(img) - except Exception as face_err: - self.logger.debug(f'Face detection failed for {img_url}: {face_err}') - return False - if face_count != 1: - return False - filename = f'{saved_count:02d}.jpg' - path = output_folder / filename - cv2.imwrite(str(path), img) - return True def __process_search_results( self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int, @@ -230,54 +245,39 @@ def __process_search_results( page.close() return 
saved_count - def __print_results( - self, char_name: str, saved_count: int, progress, - ) -> None: - if saved_count >= self.images_per_character: - progress.console.print( - f'[green]✓[/green] {char_name}: ' - f'{saved_count}/{self.images_per_character} images', - ) - elif saved_count > 0: - progress.console.print( - f'[yellow]⚠[/yellow] {char_name}: ' - f'{saved_count}/{self.images_per_character} images (incomplete)', - ) - else: - progress.console.print( - f'[red]✗[/red] {char_name}: No suitable images found', - ) + @staticmethod + def __validate_and_decode_image( + img_bytes: bytes, img_url: str, logger, + ) -> np.ndarray | None: + if not img_bytes: + return None + img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) + img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + if img is None or img.size == 0: + logger.debug(f'Failed to decode image from {img_url}') + return None + if len(img.shape) != 3 or img.shape[2] != 3: + logger.debug(f'Image has unexpected shape {img.shape} from {img_url}') + return None + return img - def __download_character_references(self, char_name: str, progress) -> bool: - output_folder = self.__prepare_output_folder(char_name) - saved_count = self.__check_existing_images(output_folder, char_name, progress) - if saved_count is None: + def __validate_and_save_image( + self, img: np.ndarray, img_url: str, output_folder: Path, saved_count: int, + ) -> bool: + if not isinstance(img, np.ndarray) or img.size == 0: + self.logger.debug(f'Invalid image array from {img_url}') return False - search_query = f'Serial {self.series_name} {char_name} postać' - progress.console.print( - f'[cyan]Searching [{self.search_engine.name}]: {search_query}[/cyan]', - ) - for attempt in range(settings.image_scraper.retry_attempts): - try: - results = self.search_engine.search(search_query) - saved_count = self.__process_search_results( - results, output_folder, saved_count, - ) - break - except KeyboardInterrupt: - progress.console.print('\n[yellow]Download 
interrupted[/yellow]') - raise - except Exception as e: - if attempt < settings.image_scraper.retry_attempts - 1: - delay = settings.image_scraper.retry_delay * 2 ** attempt - self.logger.warning( - f'Attempt {attempt + 1} failed for {char_name}, ' - f'retrying in {delay}s: {e}', - ) - time.sleep(delay) - else: - self.logger.error( - f'All retry attempts failed for {char_name}: {e}', - ) - self.__print_results(char_name, saved_count, progress) + h, w = img.shape[:2] + if w < self.min_width or h < self.min_height: + return False + try: + face_count = self.__count_faces(img) + except Exception as face_err: + self.logger.debug(f'Face detection failed for {img_url}: {face_err}') + return False + if face_count != 1: + return False + filename = f'{saved_count:02d}.jpg' + path = output_folder / filename + cv2.imwrite(str(path), img) return True diff --git a/preprocessor/lib/core/logging.py b/preprocessor/lib/core/logging.py index 82f903989..6e5581316 100644 --- a/preprocessor/lib/core/logging.py +++ b/preprocessor/lib/core/logging.py @@ -30,35 +30,13 @@ def __del__(self) -> None: self.__logger.error(f'- {error}') raise LoggerNotFinalizedException - def __setup_logger(self, level: int) -> None: - logging.basicConfig( - level=level, - format='%(message)s', - handlers=[ - RichHandler( - console=console, - rich_tracebacks=True, - show_time=True, - show_path=False, - ), - ], - force=True, - ) - self.__logger: logging.Logger = logging.getLogger(self.__class_name) - - def info(self, message: str) -> None: - self.__logger.info(message) + def debug(self, message: str) -> None: + self.__logger.debug(message) def error(self, message: str) -> None: self.__logger.error(message) self.__errors.append(message) - def warning(self, message: str) -> None: - self.__logger.warning(message) - - def debug(self, message: str) -> None: - self.__logger.debug(message) - def finalize(self) -> int: self.__is_finalized = True if self.__errors: @@ -80,3 +58,25 @@ def finalize(self) -> int: ), ) return 
0 + + def info(self, message: str) -> None: + self.__logger.info(message) + + def warning(self, message: str) -> None: + self.__logger.warning(message) + + def __setup_logger(self, level: int) -> None: + logging.basicConfig( + level=level, + format='%(message)s', + handlers=[ + RichHandler( + console=console, + rich_tracebacks=True, + show_time=True, + show_path=False, + ), + ], + force=True, + ) + self.__logger: logging.Logger = logging.getLogger(self.__class_name) diff --git a/preprocessor/lib/episodes/episode_manager.py b/preprocessor/lib/episodes/episode_manager.py index 08958db7c..2d92974fa 100644 --- a/preprocessor/lib/episodes/episode_manager.py +++ b/preprocessor/lib/episodes/episode_manager.py @@ -21,22 +21,22 @@ @dataclass class EpisodeInfo: absolute_episode: int - season: int relative_episode: int + season: int title: str - series_name: Optional[str] = None premiere_date: Optional[str] = None + series_name: Optional[str] = None viewership: Optional[str] = None def episode_code(self) -> str: return f'S{self.season:02d}E{self.relative_episode:02d}' - def season_code(self) -> str: - return f'S{self.season:02d}' - def episode_num(self) -> str: return f'E{self.relative_episode:02d}' + def season_code(self) -> str: + return f'S{self.season:02d}' + def __is_special(self) -> bool: # pylint: disable=unused-private-member return self.season == 0 @@ -51,39 +51,6 @@ def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: with open(episodes_info_json, 'r', encoding='utf-8') as f: self.episodes_data = json.load(f) - def __create_episode_info( - self, - season: int, - relative_episode: int, - title: Optional[str]=None, - premiere_date: Optional[str]=None, - viewership: Optional[str]=None, - ) -> EpisodeInfo: - return EpisodeInfo( - absolute_episode=0, - season=season, - relative_episode=relative_episode, - title=title or f'S{season:02d}E{relative_episode:02d}', - series_name=self.series_name, - premiere_date=premiere_date, - 
viewership=viewership, - ) - - def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: - full_path_str = str(file_path) - match_season_episode = re.search('S(\\d+)[/\\\\]?E(\\d+)', full_path_str, re.IGNORECASE) - if match_season_episode: - season = int(match_season_episode.group(1)) - episode = int(match_season_episode.group(2)) - return self.get_episode_by_season_and_relative(season, episode) - if self._logger: - self._logger.error( - f'Cannot parse episode from filename: {file_path.name}. ' - 'Expected format: S##E## (e.g., S01E05, S10E13). ' - 'Absolute episode numbers (E## without season) are not supported.', - ) - return None - def get_episode_by_season_and_relative(self, season: int, relative_episode: int) -> EpisodeInfo: if not self.episodes_data: return self.__create_episode_info(season, relative_episode) @@ -108,21 +75,60 @@ def get_episode_by_season_and_relative(self, season: int, relative_episode: int) return self.__create_episode_info(season, relative_episode) @staticmethod - def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: # pylint: disable=unused-private-member + def get_episode_id_for_state(episode_info: EpisodeInfo) -> str: + return episode_info.episode_code() + + @staticmethod + def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]: + return { + 'season': episode_info.season, + 'episode_number': episode_info.relative_episode, + 'title': episode_info.title, + 'premiere_date': episode_info.premiere_date, + 'viewership': episode_info.viewership, + } + + def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: + full_path_str = str(file_path) + match_season_episode = re.search('S(\\d+)[/\\\\]?E(\\d+)', full_path_str, re.IGNORECASE) + if match_season_episode: + season = int(match_season_episode.group(1)) + episode = int(match_season_episode.group(2)) + return self.get_episode_by_season_and_relative(season, episode) + if self._logger: + self._logger.error( + f'Cannot parse episode from 
filename: {file_path.name}. ' + 'Expected format: S##E## (e.g., S01E05, S10E13). ' + 'Absolute episode numbers (E## without season) are not supported.', + ) + return None + + def __create_episode_info( + self, + season: int, + relative_episode: int, + title: Optional[str]=None, + premiere_date: Optional[str]=None, + viewership: Optional[str]=None, + ) -> EpisodeInfo: + return EpisodeInfo( + absolute_episode=0, + season=season, + relative_episode=relative_episode, + title=title or f'S{season:02d}E{relative_episode:02d}', + series_name=self.series_name, + premiere_date=premiere_date, + viewership=viewership, + ) + + @staticmethod + def __find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: if not search_dir.exists(): return None - if search_dir.is_file(): - return search_dir episode_code = episode_info.episode_code() - season_dir_name = episode_info.season_code() - search_dirs = [search_dir / season_dir_name, search_dir] - for dir_path in search_dirs: - if not dir_path.exists(): - continue - for ext in SUPPORTED_VIDEO_EXTENSIONS: - for video_file in dir_path.glob(f'*{ext}'): - if re.search(episode_code, video_file.name, re.IGNORECASE): - return video_file + pattern = f'**/*{episode_code}*_scenes.json' + for scene_file in search_dir.glob(pattern): + return scene_file return None def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]: # pylint: disable=unused-private-member @@ -142,48 +148,23 @@ def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, return None @staticmethod - def __find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: + def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: # pylint: disable=unused-private-member if not search_dir.exists(): return None + if search_dir.is_file(): + return search_dir episode_code = episode_info.episode_code() - pattern 
= f'**/*{episode_code}*_scenes.json' - for scene_file in search_dir.glob(pattern): - return scene_file + season_dir_name = episode_info.season_code() + search_dirs = [search_dir / season_dir_name, search_dir] + for dir_path in search_dirs: + if not dir_path.exists(): + continue + for ext in SUPPORTED_VIDEO_EXTENSIONS: + for video_file in dir_path.glob(f'*{ext}'): + if re.search(episode_code, video_file.name, re.IGNORECASE): + return video_file return None - @staticmethod - def __load_scene_timestamps( # pylint: disable=unused-private-member - episode_info: EpisodeInfo, - search_dir: Optional[Path], - _logger: Optional[ErrorHandlingLogger]=None, - ) -> Optional[List[Dict[str, Any]]]: - if not search_dir: - return None - scene_file = EpisodeManager.__find_scene_timestamps_file(episode_info, search_dir) - if not scene_file: - return None - try: - with open(scene_file, 'r', encoding='utf-8') as f: - return json.load(f) - except (OSError, json.JSONDecodeError) as e: - if _logger: - _logger.error(f'Failed to load scene timestamps: {e}') - return None - - @staticmethod - def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]: - return { - 'season': episode_info.season, - 'episode_number': episode_info.relative_episode, - 'title': episode_info.title, - 'premiere_date': episode_info.premiere_date, - 'viewership': episode_info.viewership, - } - - @staticmethod - def get_episode_id_for_state(episode_info: EpisodeInfo) -> str: - return episode_info.episode_code() - def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-private-member episodes: List[EpisodeInfo] = [] if not self.episodes_data: @@ -202,3 +183,22 @@ def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-pri ), ) return episodes + + @staticmethod + def __load_scene_timestamps( # pylint: disable=unused-private-member + episode_info: EpisodeInfo, + search_dir: Optional[Path], + _logger: Optional[ErrorHandlingLogger]=None, + ) -> Optional[List[Dict[str, Any]]]: + if 
not search_dir: + return None + scene_file = EpisodeManager.__find_scene_timestamps_file(episode_info, search_dir) + if not scene_file: + return None + try: + with open(scene_file, 'r', encoding='utf-8') as f: + return json.load(f) + except (OSError, json.JSONDecodeError) as e: + if _logger: + _logger.error(f'Failed to load scene timestamps: {e}') + return None diff --git a/preprocessor/lib/io/files.py b/preprocessor/lib/io/files.py index e3efbd026..160bd2e53 100644 --- a/preprocessor/lib/io/files.py +++ b/preprocessor/lib/io/files.py @@ -9,17 +9,6 @@ class FileOperations: - @staticmethod - def __atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: - temp_path = path.with_suffix(path.suffix + '.tmp') - try: - write_func(temp_path) - temp_path.replace(path) - except Exception: - if temp_path.exists(): - temp_path.unlink() - raise - @staticmethod def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None: @@ -33,6 +22,17 @@ def load_json(path: Path) -> Dict[str, Any]: with open(path, 'r', encoding='utf-8') as f: return json.load(f) + @staticmethod + def __atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: + temp_path = path.with_suffix(path.suffix + '.tmp') + try: + write_func(temp_path) + temp_path.replace(path) + except Exception: + if temp_path.exists(): + temp_path.unlink() + raise + @staticmethod def __atomic_write_text(path: Path, content: str) -> None: # pylint: disable=unused-private-member diff --git a/preprocessor/lib/io/metadata.py b/preprocessor/lib/io/metadata.py index d1ef87124..fe3c074c0 100644 --- a/preprocessor/lib/io/metadata.py +++ b/preprocessor/lib/io/metadata.py @@ -12,8 +12,22 @@ class MetadataBuilder: @staticmethod - def __create_minimal_episode_info(episode_info) -> Dict[str, Any]: - return {'season': episode_info.season, 'episode_number': episode_info.relative_episode} + def create_embedding_collection( + episode_id: str, + episode_info: Any, + path: Path, + model_name: str, + 
embedding_count: int, + embedding_type: str, + ) -> EmbeddingCollection: + return EmbeddingCollection( + episode_id=episode_id, + episode_info=episode_info, + path=path, + model_name=model_name, + embedding_count=embedding_count, + embedding_type=embedding_type, + ) @staticmethod def create_processing_metadata( @@ -32,19 +46,5 @@ def create_processing_metadata( } @staticmethod - def create_embedding_collection( - episode_id: str, - episode_info: Any, - path: Path, - model_name: str, - embedding_count: int, - embedding_type: str, - ) -> EmbeddingCollection: - return EmbeddingCollection( - episode_id=episode_id, - episode_info=episode_info, - path=path, - model_name=model_name, - embedding_count=embedding_count, - embedding_type=embedding_type, - ) + def __create_minimal_episode_info(episode_info) -> Dict[str, Any]: + return {'season': episode_info.season, 'episode_number': episode_info.relative_episode} diff --git a/preprocessor/lib/media/ffmpeg.py b/preprocessor/lib/media/ffmpeg.py index 48008e170..01f2259a4 100644 --- a/preprocessor/lib/media/ffmpeg.py +++ b/preprocessor/lib/media/ffmpeg.py @@ -12,15 +12,101 @@ class FFmpegWrapper: - __PROFILE = 'main' - __LEVEL = '4.1' - __PIX_FMT = 'yuv420p' + __AQ_STRENGTH = '15' + __AUDIO_CHANNELS = '2' __BF = '2' __B_ADAPT = '1' - __TWO_PASS = '1' + __LEVEL = '4.1' + __PIX_FMT = 'yuv420p' + __PROFILE = 'main' __RC_LOOKAHEAD = '32' - __AQ_STRENGTH = '15' - __AUDIO_CHANNELS = '2' + __TWO_PASS = '1' + + @staticmethod + def detect_interlacing( + video_path: Path, + analysis_time: Optional[int] = None, + threshold: float = 0.15, + ) -> Tuple[bool, Optional[Dict[str, Any]]]: + cmd = [ + 'ffmpeg', + '-i', str(video_path), + ] + + if analysis_time: + cmd.extend(['-t', str(analysis_time)]) + + cmd.extend([ + '-vf', 'idet', + '-an', + '-f', 'null', + '-', + ]) + + result = subprocess.run( + cmd, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + encoding='utf-8', + errors='ignore', + check=False, + ) + + if 
result.returncode != 0: + return (False, None) + + stats = FFmpegWrapper.__parse_idet_output(result.stderr) + if stats is None: + return (False, None) + + total_interlaced = stats['tff'] + stats['bff'] + total_frames = total_interlaced + stats['progressive'] + + if total_frames == 0: + return (False, None) + + ratio = total_interlaced / total_frames + stats['ratio'] = ratio + + return (ratio > threshold, stats) + + @staticmethod + def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'audio') + if not stream: + return None + bit_rate = stream.get('bit_rate') + if not bit_rate: + return None + return int(int(bit_rate) / 1000) + + @staticmethod + def get_framerate(probe_data: Dict[str, Any]) -> float: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') + if not stream: + raise ValueError('No video streams found') + r_frame_rate = stream.get('r_frame_rate') + if not r_frame_rate: + raise ValueError('Frame rate not found') + num, denom = [int(x) for x in r_frame_rate.split('/')] + return num / denom + + @staticmethod + def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') + if not stream: + return None + bit_rate = stream.get('bit_rate') + if not bit_rate: + return None + return round(int(bit_rate) / 1000000, 2) + + @staticmethod + def probe_video(video_path: Path) -> Dict[str, Any]: + cmd = ['ffprobe', '-v', 'error', '-show_streams', '-show_format', '-of', 'json', str(video_path)] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return json.loads(result.stdout) @staticmethod def transcode( # pylint: disable=too-many-arguments @@ -54,56 +140,18 @@ def transcode( # pylint: disable=too-many-arguments subprocess.run(command, check=True, capture_output=False) @staticmethod - def probe_video(video_path: Path) -> Dict[str, Any]: - cmd = ['ffprobe', '-v', 'error', '-show_streams', 
'-show_format', '-of', 'json', str(video_path)] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - return json.loads(result.stdout) - - @staticmethod - def get_framerate(probe_data: Dict[str, Any]) -> float: - stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') - if not stream: - raise ValueError('No video streams found') - r_frame_rate = stream.get('r_frame_rate') - if not r_frame_rate: - raise ValueError('Frame rate not found') - num, denom = [int(x) for x in r_frame_rate.split('/')] - return num / denom - - @staticmethod - def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]: - stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') - if not stream: - return None - bit_rate = stream.get('bit_rate') - if not bit_rate: - return None - return round(int(bit_rate) / 1000000, 2) - - @staticmethod - def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]: - stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'audio') - if not stream: - return None - bit_rate = stream.get('bit_rate') - if not bit_rate: - return None - return int(int(bit_rate) / 1000) - - @staticmethod - def __build_video_filter(width: int, height: int, deinterlace: bool = False) -> str: - filters = [] - - if deinterlace: - filters.append('bwdif=mode=0') - - filters.append( - f"scale='iw*sar:ih',scale={width}:{height}:" - f"force_original_aspect_ratio=decrease,pad={width}:{height}:" - f"(ow-iw)/2:(oh-ih)/2:black,setsar=1", - ) - - return ','.join(filters) + def __build_audio_and_output_params( + audio_bitrate: str, vf_filter: str, output_path: Path, + ) -> List[str]: + return [ + '-c:a', 'aac', + '-b:a', audio_bitrate, + '-ac', FFmpegWrapper.__AUDIO_CHANNELS, + '-vf', vf_filter, + '-movflags', '+faststart', + '-f', 'mp4', + str(output_path), + ] @staticmethod def __build_base_command( @@ -144,72 +192,24 @@ def __build_encoding_params( ] @staticmethod - def __build_audio_and_output_params( - audio_bitrate: str, vf_filter: str, 
output_path: Path, - ) -> List[str]: - return [ - '-c:a', 'aac', - '-b:a', audio_bitrate, - '-ac', FFmpegWrapper.__AUDIO_CHANNELS, - '-vf', vf_filter, - '-movflags', '+faststart', - '-f', 'mp4', - str(output_path), - ] - - @staticmethod - def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]: - streams = [s for s in probe_data.get('streams', []) if s.get('codec_type') == codec_type] - return streams[0] if streams else None - - @staticmethod - def detect_interlacing( - video_path: Path, - analysis_time: Optional[int] = None, - threshold: float = 0.15, - ) -> Tuple[bool, Optional[Dict[str, Any]]]: - cmd = [ - 'ffmpeg', - '-i', str(video_path), - ] - - if analysis_time: - cmd.extend(['-t', str(analysis_time)]) + def __build_video_filter(width: int, height: int, deinterlace: bool = False) -> str: + filters = [] - cmd.extend([ - '-vf', 'idet', - '-an', - '-f', 'null', - '-', - ]) + if deinterlace: + filters.append('bwdif=mode=0') - result = subprocess.run( - cmd, - stdout=subprocess.DEVNULL, - stderr=subprocess.PIPE, - text=True, - encoding='utf-8', - errors='ignore', - check=False, + filters.append( + f"scale='iw*sar:ih',scale={width}:{height}:" + f"force_original_aspect_ratio=decrease,pad={width}:{height}:" + f"(ow-iw)/2:(oh-ih)/2:black,setsar=1", ) - if result.returncode != 0: - return (False, None) - - stats = FFmpegWrapper.__parse_idet_output(result.stderr) - if stats is None: - return (False, None) - - total_interlaced = stats['tff'] + stats['bff'] - total_frames = total_interlaced + stats['progressive'] - - if total_frames == 0: - return (False, None) - - ratio = total_interlaced / total_frames - stats['ratio'] = ratio + return ','.join(filters) - return (ratio > threshold, stats) + @staticmethod + def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optional[Dict[str, Any]]: + streams = [s for s in probe_data.get('streams', []) if s.get('codec_type') == codec_type] + return streams[0] if streams 
else None @staticmethod def __parse_idet_output(stderr: str) -> Optional[Dict[str, int]]: diff --git a/preprocessor/lib/media/resolution.py b/preprocessor/lib/media/resolution.py index 5930f39c3..fd255dccd 100644 --- a/preprocessor/lib/media/resolution.py +++ b/preprocessor/lib/media/resolution.py @@ -8,15 +8,15 @@ T = TypeVar('T', bound='Resolution') class Resolution(Enum): - R4320P = (7680, 4320) - R2160P = (3840, 2160) - R1440P = (2560, 1440) R1080P = (1920, 1080) - R720P = (1280, 720) - R480P = (854, 480) - R360P = (640, 360) - R240P = (426, 240) + R1440P = (2560, 1440) R144P = (256, 144) + R2160P = (3840, 2160) + R240P = (426, 240) + R360P = (640, 360) + R4320P = (7680, 4320) + R480P = (854, 480) + R720P = (1280, 720) def __init__(self, width: int, height: int): self.width = width diff --git a/preprocessor/lib/media/scene_detection.py b/preprocessor/lib/media/scene_detection.py index f39a48ce7..9c1e9329c 100644 --- a/preprocessor/lib/media/scene_detection.py +++ b/preprocessor/lib/media/scene_detection.py @@ -18,10 +18,13 @@ class TransNetWrapper: def __init__(self): self.model: Optional[TransNetV2] = None - def load_model(self) -> None: - if not torch.cuda.is_available(): - raise RuntimeError('CUDA not available') - self.model = TransNetV2().cuda() + def cleanup(self) -> None: + if self.model is not None: + del self.model + self.model = None + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() def detect_scenes( self, @@ -45,13 +48,10 @@ def detect_scenes( except (RuntimeError, ValueError, OSError) as e: raise RuntimeError(f'TransNetV2 detection failed: {e}') from e - def cleanup(self) -> None: - if self.model is not None: - del self.model - self.model = None - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() + def load_model(self) -> None: + if not torch.cuda.is_available(): + raise RuntimeError('CUDA not available') + self.model = TransNetV2().cuda() def __build_scenes_from_predictions( self, @@ -97,6 +97,15 @@ 
def __create_scene_dict( 'frame_count': int(end_frame - start_frame), } + @staticmethod + def __frame_to_timecode(frame: int, fps: float) -> str: + seconds = frame / fps + hours = int(seconds // 3600) + minutes = int(seconds % 3600 // 60) + secs = int(seconds % 60) + frames = int(seconds % 1 * fps) + return f'{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}' + @staticmethod def __get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: try: @@ -107,12 +116,3 @@ def __get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: return {'fps': fps, 'duration': duration, 'total_frames': total_frames} except (RuntimeError, ValueError, OSError): return None - - @staticmethod - def __frame_to_timecode(frame: int, fps: float) -> str: - seconds = frame / fps - hours = int(seconds // 3600) - minutes = int(seconds % 3600 // 60) - secs = int(seconds % 60) - frames = int(seconds % 1 * fps) - return f'{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}' diff --git a/preprocessor/lib/search/elasticsearch.py b/preprocessor/lib/search/elasticsearch.py index 949a0ef85..9a2f6f46b 100644 --- a/preprocessor/lib/search/elasticsearch.py +++ b/preprocessor/lib/search/elasticsearch.py @@ -18,29 +18,6 @@ def __init__(self, index_name: str, host: str='localhost:9200', dry_run: bool=Fa self.dry_run: bool = dry_run self._client: Optional[AsyncElasticsearch] = None - async def _get_client(self) -> AsyncElasticsearch: - if self._client is None: - self._client = AsyncElasticsearch([self.host], verify_certs=False, ssl_show_warn=False) - return self._client - - async def index_exists(self) -> bool: - if self.dry_run: - return False - client = await self._get_client() - return await client.indices.exists(index=self.index_name) - - async def create_index(self, mapping: Dict[str, Any]) -> None: - if self.dry_run: - return - client = await self._get_client() - await client.indices.create(index=self.index_name, body=mapping) - - async def delete_index(self) -> None: - if self.dry_run: - return - 
client = await self._get_client() - await client.indices.delete(index=self.index_name, ignore=[404]) - async def bulk_index(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]: if self.dry_run: return {'indexed': len(documents), 'errors': []} @@ -59,3 +36,26 @@ async def close(self) -> None: if self._client is not None: await self._client.close() self._client = None + + async def create_index(self, mapping: Dict[str, Any]) -> None: + if self.dry_run: + return + client = await self._get_client() + await client.indices.create(index=self.index_name, body=mapping) + + async def delete_index(self) -> None: + if self.dry_run: + return + client = await self._get_client() + await client.indices.delete(index=self.index_name, ignore=[404]) + + async def index_exists(self) -> bool: + if self.dry_run: + return False + client = await self._get_client() + return await client.indices.exists(index=self.index_name) + + async def _get_client(self) -> AsyncElasticsearch: + if self._client is None: + self._client = AsyncElasticsearch([self.host], verify_certs=False, ssl_show_warn=False) + return self._client diff --git a/preprocessor/lib/text/language_config.py b/preprocessor/lib/text/language_config.py index 4dabcb0f4..1f61f07b3 100644 --- a/preprocessor/lib/text/language_config.py +++ b/preprocessor/lib/text/language_config.py @@ -4,10 +4,10 @@ @dataclass class LanguageConfig: - vowels: Set[str] consonants: Set[str] punctuation: Set[str] special_chars: Set[str] + vowels: Set[str] POLISH_VOWELS = set('aąeęioóuyAĄEĘIOÓUY') POLISH_CONSONANTS = set('bcćdfghjklłmnńprsśtwzźżBCĆDFGHJKLŁMNŃPRSŚTWZŹŻ') ENGLISH_VOWELS = set('aeiouAEIOU') diff --git a/preprocessor/lib/text/text_statistics.py b/preprocessor/lib/text/text_statistics.py index da972bfee..bf23b0ca7 100644 --- a/preprocessor/lib/text/text_statistics.py +++ b/preprocessor/lib/text/text_statistics.py @@ -21,30 +21,30 @@ @dataclass class TextStatistics: # pylint: disable=too-many-instance-attributes text: str + 
avg_sentence_length: float = 0.0 + avg_word_length: float = 0.0 + bigrams: List[Dict[str, Any]] = field(default_factory=list) + chars_without_spaces: int = 0 + consonants: int = 0 + digits: int = 0 + empty_lines: int = 0 language: str = 'pl' - sentences: int = 0 + letter_frequency: Dict[str, int] = field(default_factory=dict) + letters: int = 0 lines: int = 0 paragraphs: int = 0 - empty_lines: int = 0 - words: int = 0 - letters: int = 0 - digits: int = 0 - symbols: int = 0 punctuation_marks: int = 0 - special_characters: int = 0 - chars_without_spaces: int = 0 + sentences: int = 0 spaces: int = 0 + special_characters: int = 0 + symbols: int = 0 total_chars: int = 0 - vowels: int = 0 - consonants: int = 0 - unique_words: int = 0 - avg_word_length: float = 0.0 - avg_sentence_length: float = 0.0 + trigrams: List[Dict[str, Any]] = field(default_factory=list) type_token_ratio: float = 0.0 - letter_frequency: Dict[str, int] = field(default_factory=dict) + unique_words: int = 0 + vowels: int = 0 word_frequency: List[Dict[str, Any]] = field(default_factory=list) - bigrams: List[Dict[str, Any]] = field(default_factory=list) - trigrams: List[Dict[str, Any]] = field(default_factory=list) + words: int = 0 @classmethod def from_file(cls, file_path: Path, language: str='pl') -> 'TextStatistics': @@ -54,11 +54,36 @@ def from_file(cls, file_path: Path, language: str='pl') -> 'TextStatistics': stats.__calculate() return stats - @classmethod - def __from_text(cls, text: str, language: str='pl') -> 'TextStatistics': # pylint: disable=unused-private-member - stats = cls(text=text, language=language) - stats.__calculate() - return stats + def to_dict(self) -> Dict[str, Any]: + return { + 'basic_statistics': { + 'sentences': self.sentences, + 'lines': self.lines, + 'paragraphs': self.paragraphs, + 'empty_lines': self.empty_lines, + 'words': self.words, + 'letters': self.letters, + 'digits': self.digits, + 'symbols': self.symbols, + 'punctuation_marks': self.punctuation_marks, + 
'special_characters': self.special_characters, + 'chars_without_spaces': self.chars_without_spaces, + 'spaces': self.spaces, + 'total_chars': self.total_chars, + 'vowels': self.vowels, + 'consonants': self.consonants, + }, + 'advanced_statistics': { + 'unique_words': self.unique_words, + 'avg_word_length': self.avg_word_length, + 'avg_sentence_length': self.avg_sentence_length, + 'type_token_ratio': self.type_token_ratio, + }, + 'letter_frequency': self.letter_frequency, + 'word_frequency': self.word_frequency, + 'bigrams': self.bigrams, + 'trigrams': self.trigrams, + } def __calculate(self) -> None: # pylint: disable=unused-private-member self.__calculate_basic_stats() @@ -66,11 +91,16 @@ def __calculate(self) -> None: # pylint: disable=unused-private-member self.__calculate_word_stats() self.__calculate_advanced_stats() - def __get_config(self) -> LanguageConfig: - return POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG - - def __get_words(self) -> List[str]: - return re.findall('\\b\\w+\\b', self.text.lower()) + def __calculate_advanced_stats(self) -> None: + if self.sentences > 0: + self.avg_sentence_length = round(self.words / self.sentences, 2) + words = self.__get_words() + if len(words) >= 2: + bigram_counter = Counter(zip(words[:-1], words[1:])) + self.bigrams = [{'bigram': f'{w1} {w2}', 'count': count} for (w1, w2), count in bigram_counter.most_common(25)] + if len(words) >= 3: + trigram_counter = Counter(zip(words[:-2], words[1:-1], words[2:])) + self.trigrams = [{'trigram': f'{w1} {w2} {w3}', 'count': count} for (w1, w2, w3), count in trigram_counter.most_common(25)] def __calculate_basic_stats(self) -> None: lines = self.text.split('\n') @@ -116,44 +146,14 @@ def __calculate_word_stats(self) -> None: self.avg_word_length = round(sum(word_lengths) / len(word_lengths), 2) if word_lengths else 0.0 self.word_frequency = [{'word': word, 'count': count} for word, count in word_counter.most_common(50)] - def __calculate_advanced_stats(self) -> None: 
- if self.sentences > 0: - self.avg_sentence_length = round(self.words / self.sentences, 2) - words = self.__get_words() - if len(words) >= 2: - bigram_counter = Counter(zip(words[:-1], words[1:])) - self.bigrams = [{'bigram': f'{w1} {w2}', 'count': count} for (w1, w2), count in bigram_counter.most_common(25)] - if len(words) >= 3: - trigram_counter = Counter(zip(words[:-2], words[1:-1], words[2:])) - self.trigrams = [{'trigram': f'{w1} {w2} {w3}', 'count': count} for (w1, w2, w3), count in trigram_counter.most_common(25)] + @classmethod + def __from_text(cls, text: str, language: str='pl') -> 'TextStatistics': # pylint: disable=unused-private-member + stats = cls(text=text, language=language) + stats.__calculate() + return stats - def to_dict(self) -> Dict[str, Any]: - return { - 'basic_statistics': { - 'sentences': self.sentences, - 'lines': self.lines, - 'paragraphs': self.paragraphs, - 'empty_lines': self.empty_lines, - 'words': self.words, - 'letters': self.letters, - 'digits': self.digits, - 'symbols': self.symbols, - 'punctuation_marks': self.punctuation_marks, - 'special_characters': self.special_characters, - 'chars_without_spaces': self.chars_without_spaces, - 'spaces': self.spaces, - 'total_chars': self.total_chars, - 'vowels': self.vowels, - 'consonants': self.consonants, - }, - 'advanced_statistics': { - 'unique_words': self.unique_words, - 'avg_word_length': self.avg_word_length, - 'avg_sentence_length': self.avg_sentence_length, - 'type_token_ratio': self.type_token_ratio, - }, - 'letter_frequency': self.letter_frequency, - 'word_frequency': self.word_frequency, - 'bigrams': self.bigrams, - 'trigrams': self.trigrams, - } + def __get_config(self) -> LanguageConfig: + return POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG + + def __get_words(self) -> List[str]: + return re.findall('\\b\\w+\\b', self.text.lower()) diff --git a/preprocessor/lib/transcription/elevenlabs.py b/preprocessor/lib/transcription/elevenlabs.py index 
55de2bf8b..53930e7a6 100644 --- a/preprocessor/lib/transcription/elevenlabs.py +++ b/preprocessor/lib/transcription/elevenlabs.py @@ -23,20 +23,6 @@ class ElevenLabsTranscriber(BaseProcessor): - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'videos' not in args: - raise ValueError('videos is required') - if 'output_dir' not in args: - raise ValueError('output_dir is required') - if 'series_name' not in args: - raise ValueError('series_name is required') - videos_path = Path(args['videos']) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - def __init__(self, args: Dict[str, Any]): super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=5, loglevel=logging.DEBUG) self.input_videos: Path = Path(self._args['videos']) @@ -49,6 +35,9 @@ def __init__(self, args: Dict[str, Any]): self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name, self.logger) self.engine = ElevenLabsEngine(logger=self.logger, model_id=self.model_id, language_code=self.language_code, diarize=self.diarize) + def get_output_subdir(self) -> str: + return settings.output_subdirs.transcriptions + def _execute(self) -> None: video_files: List[Path] = [] for ext in self.SUPPORTED_VIDEO_EXTENSIONS: @@ -98,6 +87,17 @@ def _execute(self) -> None: ) multi_format_gen.generate() + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'videos' not in args: + raise ValueError('videos is required') + if 'output_dir' not in args: + raise ValueError('output_dir is required') + if 'series_name' not in args: + raise ValueError('series_name is required') + videos_path = Path(args['videos']) + if not videos_path.is_dir(): + raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") + @staticmethod def __create_segments_from_words(words: List[Dict]) -> List[Dict]: if not words: diff --git 
a/preprocessor/lib/transcription/engines/base_engine.py b/preprocessor/lib/transcription/engines/base_engine.py index fb39ba8ae..1c77a5dec 100644 --- a/preprocessor/lib/transcription/engines/base_engine.py +++ b/preprocessor/lib/transcription/engines/base_engine.py @@ -12,9 +12,9 @@ class TranscriptionEngine(ABC): @abstractmethod - def transcribe(self, audio_path: Path) -> Dict[str, Any]: + def get_name(self) -> str: ... @abstractmethod - def get_name(self) -> str: + def transcribe(self, audio_path: Path) -> Dict[str, Any]: ... diff --git a/preprocessor/lib/transcription/engines/elevenlabs_engine.py b/preprocessor/lib/transcription/engines/elevenlabs_engine.py index 30386d203..3309b8fa6 100644 --- a/preprocessor/lib/transcription/engines/elevenlabs_engine.py +++ b/preprocessor/lib/transcription/engines/elevenlabs_engine.py @@ -46,6 +46,9 @@ def __init__( ] self._logger: ErrorHandlingLogger = logger + def get_name(self) -> str: + return 'ElevenLabs' + def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[cyan]Transcribing with 11labs: {audio_path.name}[/cyan]') if not audio_path.exists(): @@ -55,46 +58,6 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[green]Transcription completed: {audio_path.name}[/green]') return self.__convert_to_unified_format(result) - def __submit_job(self, audio_path: Path) -> str: - try: - with open(audio_path, 'rb') as audio_file: - audio_data = audio_file.read() - submit_response = self.client.speech_to_text.convert( - file=audio_data, - model_id=self.model_id, - language_code=self.language_code, - tag_audio_events=True, - timestamps_granularity='character', - diarize=self.diarize, - use_multi_channel=False, - additional_formats=self.additional_formats, - webhook=True, - ) - self._logger.info(f'Job submitted. 
ID: {submit_response.transcription_id}') - return submit_response.transcription_id - except ApiError as e: - self._logger.error(f'API error during job submission: {e.body}') - raise - - def __poll_for_results(self, transcription_id: str): - self._logger.info(f'Polling for results (ID: {transcription_id})...') - max_attempts = settings.elevenlabs.max_attempts - attempt = 0 - while attempt < max_attempts: - try: - result = self.client.speech_to_text.transcripts.get(transcription_id=transcription_id) - self._logger.info('Transcription complete!') - return result - except ApiError as e: - if e.status_code == 404: - self._logger.info(' ...Processing. Waiting...') - time.sleep(self.polling_interval) - attempt += 1 - else: - self._logger.error(f'API error during polling: {e.body}') - raise - raise TimeoutError(f'Transcription timeout after {max_attempts} attempts') - @staticmethod def __convert_to_unified_format(result) -> Dict[str, Any]: unified_data = {'text': result.text, 'language_code': result.language_code, 'segments': []} @@ -118,5 +81,42 @@ def __convert_to_unified_format(result) -> Dict[str, Any]: break return unified_data - def get_name(self) -> str: - return 'ElevenLabs' + def __poll_for_results(self, transcription_id: str): + self._logger.info(f'Polling for results (ID: {transcription_id})...') + max_attempts = settings.elevenlabs.max_attempts + attempt = 0 + while attempt < max_attempts: + try: + result = self.client.speech_to_text.transcripts.get(transcription_id=transcription_id) + self._logger.info('Transcription complete!') + return result + except ApiError as e: + if e.status_code == 404: + self._logger.info(' ...Processing. 
Waiting...') + time.sleep(self.polling_interval) + attempt += 1 + else: + self._logger.error(f'API error during polling: {e.body}') + raise + raise TimeoutError(f'Transcription timeout after {max_attempts} attempts') + + def __submit_job(self, audio_path: Path) -> str: + try: + with open(audio_path, 'rb') as audio_file: + audio_data = audio_file.read() + submit_response = self.client.speech_to_text.convert( + file=audio_data, + model_id=self.model_id, + language_code=self.language_code, + tag_audio_events=True, + timestamps_granularity='character', + diarize=self.diarize, + use_multi_channel=False, + additional_formats=self.additional_formats, + webhook=True, + ) + self._logger.info(f'Job submitted. ID: {submit_response.transcription_id}') + return submit_response.transcription_id + except ApiError as e: + self._logger.error(f'API error during job submission: {e.body}') + raise diff --git a/preprocessor/lib/transcription/engines/whisper_engine.py b/preprocessor/lib/transcription/engines/whisper_engine.py index 9cb84427c..2badcf230 100644 --- a/preprocessor/lib/transcription/engines/whisper_engine.py +++ b/preprocessor/lib/transcription/engines/whisper_engine.py @@ -26,6 +26,18 @@ def __init__(self, model: str='large-v3-turbo', language: str='Polish', device: self.model = WhisperModel(model, device=device, compute_type=compute_type) console.print('[green]✓ Whisper model loaded[/green]') + def cleanup(self) -> None: + console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') + if hasattr(self, 'model'): + del self.model + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') + + def get_name(self) -> str: + return f'Whisper-{self.model_name}' + def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') if not audio_path.exists(): @@ -35,15 +47,3 @@ def transcribe(self, 
audio_path: Path) -> Dict[str, Any]: result = WhisperUtils.build_transcription_result(segments, language=info.language) console.print(f'[green]✓ Transcription completed: {audio_path.name}[/green]') return result - - def get_name(self) -> str: - return f'Whisper-{self.model_name}' - - def cleanup(self) -> None: - console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') - if hasattr(self, 'model'): - del self.model - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') diff --git a/preprocessor/lib/transcription/generators/base_generator.py b/preprocessor/lib/transcription/generators/base_generator.py index 95143e82d..48ae5e48a 100644 --- a/preprocessor/lib/transcription/generators/base_generator.py +++ b/preprocessor/lib/transcription/generators/base_generator.py @@ -30,9 +30,9 @@ def generate(self) -> None: self.logger.error(f'Failed to generate output for {json_file}: {e}') @abstractmethod - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + def _get_output_filename(self, json_file: Path) -> str: ... @abstractmethod - def _get_output_filename(self, json_file: Path) -> str: + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: ... diff --git a/preprocessor/lib/transcription/generators/json_generator.py b/preprocessor/lib/transcription/generators/json_generator.py index 3bd6fc34a..160c0f9d4 100644 --- a/preprocessor/lib/transcription/generators/json_generator.py +++ b/preprocessor/lib/transcription/generators/json_generator.py @@ -19,15 +19,6 @@ def __init__(self, format_type: Literal['full', 'simple', 'segmented'], *args, * super().__init__(*args, **kwargs) self.format_type = format_type - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - ... 
- - def _get_output_filename(self, json_file: Path) -> str: - if self.format_type == 'full': - return json_file.name - suffix = FILE_SUFFIXES[self.format_type] - return json_file.name.replace(FILE_EXTENSIONS['json'], f"{suffix}{FILE_EXTENSIONS['json']}") - def convert(self, data: Dict[str, Any]) -> Dict[str, Any]: if self.format_type == 'full': return self.convert_to_full_format(data) @@ -51,24 +42,33 @@ def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: return {'language_code': language_code, 'language_probability': 1.0, 'text': full_text, 'words': words} @staticmethod - def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: + def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: segments = data.get('segments', []) result_segments = [] for seg in segments: text = seg.get('text', '').strip() seg_words = seg.get('words', []) - speaker = 'speaker_unknown' - if seg_words: - speaker = seg_words[0].get('speaker_id', 'speaker_unknown') - result_segments.append({'speaker': speaker, 'text': text}) + result_segments.append({'text': text, 'words': TranscriptionUtils.convert_words_list(seg_words)}) return {'segments': result_segments} @staticmethod - def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: + def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: segments = data.get('segments', []) result_segments = [] for seg in segments: text = seg.get('text', '').strip() seg_words = seg.get('words', []) - result_segments.append({'text': text, 'words': TranscriptionUtils.convert_words_list(seg_words)}) + speaker = 'speaker_unknown' + if seg_words: + speaker = seg_words[0].get('speaker_id', 'speaker_unknown') + result_segments.append({'speaker': speaker, 'text': text}) return {'segments': result_segments} + + def _get_output_filename(self, json_file: Path) -> str: + if self.format_type == 'full': + return json_file.name + suffix = FILE_SUFFIXES[self.format_type] + return 
json_file.name.replace(FILE_EXTENSIONS['json'], f"{suffix}{FILE_EXTENSIONS['json']}") + + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + ... diff --git a/preprocessor/lib/transcription/generators/multi_format_generator.py b/preprocessor/lib/transcription/generators/multi_format_generator.py index 2c368a6c4..42e38928d 100644 --- a/preprocessor/lib/transcription/generators/multi_format_generator.py +++ b/preprocessor/lib/transcription/generators/multi_format_generator.py @@ -26,20 +26,12 @@ def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_base_path: self.series_name = series_name.lower() if series_name else 'unknown' self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, logger) - def __call__(self) -> None: - self.generate() - def generate(self) -> None: for transcription_file in self.jsons_dir.rglob('*.json'): self.__process_file(transcription_file) - def __load_transcription(self, transcription_file: Path) -> Optional[Dict[str, Any]]: - try: - with open(transcription_file, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception as e: - self.logger.error(f'Failed to load transcription {transcription_file}: {e}') - return None + def __call__(self) -> None: + self.generate() def __check_if_already_processed(self, episode_info) -> bool: filename = self.episode_manager.path_manager.build_filename( @@ -73,23 +65,6 @@ def __generate_all_formats( self.__generate_srt(transcription, episode_info) self.__generate_txt(transcription, episode_info) - def __process_file(self, transcription_file: Path) -> None: - try: - transcription = self.__load_transcription(transcription_file) - if not transcription: - return - episode_info = self.episode_manager.parse_filename(transcription_file) - if not episode_info: - self.logger.error( - f'Cannot extract episode info from {transcription_file.name}', - ) - return - if self.__check_if_already_processed(episode_info): - return - 
self.__generate_all_formats(transcription, episode_info) - except Exception as e: - self.logger.error(f'Error processing file {transcription_file}: {e}') - def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json') season_code = episode_info.season_code() @@ -154,3 +129,28 @@ def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: with open(output_file, 'w', encoding='utf-8') as f: f.write(txt_content) self.logger.info(f'Generated TXT: {output_file}') + + def __load_transcription(self, transcription_file: Path) -> Optional[Dict[str, Any]]: + try: + with open(transcription_file, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception as e: + self.logger.error(f'Failed to load transcription {transcription_file}: {e}') + return None + + def __process_file(self, transcription_file: Path) -> None: + try: + transcription = self.__load_transcription(transcription_file) + if not transcription: + return + episode_info = self.episode_manager.parse_filename(transcription_file) + if not episode_info: + self.logger.error( + f'Cannot extract episode info from {transcription_file.name}', + ) + return + if self.__check_if_already_processed(episode_info): + return + self.__generate_all_formats(transcription, episode_info) + except Exception as e: + self.logger.error(f'Error processing file {transcription_file}: {e}') diff --git a/preprocessor/lib/transcription/generators/srt_generator.py b/preprocessor/lib/transcription/generators/srt_generator.py index 1b7d23bc9..060e1ec60 100644 --- a/preprocessor/lib/transcription/generators/srt_generator.py +++ b/preprocessor/lib/transcription/generators/srt_generator.py @@ -10,12 +10,6 @@ class SrtGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - ... 
- - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['srt']) - def convert_to_srt_format(self, data: Dict[str, Any]) -> str: segments = data.get('segments', []) srt_lines = [] @@ -35,6 +29,12 @@ def convert_to_srt_format(self, data: Dict[str, Any]) -> str: index += 1 return '\n'.join(srt_lines) + def _get_output_filename(self, json_file: Path) -> str: + return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['srt']) + + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + ... + @staticmethod def __format_timestamp(seconds: float) -> str: hours = int(seconds // 3600) diff --git a/preprocessor/lib/transcription/generators/txt_generator.py b/preprocessor/lib/transcription/generators/txt_generator.py index aee720036..ae2a8a918 100644 --- a/preprocessor/lib/transcription/generators/txt_generator.py +++ b/preprocessor/lib/transcription/generators/txt_generator.py @@ -10,12 +10,6 @@ class TxtGenerator(BaseTranscriptionGenerator): - def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - ... - - def _get_output_filename(self, json_file: Path) -> str: - return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['txt']) - @staticmethod def convert_to_txt_format(data: Dict[str, Any]) -> str: segments = data.get('segments', []) @@ -25,3 +19,9 @@ def convert_to_txt_format(data: Dict[str, Any]) -> str: if text: text_parts.append(text) return ' '.join(text_parts) + + def _get_output_filename(self, json_file: Path) -> str: + return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['txt']) + + def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: + ... 
diff --git a/preprocessor/lib/transcription/processors/audio_normalizer.py b/preprocessor/lib/transcription/processors/audio_normalizer.py index aaf601f7e..2e7715973 100644 --- a/preprocessor/lib/transcription/processors/audio_normalizer.py +++ b/preprocessor/lib/transcription/processors/audio_normalizer.py @@ -29,19 +29,6 @@ def __call__(self) -> None: if video.suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS: self.__process_video(video) - def __process_video(self, video: Path) -> None: - try: - output_path = self.__output_dir / video.with_suffix('.wav').name - if output_path.exists(): - return - audio_idx = self.__get_best_audio_stream(video) - if audio_idx is None: - self.__logger.error(f"Cannot find audio stream for file: '{video}'") - return - self.__normalize(video=video, audio_idx=audio_idx, output=output_path) - except Exception as e: - self.__logger.error(f'Error processing video {video}: {e}') - def __get_best_audio_stream(self, video: Path) -> Optional[int]: cmd = ['ffprobe', '-v', 'error', '-select_streams', 'a', '-show_entries', 'stream=index,bit_rate', '-of', 'json', str(video)] result = subprocess.run(cmd, capture_output=True, text=True, check=True) @@ -62,3 +49,16 @@ def __normalize(self, video: Path, audio_idx: int, output: Path) -> None: self.__logger.info(f'Normalized audio: {tmp_output}') tmp_output.replace(output) self.__logger.info(f'Replaced original file with normalized audio: {video} -> {output}') + + def __process_video(self, video: Path) -> None: + try: + output_path = self.__output_dir / video.with_suffix('.wav').name + if output_path.exists(): + return + audio_idx = self.__get_best_audio_stream(video) + if audio_idx is None: + self.__logger.error(f"Cannot find audio stream for file: '{video}'") + return + self.__normalize(video=video, audio_idx=audio_idx, output=output_path) + except Exception as e: + self.__logger.error(f'Error processing video {video}: {e}') diff --git 
a/preprocessor/lib/transcription/processors/episode_info_processor.py b/preprocessor/lib/transcription/processors/episode_info_processor.py index f9ec102b0..98ec39059 100644 --- a/preprocessor/lib/transcription/processors/episode_info_processor.py +++ b/preprocessor/lib/transcription/processors/episode_info_processor.py @@ -27,6 +27,11 @@ def __call__(self) -> None: for transcription_file in self.__jsons_dir.rglob('*.json'): self.__process_file(transcription_file) + @staticmethod + def __load_transcription(path: Path) -> Dict[str, Any]: + with path.open('r', encoding='utf-8') as f: + return json.load(f) + def __process_file(self, transcription_file: Path) -> None: try: transcription = self.__load_transcription(transcription_file) @@ -39,10 +44,15 @@ def __process_file(self, transcription_file: Path) -> None: except Exception as e: self.__logger.error(f'Error processing file {transcription_file}: {e}') - @staticmethod - def __load_transcription(path: Path) -> Dict[str, Any]: - with path.open('r', encoding='utf-8') as f: - return json.load(f) + def __rename_original_file(self, original_path: Path, new_name: str) -> None: + new_src = original_path.parent / new_name + if original_path.name == new_name: + self.__logger.info(f'File {original_path} already has correct name.') + elif new_src.exists(): + self.__logger.error(f'Cannot rename {original_path} -> {new_src}, file already exists!') + else: + original_path.rename(new_src) + self.__logger.info(f'Renamed source transcription file: {original_path} -> {new_src}') def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: new_json_name = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') @@ -54,13 +64,3 @@ def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> T json.dump(result, f, ensure_ascii=False, indent=4) self.__logger.info(f'Created episode info {output_path}.') return (output_path, new_json_name) - - def 
__rename_original_file(self, original_path: Path, new_name: str) -> None: - new_src = original_path.parent / new_name - if original_path.name == new_name: - self.__logger.info(f'File {original_path} already has correct name.') - elif new_src.exists(): - self.__logger.error(f'Cannot rename {original_path} -> {new_src}, file already exists!') - else: - original_path.rename(new_src) - self.__logger.info(f'Renamed source transcription file: {original_path} -> {new_src}') diff --git a/preprocessor/lib/transcription/processors/normalized_audio_processor.py b/preprocessor/lib/transcription/processors/normalized_audio_processor.py index d3d77395c..e46544421 100644 --- a/preprocessor/lib/transcription/processors/normalized_audio_processor.py +++ b/preprocessor/lib/transcription/processors/normalized_audio_processor.py @@ -46,6 +46,15 @@ def __init__( compute_type=compute_type, ) + def cleanup(self) -> None: + self.__logger.info('Unloading Whisper model and clearing GPU memory...') + if hasattr(self, '_NormalizedAudioProcessor__whisper_model'): + del self.__whisper_model + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + self.__logger.info('Whisper model unloaded, GPU memory cleared') + def __call__(self) -> None: if self.__audio_files is not None: for audio in self.__audio_files: @@ -81,12 +90,3 @@ def __process_normalized_audio(self, normalized_audio: Path) -> None: self.__logger.info(f'Processed: {normalized_audio}') except Exception as e: self.__logger.error(f'Error processing file {normalized_audio}: {e}') - - def cleanup(self) -> None: - self.__logger.info('Unloading Whisper model and clearing GPU memory...') - if hasattr(self, '_NormalizedAudioProcessor__whisper_model'): - del self.__whisper_model - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - self.__logger.info('Whisper model unloaded, GPU memory cleared') diff --git a/preprocessor/lib/transcription/processors/sound_separator.py 
b/preprocessor/lib/transcription/processors/sound_separator.py index ad53d49b4..448909fc8 100644 --- a/preprocessor/lib/transcription/processors/sound_separator.py +++ b/preprocessor/lib/transcription/processors/sound_separator.py @@ -37,24 +37,9 @@ def __init__(self, args: Dict[str, Any]) -> None: episodes_info_json = self._args.get('episodes_info_json') self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, self.logger) - def _validate_args(self, args: Dict[str, Any]) -> None: - ... - def get_output_subdir(self) -> str: return settings.output_subdirs.transcriptions - def _get_processing_items(self) -> List[ProcessingItem]: - segmented_files = list(self.transcription_dir.rglob('**/raw/*_segmented.json')) - items = [] - for trans_file in segmented_files: - episode_info = self.episode_manager.parse_filename(trans_file) - if not episode_info: - self.logger.warning(f'Cannot parse episode info from {trans_file.name}') - continue - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - items.append(ProcessingItem(episode_id=episode_id, input_path=trans_file, metadata={'episode_info': episode_info})) - return items - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: base_name = item.input_path.stem.replace(FILE_SUFFIXES['segmented'], '') episode_dir = item.input_path.parent.parent @@ -79,6 +64,21 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: OutputSpec(path=sound_srt, required=True), ] + def _get_processing_items(self) -> List[ProcessingItem]: + segmented_files = list(self.transcription_dir.rglob('**/raw/*_segmented.json')) + items = [] + for trans_file in segmented_files: + episode_info = self.episode_manager.parse_filename(trans_file) + if not episode_info: + self.logger.warning(f'Cannot parse episode info from {trans_file.name}') + continue + episode_id = EpisodeManager.get_episode_id_for_state(episode_info) + items.append(ProcessingItem(episode_id=episode_id, 
input_path=trans_file, metadata={'episode_info': episode_info})) + return items + + def _get_progress_description(self) -> str: + return 'Separating sound events from dialogues' + def _process_item( # pylint: disable=too-many-locals self, item, missing_outputs: List, ) -> None: @@ -129,56 +129,8 @@ def _process_item( # pylint: disable=too-many-locals self.__generate_srt_files(dialogue_segments, sound_event_segments, clean_srt, sound_srt) self.logger.info(f'Separated {item.episode_id}: {len(dialogue_segments)} dialogue, {len(sound_event_segments)} sound events') - - def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - words = segment.get('words', []) - dialogue_sequences = [] - sound_sequences = [] - current_type = None - current_words = [] - for word in words: - if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: - if current_words: - current_words.append(word) - continue - is_sound = is_sound_event(word) - word_type = 'sound' if is_sound else 'dialogue' - if word_type != current_type: - if current_words: - self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) - current_type = word_type - current_words = [word] - else: - current_words.append(word) - if current_words: - self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) - return (dialogue_sequences, sound_sequences) - - @staticmethod - def __finalize_sequence( - seq_type: str, - words: List[Dict], - dialogue_sequences: List[Dict], - sound_sequences: List[Dict], - original_segment: Dict[str, Any], - ) -> None: - if not words: - return - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - return - text = ''.join([w.get('text', '') for w in words]) - text = re.sub('\\s+', ' ', text).strip() - start_time = min((w.get('start') or 0 for w in words)) - end_time = max((w.get('end') or 0 for w in words)) 
- new_segment = {'text': text, 'start': start_time, 'end': end_time, 'words': words} - for key in original_segment: - if key not in ['text', 'start', 'end', 'words']: - new_segment[key] = original_segment[key] - if seq_type == 'dialogue': - dialogue_sequences.append(new_segment) - else: - sound_sequences.append(new_segment) + def _validate_args(self, args: Dict[str, Any]) -> None: + ... @staticmethod def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: @@ -198,18 +150,6 @@ def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: cleaned['end'] = max(ends) return cleaned - @staticmethod - def __enrich_sound_event(segment: Dict[str, Any]) -> Dict[str, Any]: - enriched = segment.copy() - enriched['sound_type'] = 'sound' - return enriched - - @staticmethod - def __renumber_segments(segments: List[Dict]) -> List[Dict]: - for i, segment in enumerate(segments): - segment['id'] = i - return segments - @staticmethod def __convert_to_simple_format(segments: List[Dict]) -> List[Dict]: simple_segments = [] @@ -220,20 +160,37 @@ def __convert_to_simple_format(segments: List[Dict]) -> List[Dict]: simple_segments.append(simple_seg) return simple_segments - def __generate_txt_files(self, original_txt: Path, clean_txt: Path, sound_txt: Path) -> None: - if not original_txt.exists(): - self.logger.warning(f'Original TXT file not found: {original_txt}') + @staticmethod + def __enrich_sound_event(segment: Dict[str, Any]) -> Dict[str, Any]: + enriched = segment.copy() + enriched['sound_type'] = 'sound' + return enriched + + @staticmethod + def __finalize_sequence( + seq_type: str, + words: List[Dict], + dialogue_sequences: List[Dict], + sound_sequences: List[Dict], + original_segment: Dict[str, Any], + ) -> None: + if not words: return - with open(original_txt, 'r', encoding='utf-8') as f: - original_content = f.read() - clean_content = re.sub('\\([^)]*\\)', '', original_content) - clean_content = re.sub('\\s+', ' ', clean_content).strip() - sound_matches = 
re.findall('\\([^)]*\\)', original_content) - sound_content = ' '.join(sound_matches) - with open(clean_txt, 'w', encoding='utf-8') as f: - f.write(clean_content) - with open(sound_txt, 'w', encoding='utf-8') as f: - f.write(sound_content) + non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] + if not non_spacing_words: + return + text = ''.join([w.get('text', '') for w in words]) + text = re.sub('\\s+', ' ', text).strip() + start_time = min((w.get('start') or 0 for w in words)) + end_time = max((w.get('end') or 0 for w in words)) + new_segment = {'text': text, 'start': start_time, 'end': end_time, 'words': words} + for key in original_segment: + if key not in ['text', 'start', 'end', 'words']: + new_segment[key] = original_segment[key] + if seq_type == 'dialogue': + dialogue_sequences.append(new_segment) + else: + sound_sequences.append(new_segment) @staticmethod def __generate_srt_files(dialogue_segments: List[Dict], sound_segments: List[Dict], clean_srt: Path, sound_srt: Path) -> None: @@ -263,5 +220,48 @@ def __write_srt(segments: List[Dict], output_path: Path) -> None: __write_srt(dialogue_segments, clean_srt) __write_srt(sound_segments, sound_srt) - def _get_progress_description(self) -> str: - return 'Separating sound events from dialogues' + def __generate_txt_files(self, original_txt: Path, clean_txt: Path, sound_txt: Path) -> None: + if not original_txt.exists(): + self.logger.warning(f'Original TXT file not found: {original_txt}') + return + with open(original_txt, 'r', encoding='utf-8') as f: + original_content = f.read() + clean_content = re.sub('\\([^)]*\\)', '', original_content) + clean_content = re.sub('\\s+', ' ', clean_content).strip() + sound_matches = re.findall('\\([^)]*\\)', original_content) + sound_content = ' '.join(sound_matches) + with open(clean_txt, 'w', encoding='utf-8') as f: + f.write(clean_content) + with open(sound_txt, 'w', encoding='utf-8') as f: + f.write(sound_content) + + @staticmethod + 
def __renumber_segments(segments: List[Dict]) -> List[Dict]: + for i, segment in enumerate(segments): + segment['id'] = i + return segments + + + def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + words = segment.get('words', []) + dialogue_sequences = [] + sound_sequences = [] + current_type = None + current_words = [] + for word in words: + if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: + if current_words: + current_words.append(word) + continue + is_sound = is_sound_event(word) + word_type = 'sound' if is_sound else 'dialogue' + if word_type != current_type: + if current_words: + self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) + current_type = word_type + current_words = [word] + else: + current_words.append(word) + if current_words: + self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) + return (dialogue_sequences, sound_sequences) diff --git a/preprocessor/lib/transcription/processors/unicode_fixer.py b/preprocessor/lib/transcription/processors/unicode_fixer.py index cc15522a1..18d3b81d8 100644 --- a/preprocessor/lib/transcription/processors/unicode_fixer.py +++ b/preprocessor/lib/transcription/processors/unicode_fixer.py @@ -31,12 +31,12 @@ def __init__(self, args: Dict[str, Any]) -> None: episodes_info_json = self._args.get('episodes_info_json') self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, self.logger) - def _validate_args(self, args: Dict[str, Any]) -> None: - ... 
- def get_output_subdir(self) -> str: return settings.output_subdirs.transcriptions + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + return [OutputSpec(path=item.input_path, required=True)] + def _get_processing_items(self) -> List[ProcessingItem]: transcription_files = list(self.transcription_jsons.rglob('*.json')) return [ @@ -48,9 +48,6 @@ def _get_processing_items(self) -> List[ProcessingItem]: for i, trans_file in enumerate(transcription_files) ] - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - return [OutputSpec(path=item.input_path, required=True)] - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: trans_file = item.metadata['file'] try: @@ -61,3 +58,6 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) self.logger.debug(f'No unicode escapes found in: {trans_file.name}') except Exception as e: self.logger.error(f'Error fixing unicode in {trans_file.name}: {e}') + + def _validate_args(self, args: Dict[str, Any]) -> None: + ... 
diff --git a/preprocessor/lib/transcription/utils.py b/preprocessor/lib/transcription/utils.py index 994d66158..fec35909d 100644 --- a/preprocessor/lib/transcription/utils.py +++ b/preprocessor/lib/transcription/utils.py @@ -10,13 +10,17 @@ class TranscriptionUtils: @staticmethod - def __fix_unicode(file_path: Path) -> None: # pylint: disable=unused-private-member - if not file_path.exists(): - return - with open(file_path, 'r', encoding='utf-8') as f: - data: Dict[str, Any] = json.load(f) - with open(file_path, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=2) + def convert_words_list(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + return [ + { + 'word': word.get('text', word.get('word', '')), + 'start': word.get('start', 0.0), + 'end': word.get('end', 0.0), + 'probability': word.get('probability', word.get('confidence', 1.0)), + 'speaker_id': word.get('speaker_id', 'speaker_unknown'), + } + for word in words + ] @staticmethod def fix_transcription_file_unicode(file_path: Path) -> bool: @@ -34,17 +38,13 @@ def fix_transcription_file_unicode(file_path: Path) -> bool: return False @staticmethod - def convert_words_list(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - return [ - { - 'word': word.get('text', word.get('word', '')), - 'start': word.get('start', 0.0), - 'end': word.get('end', 0.0), - 'probability': word.get('probability', word.get('confidence', 1.0)), - 'speaker_id': word.get('speaker_id', 'speaker_unknown'), - } - for word in words - ] + def __fix_unicode(file_path: Path) -> None: # pylint: disable=unused-private-member + if not file_path.exists(): + return + with open(file_path, 'r', encoding='utf-8') as f: + data: Dict[str, Any] = json.load(f) + with open(file_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=2) class WhisperUtils: LANGUAGE_MAP: Dict[str, str] = { @@ -55,6 +55,17 @@ class WhisperUtils: 'spanish': 'es', } + @staticmethod + def 
build_transcription_result(segments: Any, language: str=None) -> Dict[str, Any]: + result: Dict[str, Any] = {'text': '', 'segments': []} + if language: + result['language'] = language + for segment in segments: + segment_dict = WhisperUtils.__process_segment(segment) + result['segments'].append(segment_dict) + result['text'] += segment.text + return result + @staticmethod def get_language_code(language: str) -> str: return WhisperUtils.LANGUAGE_MAP.get(language.lower(), language.lower()) @@ -82,14 +93,3 @@ def __process_segment(segment: Any) -> Dict[str, Any]: 'no_speech_prob': segment.no_speech_prob, 'words': words, } - - @staticmethod - def build_transcription_result(segments: Any, language: str=None) -> Dict[str, Any]: - result: Dict[str, Any] = {'text': '', 'segments': []} - if language: - result['language'] = language - for segment in segments: - segment_dict = WhisperUtils.__process_segment(segment) - result['segments'].append(segment_dict) - result['text'] += segment.text - return result diff --git a/preprocessor/lib/transcription/whisper.py b/preprocessor/lib/transcription/whisper.py index e0889d7f5..9c0bc6c0d 100644 --- a/preprocessor/lib/transcription/whisper.py +++ b/preprocessor/lib/transcription/whisper.py @@ -22,16 +22,14 @@ def __init__(self, model: str='large-v3-turbo', language: str='pl', device: str= self.temperature: float = temperature self._model: Optional[WhisperModel] = None - def _load_model(self) -> WhisperModel: + def cleanup(self) -> None: + console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') if self._model is not None: - return self._model - if self.device != 'cuda': - raise ValueError(f'Only GPU (cuda) is supported, got device={self.device}') - compute_type = 'float16' - console.print(f'[cyan]Loading Whisper model: {self.model_name} on {self.device} with compute_type={compute_type}[/cyan]') - self._model = WhisperModel(self.model_name, device=self.device, compute_type=compute_type) - console.print('[green]✓ 
Whisper model loaded[/green]') - return self._model + del self._model + self._model = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') @@ -51,11 +49,13 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[green]✓ Transcription completed: {audio_path.name}[/green]') return result - def cleanup(self) -> None: - console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') + def _load_model(self) -> WhisperModel: if self._model is not None: - del self._model - self._model = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') + return self._model + if self.device != 'cuda': + raise ValueError(f'Only GPU (cuda) is supported, got device={self.device}') + compute_type = 'float16' + console.print(f'[cyan]Loading Whisper model: {self.model_name} on {self.device} with compute_type={compute_type}[/cyan]') + self._model = WhisperModel(self.model_name, device=self.device, compute_type=compute_type) + console.print('[green]✓ Whisper model loaded[/green]') + return self._model diff --git a/preprocessor/lib/ui/console.py b/preprocessor/lib/ui/console.py index 3be9d1fd0..1880c3a92 100644 --- a/preprocessor/lib/ui/console.py +++ b/preprocessor/lib/ui/console.py @@ -53,6 +53,12 @@ def advance(self, task_id: int, advance: int=1): self.__print_progress(task_id) task['last_print'] = current_time + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + pass + def __print_progress(self, task_id: int): task = self.tasks[task_id] completed = task['completed'] @@ -82,12 +88,6 @@ def __print_progress(self, task_id: int): highlight=False, ) - def __enter__(self): - return self - - def __exit__(self, 
exc_type, exc_val, exc_tb): - pass - def create_progress() -> SimpleProgress: return SimpleProgress() diff --git a/preprocessor/lib/validation/base_result.py b/preprocessor/lib/validation/base_result.py index 662f3059b..2719268ae 100644 --- a/preprocessor/lib/validation/base_result.py +++ b/preprocessor/lib/validation/base_result.py @@ -24,8 +24,8 @@ def status(self) -> str: @dataclass class BaseValidationResult(ValidationStatusMixin): errors: List[str] = field(default_factory=list) - warnings: List[str] = field(default_factory=list) stats: Dict[str, Any] = field(default_factory=dict) + warnings: List[str] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return {'status': self.status, 'errors': self.errors, 'warnings': self.warnings, 'stats': self.stats} diff --git a/preprocessor/lib/validation/file_validators.py b/preprocessor/lib/validation/file_validators.py index 0c8f3a537..206122042 100644 --- a/preprocessor/lib/validation/file_validators.py +++ b/preprocessor/lib/validation/file_validators.py @@ -11,11 +11,11 @@ from PIL import Image -from preprocessor.config.constants import ValidationMetadataKeys from preprocessor.config.types.keys import ( FfprobeFormatKeys, FfprobeKeys, FfprobeStreamKeys, + ValidationMetadataKeys, ) @@ -28,10 +28,27 @@ class ValidationResult: class FileValidator: @staticmethod - def __check_file_exists(path: Path) -> Optional[ValidationResult]: - if not path.exists(): - return ValidationResult(is_valid=False, error_message=f'File does not exist: {path}') - return None + def validate_image_file(path: Path) -> ValidationResult: + if (error := FileValidator.__check_file_exists(path)): + return error + try: + with Image.open(path) as img: + img.verify() + with Image.open(path) as img: + width, height = img.size + format_type = img.format + size_mb = path.stat().st_size / (1024 * 1024) + return ValidationResult( + is_valid=True, + metadata={ + ValidationMetadataKeys.WIDTH: width, + ValidationMetadataKeys.HEIGHT: height, + 
ValidationMetadataKeys.FORMAT: format_type, + ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), + }, + ) + except Exception as e: + return ValidationResult(is_valid=False, error_message=f'Invalid image: {e}') @staticmethod def validate_json_file(path: Path) -> ValidationResult: @@ -78,29 +95,6 @@ def validate_jsonl_file(path: Path) -> ValidationResult: except Exception as e: return ValidationResult(is_valid=False, error_message=f'Error reading file: {e}') - @staticmethod - def validate_image_file(path: Path) -> ValidationResult: - if (error := FileValidator.__check_file_exists(path)): - return error - try: - with Image.open(path) as img: - img.verify() - with Image.open(path) as img: - width, height = img.size - format_type = img.format - size_mb = path.stat().st_size / (1024 * 1024) - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.WIDTH: width, - ValidationMetadataKeys.HEIGHT: height, - ValidationMetadataKeys.FORMAT: format_type, - ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), - }, - ) - except Exception as e: - return ValidationResult(is_valid=False, error_message=f'Invalid image: {e}') - @staticmethod def validate_video_file(path: Path) -> ValidationResult: if (error := FileValidator.__check_file_exists(path)): @@ -140,6 +134,12 @@ def validate_video_file(path: Path) -> ValidationResult: except Exception as e: return ValidationResult(is_valid=False, error_message=f'Error validating video: {e}') + @staticmethod + def __check_file_exists(path: Path) -> Optional[ValidationResult]: + if not path.exists(): + return ValidationResult(is_valid=False, error_message=f'File does not exist: {path}') + return None + @staticmethod def __validate_archive_file(path: Path) -> ValidationResult: # pylint: disable=unused-private-member if (error := FileValidator.__check_file_exists(path)): diff --git a/preprocessor/lib/video/emotion_utils.py b/preprocessor/lib/video/emotion_utils.py index 832571f2c..9c833a459 100644 --- 
a/preprocessor/lib/video/emotion_utils.py +++ b/preprocessor/lib/video/emotion_utils.py @@ -15,32 +15,6 @@ class EmotionDetector: - @staticmethod - def __init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: # pylint: disable=unused-private-member - model_name = settings.emotion_detection.model_name - if logger: - logger.info(f'Loading HSEmotion model: {model_name}...') - try: - fer = HSEmotionRecognizer(model_name=model_name) - if logger: - logger.info(f'HSEmotion model loaded: {model_name}') - return fer - except Exception as e: - raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e - - @staticmethod - def __process_emotion_result( - emotion: str, - scores: np.ndarray, - ) -> Tuple[str, float, Dict[str, float]]: - emotion_scores = { - EMOTION_LABELS[i]: float(scores[i]) - for i in range(len(EMOTION_LABELS)) - } - confidence = float(max(scores)) - dominant_emotion = emotion.lower() - return (dominant_emotion, confidence, emotion_scores) - @staticmethod def detect( face_image: np.ndarray, @@ -110,3 +84,29 @@ def __detect_batch( # pylint: disable=unused-private-member except Exception: results.append(None) return results + + @staticmethod + def __init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: # pylint: disable=unused-private-member + model_name = settings.emotion_detection.model_name + if logger: + logger.info(f'Loading HSEmotion model: {model_name}...') + try: + fer = HSEmotionRecognizer(model_name=model_name) + if logger: + logger.info(f'HSEmotion model loaded: {model_name}') + return fer + except Exception as e: + raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e + + @staticmethod + def __process_emotion_result( + emotion: str, + scores: np.ndarray, + ) -> Tuple[str, float, Dict[str, float]]: + emotion_scores = { + EMOTION_LABELS[i]: float(scores[i]) + for i in range(len(EMOTION_LABELS)) + } + confidence = float(max(scores)) + dominant_emotion = 
emotion.lower() + return (dominant_emotion, confidence, emotion_scores) diff --git a/preprocessor/lib/video/frame_utils.py b/preprocessor/lib/video/frame_utils.py index d45523646..ef58db552 100644 --- a/preprocessor/lib/video/frame_utils.py +++ b/preprocessor/lib/video/frame_utils.py @@ -11,6 +11,12 @@ class FrameLoader: + @staticmethod + def load_from_requests(frames_dir: Path, frame_requests: List[Dict[str, Any]], convert_rgb: bool=False, num_workers: int=4) -> List[Image.Image]: + with ThreadPoolExecutor(max_workers=num_workers) as executor: + images = list(executor.map(lambda req: FrameLoader.__load_single(frames_dir, req, convert_rgb), frame_requests)) + return images + @staticmethod def __load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: if 'frame_path' in request: @@ -24,9 +30,3 @@ def __load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) img = img.convert('RGB') return img return Image.new('RGB', (1, 1)) - - @staticmethod - def load_from_requests(frames_dir: Path, frame_requests: List[Dict[str, Any]], convert_rgb: bool=False, num_workers: int=4) -> List[Image.Image]: - with ThreadPoolExecutor(max_workers=num_workers) as executor: - images = list(executor.map(lambda req: FrameLoader.__load_single(frames_dir, req, convert_rgb), frame_requests)) - return images diff --git a/preprocessor/lib/video/image_hasher.py b/preprocessor/lib/video/image_hasher.py index f121ffdb4..5255b1d39 100644 --- a/preprocessor/lib/video/image_hasher.py +++ b/preprocessor/lib/video/image_hasher.py @@ -16,6 +16,13 @@ def __init__(self) -> None: if torch.cuda.is_available(): self.model = self.model.cuda() + def cleanup(self) -> None: + if self.model is not None: + del self.model + self.model = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() + def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable=unused-private-member if self.model is None: raise RuntimeError('Model not initialized or 
already cleaned up') @@ -26,11 +33,4 @@ def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable=u hash_bits = (features > features.median()).int() hash_val = int(''.join([str(bit) for bit in hash_bits.tolist()[:64]]), 2) return hash_val - - def cleanup(self) -> None: - if self.model is not None: - del self.model - self.model = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() __all__ = ['PerceptualHasher'] diff --git a/preprocessor/modules/audio/extraction.py b/preprocessor/modules/audio/extraction.py index fe0d831b5..50f21c367 100644 --- a/preprocessor/modules/audio/extraction.py +++ b/preprocessor/modules/audio/extraction.py @@ -13,10 +13,6 @@ class AudioExtractionStep(PipelineStep[SourceVideo, AudioArtifact, AudioExtractionConfig]): - @property - def name(self) -> str: - return 'audio_extraction' - def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioArtifact: episode_code = input_data.episode_info.episode_code() output_filename: str = ( @@ -61,3 +57,7 @@ def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioAr path=output_path, format=self.config.format, ) + + @property + def name(self) -> str: + return 'audio_extraction' diff --git a/preprocessor/modules/audio/separation.py b/preprocessor/modules/audio/separation.py index 10d382b87..8748073d6 100644 --- a/preprocessor/modules/audio/separation.py +++ b/preprocessor/modules/audio/separation.py @@ -29,10 +29,6 @@ class SoundSeparationStep(PipelineStep[TranscriptionData, TranscriptionData, SoundSeparationConfig]): - @property - def name(self) -> str: - return 'sound_separation' - def execute( # pylint: disable=too-many-locals self, input_data: TranscriptionData, @@ -117,49 +113,23 @@ def execute( # pylint: disable=too-many-locals format='json', ) - @staticmethod - def __is_sound_event_text(text: str) -> bool: # pylint: disable=unused-private-member - return bool(re.match(r'^\(.*\)$', text.strip())) + @property + def name(self) -> 
str: + return 'sound_separation' - def __split_mixed_segment( - self, - segment: Dict[str, Any], - ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - words = segment.get(WordKeys.WORDS, []) - dialogue_parts = [] - sound_parts = [] - current_type = None - current_words = [] - current_start = None - for word in words: - word_type = 'sound' if is_sound_event(word) else 'dialogue' - if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: - if current_words: - current_words.append(word) - continue - if word_type != current_type: - if current_words and current_type: - self.__finalize_sequence( - current_type, - current_words, - current_start, - dialogue_parts, - sound_parts, - ) - current_type = word_type - current_words = [word] - current_start = word.get(WordKeys.START) - else: - current_words.append(word) - if current_words and current_type: - self.__finalize_sequence( - current_type, - current_words, - current_start, - dialogue_parts, - sound_parts, - ) - return (dialogue_parts, sound_parts) + @staticmethod + def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: + cleaned = segment.copy() + text = cleaned.get('text', '') + text = re.sub('\\s+', ' ', text) + cleaned['text'] = text.strip() + words = cleaned.get(WordKeys.WORDS, []) + if words: + non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] + if non_spacing: + cleaned[WordKeys.START] = min((w.get(WordKeys.START, 0) for w in non_spacing)) + cleaned[WordKeys.END] = max((w.get(WordKeys.END, 0) for w in non_spacing)) + return cleaned @staticmethod def __finalize_sequence( @@ -188,24 +158,25 @@ def __finalize_sequence( dialogue_parts.append(new_segment) @staticmethod - def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: - cleaned = segment.copy() - text = cleaned.get('text', '') - text = re.sub('\\s+', ' ', text) - cleaned['text'] = text.strip() - words = cleaned.get(WordKeys.WORDS, []) - if words: - non_spacing = [w for w in words if 
w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if non_spacing: - cleaned[WordKeys.START] = min((w.get(WordKeys.START, 0) for w in non_spacing)) - cleaned[WordKeys.END] = max((w.get(WordKeys.END, 0) for w in non_spacing)) - return cleaned + def __format_srt_time(seconds: float) -> str: + hours = int(seconds // 3600) + minutes = int(seconds % 3600 // 60) + secs = int(seconds % 60) + millis = int(seconds % 1 * 1000) + return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' @staticmethod - def __renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - for i, seg in enumerate(segments): - seg['id'] = i - return segments + def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: + with open(srt_path, 'w', encoding='utf-8') as f: + for idx, seg in enumerate(segments, 1): + start = seg.get('start', 0) + end = seg.get('end', 0) + text = seg.get('text', '').strip() + start_time = SoundSeparationStep.__format_srt_time(start) + end_time = SoundSeparationStep.__format_srt_time(end) + f.write(f'{idx}\n') + f.write(f'{start_time} --> {end_time}\n') + f.write(f'{text}\n\n') @staticmethod def __generate_txt_file(json_path: Path, txt_path: Path) -> None: @@ -223,22 +194,51 @@ def __generate_txt_file(json_path: Path, txt_path: Path) -> None: f.write(' '.join(text_lines)) @staticmethod - def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: - with open(srt_path, 'w', encoding='utf-8') as f: - for idx, seg in enumerate(segments, 1): - start = seg.get('start', 0) - end = seg.get('end', 0) - text = seg.get('text', '').strip() - start_time = SoundSeparationStep.__format_srt_time(start) - end_time = SoundSeparationStep.__format_srt_time(end) - f.write(f'{idx}\n') - f.write(f'{start_time} --> {end_time}\n') - f.write(f'{text}\n\n') + def __is_sound_event_text(text: str) -> bool: # pylint: disable=unused-private-member + return bool(re.match(r'^\(.*\)$', text.strip())) @staticmethod - def __format_srt_time(seconds: 
float) -> str: - hours = int(seconds // 3600) - minutes = int(seconds % 3600 // 60) - secs = int(seconds % 60) - millis = int(seconds % 1 * 1000) - return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' + def __renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + for i, seg in enumerate(segments): + seg['id'] = i + return segments + + def __split_mixed_segment( + self, + segment: Dict[str, Any], + ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + words = segment.get(WordKeys.WORDS, []) + dialogue_parts = [] + sound_parts = [] + current_type = None + current_words = [] + current_start = None + for word in words: + word_type = 'sound' if is_sound_event(word) else 'dialogue' + if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: + if current_words: + current_words.append(word) + continue + if word_type != current_type: + if current_words and current_type: + self.__finalize_sequence( + current_type, + current_words, + current_start, + dialogue_parts, + sound_parts, + ) + current_type = word_type + current_words = [word] + current_start = word.get(WordKeys.START) + else: + current_words.append(word) + if current_words and current_type: + self.__finalize_sequence( + current_type, + current_words, + current_start, + dialogue_parts, + sound_parts, + ) + return (dialogue_parts, sound_parts) diff --git a/preprocessor/modules/packaging/archives.py b/preprocessor/modules/packaging/archives.py index 02f259dac..c77a8f642 100644 --- a/preprocessor/modules/packaging/archives.py +++ b/preprocessor/modules/packaging/archives.py @@ -11,10 +11,6 @@ class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig]): - @property - def name(self) -> str: - return 'archive_generation' - def execute(self, input_data: ProcessedEpisode, context: ExecutionContext) -> ArchiveArtifact: output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' output_path: Path = 
context.get_output_path(input_data.episode_info, 'archives', output_filename) @@ -26,3 +22,7 @@ def execute(self, input_data: ProcessedEpisode, context: ExecutionContext) -> Ar context.mark_step_started(self.name, input_data.episode_id) context.mark_step_completed(self.name, input_data.episode_id) return ArchiveArtifact(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + + @property + def name(self) -> str: + return 'archive_generation' diff --git a/preprocessor/modules/scraping/base_scraper.py b/preprocessor/modules/scraping/base_scraper.py index 5b92bdf04..7eb2b6362 100644 --- a/preprocessor/modules/scraping/base_scraper.py +++ b/preprocessor/modules/scraping/base_scraper.py @@ -34,11 +34,8 @@ def __init__(self, args: Dict[str, Any], error_exit_code: int=7): self.parser_mode = ParserMode(parser_mode_str) self.llm: Optional[LLMProvider] = None - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'urls' not in args or not args['urls']: - raise ValueError('At least one URL is required') - if 'output_file' not in args: - raise ValueError('output_file is required') + def get_output_subdir(self) -> str: + return "" def _execute(self) -> None: self.llm = LLMProvider(parser_mode=self.parser_mode) @@ -53,6 +50,21 @@ def _execute(self) -> None: except Exception as e: self.logger.error(f'LLM processing failed: {e}') + @abstractmethod + def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: + pass + + def _save_result(self, result: Dict[str, Any]) -> None: + self.output_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2, ensure_ascii=False) + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'urls' not in args or not args['urls']: + raise ValueError('At least one URL is required') + if 'output_file' not in args: + raise ValueError('output_file is required') + def __scrape_all_urls(self) -> List[Dict[str, 
Any]]: scraped_pages = [] try: @@ -80,15 +92,3 @@ def __scrape_url(self, url: str) -> Optional[str]: return ScraperCrawl4AI.scrape(url, save_markdown=True, output_dir=settings.scraper.get_output_dir(self.series_name), logger=self.logger) self.logger.error(f'Unknown scraper method: {self.scraper_method}') return None - - def _save_result(self, result: Dict[str, Any]) -> None: - self.output_file.parent.mkdir(parents=True, exist_ok=True) - with open(self.output_file, 'w', encoding='utf-8') as f: - json.dump(result, f, indent=2, ensure_ascii=False) - - def get_output_subdir(self) -> str: - return "" - - @abstractmethod - def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - pass diff --git a/preprocessor/modules/scraping/base_scraper_step.py b/preprocessor/modules/scraping/base_scraper_step.py new file mode 100644 index 000000000..b4a56700e --- /dev/null +++ b/preprocessor/modules/scraping/base_scraper_step.py @@ -0,0 +1,75 @@ +from abc import ( + ABC, + abstractmethod, +) +from pathlib import Path +from typing import ( + Any, + Dict, + Optional, + TypeVar, +) + +from pydantic import BaseModel + +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext + +ConfigT = TypeVar("ConfigT", bound=BaseModel) + + +class BaseScraperStep(PipelineStep[SourceVideo, SourceVideo, ConfigT], ABC): + + def __init__(self, config: ConfigT) -> None: + super().__init__(config) + self._executed = False + + def execute( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> Optional[SourceVideo]: + if self._executed: + return input_data + + output_path = Path(self.config.output_file) # type: ignore[attr-defined] + + if output_path.exists() and not context.force_rerun: + context.logger.info(f"{self._get_metadata_type_name()} metadata already exists: {output_path}") + self._executed = True + return input_data + + urls = self.config.urls # type: 
ignore[attr-defined] + context.logger.info(f"Scraping {self._get_metadata_type_name().lower()} from {len(urls)} URLs") + + scraper_class = self._get_scraper_class() + scraper_args = self._build_scraper_args(output_path, context) + scraper = scraper_class(scraper_args) + + exit_code = scraper.work() + + if exit_code != 0: + raise RuntimeError(f"{self._get_metadata_type_name()} scraper failed with exit code {exit_code}") + + context.logger.info(f"{self._get_metadata_type_name()} metadata saved to: {output_path}") + + self._executed = True + return input_data + + @abstractmethod + def _get_scraper_class(self): + pass + + @abstractmethod + def _get_metadata_type_name(self) -> str: + pass + + def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> Dict[str, Any]: + base_args: Dict[str, Any] = { + "urls": self.config.urls, # type: ignore[attr-defined] + "output_file": output_path, + "headless": self.config.headless, # type: ignore[attr-defined] + "scraper_method": self.config.scraper_method, # type: ignore[attr-defined] + "parser_mode": self.config.parser_mode, # type: ignore[attr-defined] + "series_name": context.series_name, + } + return base_args diff --git a/preprocessor/modules/scraping/character_scraper_step.py b/preprocessor/modules/scraping/character_scraper_step.py index 6fc263bf2..08a43e462 100644 --- a/preprocessor/modules/scraping/character_scraper_step.py +++ b/preprocessor/modules/scraping/character_scraper_step.py @@ -1,56 +1,16 @@ -from pathlib import Path -from typing import Optional - from preprocessor.config.step_configs import CharacterScraperConfig -from preprocessor.core.artifacts import SourceVideo -from preprocessor.core.base_step import PipelineStep -from preprocessor.core.context import ExecutionContext +from preprocessor.modules.scraping.base_scraper_step import BaseScraperStep from preprocessor.modules.scraping.character_scraper import CharacterScraper -class CharacterScraperStep( - PipelineStep[SourceVideo, SourceVideo, 
CharacterScraperConfig], -): - def __init__(self, config: CharacterScraperConfig) -> None: - super().__init__(config) - self._executed = False +class CharacterScraperStep(BaseScraperStep[CharacterScraperConfig]): + + def _get_scraper_class(self): + return CharacterScraper + + def _get_metadata_type_name(self) -> str: + return "Characters" @property def name(self) -> str: return "scrape_characters" - - def execute( - self, input_data: SourceVideo, context: ExecutionContext, - ) -> Optional[SourceVideo]: - if self._executed: - return input_data - - output_path = Path(self.config.output_file) - - if output_path.exists() and not context.force_rerun: - context.logger.info(f"Characters metadata already exists: {output_path}") - self._executed = True - return input_data - - context.logger.info(f"Scraping characters from {len(self.config.urls)} URLs") - - scraper = CharacterScraper( - { - "urls": self.config.urls, - "output_file": output_path, - "headless": self.config.headless, - "scraper_method": self.config.scraper_method, - "parser_mode": self.config.parser_mode, - "series_name": context.series_name, - }, - ) - - exit_code = scraper.work() - - if exit_code != 0: - raise RuntimeError(f"Character scraper failed with exit code {exit_code}") - - context.logger.info(f"Characters metadata saved to: {output_path}") - - self._executed = True - return input_data diff --git a/preprocessor/modules/scraping/episode_scraper.py b/preprocessor/modules/scraping/episode_scraper.py index 26f36ebec..818b3d05c 100644 --- a/preprocessor/modules/scraping/episode_scraper.py +++ b/preprocessor/modules/scraping/episode_scraper.py @@ -31,21 +31,11 @@ def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: console.print(f'[green]✓ Saved to: {self.output_file}[/green]') self.__validate_and_report_coverage(total_episodes) - def __validate_and_report_coverage(self, scraped_episodes_count: int) -> None: - expected_count = self.__get_expected_episodes_count() - if 
expected_count is None: - self.__print_no_validation_warning(scraped_episodes_count) - return - status, message = self.__get_coverage_status(scraped_episodes_count, expected_count) - self.__print_coverage_report(scraped_episodes_count, expected_count, status, message) - - @staticmethod - def __print_no_validation_warning(scraped_count: int) -> None: - console.print('\n[yellow]⚠ Coverage validation:[/yellow]') - console.print(f' [cyan]Scraped episodes: {scraped_count}[/cyan]') - console.print(' [yellow]No video directory provided - unable to validate coverage[/yellow]') - console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]') - console.print(' [dim]You can add more --scrape-urls if needed[/dim]\n') + def __count_video_files(self, directory: Path) -> int: + count = 0 + for ext in self.SUPPORTED_VIDEO_EXTENSIONS: + count += len(list(directory.rglob(f'*{ext}'))) + return count @staticmethod def __get_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: @@ -55,6 +45,13 @@ def __get_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: return ('extra', f'Scraped {scraped - expected} more episodes than video files') return ('perfect', 'Perfect coverage') + def __get_expected_episodes_count(self) -> Optional[int]: + if self.expected_episodes_count is not None: + return self.expected_episodes_count + if self.videos_dir and self.videos_dir.exists(): + return self.__count_video_files(self.videos_dir) + return None + @staticmethod def __print_coverage_report(scraped: int, expected: int, status: str, message: str) -> None: coverage_pct = scraped / expected * 100 if expected > 0 else 0 @@ -72,15 +69,18 @@ def __print_coverage_report(scraped: int, expected: int, status: str, message: s else: console.print('\n[green]✓ Perfect coverage - all video files have metadata![/green]\n') - def __get_expected_episodes_count(self) -> Optional[int]: - if self.expected_episodes_count is not None: - return self.expected_episodes_count - if 
self.videos_dir and self.videos_dir.exists(): - return self.__count_video_files(self.videos_dir) - return None + @staticmethod + def __print_no_validation_warning(scraped_count: int) -> None: + console.print('\n[yellow]⚠ Coverage validation:[/yellow]') + console.print(f' [cyan]Scraped episodes: {scraped_count}[/cyan]') + console.print(' [yellow]No video directory provided - unable to validate coverage[/yellow]') + console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]') + console.print(' [dim]You can add more --scrape-urls if needed[/dim]\n') - def __count_video_files(self, directory: Path) -> int: - count = 0 - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - count += len(list(directory.rglob(f'*{ext}'))) - return count + def __validate_and_report_coverage(self, scraped_episodes_count: int) -> None: + expected_count = self.__get_expected_episodes_count() + if expected_count is None: + self.__print_no_validation_warning(scraped_episodes_count) + return + status, message = self.__get_coverage_status(scraped_episodes_count, expected_count) + self.__print_coverage_report(scraped_episodes_count, expected_count, status, message) diff --git a/preprocessor/modules/scraping/episode_scraper_step.py b/preprocessor/modules/scraping/episode_scraper_step.py index 068f63e29..27188b091 100644 --- a/preprocessor/modules/scraping/episode_scraper_step.py +++ b/preprocessor/modules/scraping/episode_scraper_step.py @@ -1,57 +1,28 @@ from pathlib import Path -from typing import Optional +from typing import ( + Any, + Dict, +) from preprocessor.config.step_configs import EpisodeScraperConfig -from preprocessor.core.artifacts import SourceVideo -from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.modules.scraping.base_scraper_step import BaseScraperStep from preprocessor.modules.scraping.episode_scraper import EpisodeScraper -class EpisodeScraperStep( - PipelineStep[SourceVideo, 
SourceVideo, EpisodeScraperConfig], -): - def __init__(self, config: EpisodeScraperConfig) -> None: - super().__init__(config) - self._executed = False +class EpisodeScraperStep(BaseScraperStep[EpisodeScraperConfig]): - @property - def name(self) -> str: - return "scrape_episodes" - - def execute( - self, input_data: SourceVideo, context: ExecutionContext, - ) -> Optional[SourceVideo]: - if self._executed: - return input_data - - output_path = Path(self.config.output_file) - - if output_path.exists() and not context.force_rerun: - context.logger.info(f"Episodes metadata already exists: {output_path}") - self._executed = True - return input_data + def _get_scraper_class(self): + return EpisodeScraper - context.logger.info(f"Scraping episodes from {len(self.config.urls)} URLs") + def _get_metadata_type_name(self) -> str: + return "Episodes" - scraper = EpisodeScraper( - { - "urls": self.config.urls, - "output_file": output_path, - "headless": self.config.headless, - "merge_sources": self.config.merge_sources, - "scraper_method": self.config.scraper_method, - "parser_mode": self.config.parser_mode, - "series_name": context.series_name, - }, - ) + def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> Dict[str, Any]: + args = super()._build_scraper_args(output_path, context) + args["merge_sources"] = self.config.merge_sources + return args - exit_code = scraper.work() - - if exit_code != 0: - raise RuntimeError(f"Episode scraper failed with exit code {exit_code}") - - context.logger.info(f"Episodes metadata saved to: {output_path}") - - self._executed = True - return input_data + @property + def name(self) -> str: + return "scrape_episodes" diff --git a/preprocessor/modules/scraping/reference_processor.py b/preprocessor/modules/scraping/reference_processor.py index 3814045bb..dbd900cd7 100644 --- a/preprocessor/modules/scraping/reference_processor.py +++ b/preprocessor/modules/scraping/reference_processor.py @@ -33,6 +33,36 @@ class 
CharacterReferenceProcessor(BaseProcessor): + @dataclass + class _GridDimensions: + face_size: int = 280 + faces_per_char: int = 3 + footer_height: int = 80 + header_height: int = 180 + header_row_height: int = 40 + label_col_width: int = 350 + padding: int = 15 + stats_col_width: int = 200 + + @property + def face_col_width(self) -> int: + return self.face_size + self.padding + + @property + def row_height(self) -> int: + return self.face_size + self.padding * 2 + + def total_height(self, num_chars: int) -> int: + return self.header_height + num_chars * self.row_height + self.footer_height + + def total_width(self) -> int: + return ( + self.label_col_width + + self.stats_col_width + + self.faces_per_char * self.face_col_width + + self.padding * 2 + ) + def __init__(self, args: Dict[str, Any]): super().__init__(args=args, class_name='CharacterReferenceProcessor', error_exit_code=20, loglevel=logging.INFO) self.characters_dir = args['characters_dir'] @@ -41,30 +71,68 @@ def __init__(self, args: Dict[str, Any]): self.interactive = args['interactive'] self.face_app: Optional[FaceAnalysis] = None - def _validate_args(self, args: Dict[str, Any]) -> None: - required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] - for key in required: - if key not in args: - raise ValueError(f'Missing required argument: {key}') + def generate_validation_grid(self) -> None: + output_path = self.output_dir / 'validation_grid.png' + if output_path.exists(): + console.print(f'[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]') + return + + console.print('\n[blue]Generating validation grid...[/blue]') + + if not self.output_dir.exists(): + console.print('[yellow]No processed references found, skipping validation grid[/yellow]') + return + + processed_chars = sorted([d for d in self.output_dir.iterdir() if d.is_dir()]) + if not processed_chars: + console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') + return + + 
dims = self._GridDimensions() + grid_width = dims.total_width() + grid_height = dims.total_height(len(processed_chars)) + bg_color = (250, 252, 255) + grid = np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) + + metadata_all = self.__load_all_metadata(processed_chars) + avg_similarity = ( + np.mean([m.get('average_similarity', 0) for m in metadata_all]) if metadata_all else 0 + ) + + self.__render_header(grid, dims, len(processed_chars), avg_similarity, self.similarity_threshold) + self.__render_table_headers(grid, dims) + + y_offset = dims.header_height + dims.header_row_height + dims.padding + for idx, char_dir in enumerate(processed_chars): + self.__render_character_row(grid, dims, char_dir, idx, y_offset, bg_color) + y_offset += dims.row_height + + self.__render_footer(grid, dims, grid_height) + + cv2.imwrite( + str(output_path), + grid, + [cv2.IMWRITE_PNG_COMPRESSION, 6], + ) + + console.print(f'[green]✓ Validation grid saved to: {output_path}[/green]') + console.print(f'[green] Grid size: {grid_width}x{grid_height}px[/green]') + console.print(f'[green] Characters: {len(processed_chars)}[/green]') + console.print(f'[green] Average similarity: {avg_similarity:.4f}[/green]') def get_output_subdir(self) -> str: return 'character_references' - def _load_resources(self) -> bool: - self.face_app = FaceDetector.init() - return True + def _execute(self) -> None: + super()._execute() + self.generate_validation_grid() - @staticmethod - def __safe_resize(img: np.ndarray, target_size: tuple) -> Optional[np.ndarray]: - if img is None or img.size == 0: - return None - if img.shape[0] == 0 or img.shape[1] == 0: - return None - try: - return cv2.resize(img, target_size) # pylint: disable=no-member - except cv2.error as e: # pylint: disable=catching-non-exception - logging.error(f'OpenCV resize error: {e}') - return None + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + char_output_dir = self.output_dir / item.episode_id + return [ + 
OutputSpec(path=char_output_dir / 'metadata.json', required=True), + OutputSpec(path=char_output_dir / 'face_vector.npy', required=True), + ] def _get_processing_items(self) -> List[ProcessingItem]: items = [] @@ -77,12 +145,12 @@ def _get_processing_items(self) -> List[ProcessingItem]: items.append(ProcessingItem(episode_id=char_dir.name, input_path=char_dir, metadata={'char_name': char_dir.name})) return items - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - char_output_dir = self.output_dir / item.episode_id - return [ - OutputSpec(path=char_output_dir / 'metadata.json', required=True), - OutputSpec(path=char_output_dir / 'face_vector.npy', required=True), - ] + def _get_progress_description(self) -> str: + return 'Processing character references' + + def _load_resources(self) -> bool: + self.face_app = FaceDetector.init() + return True def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: char_dir = item.input_path @@ -103,10 +171,144 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) self.__save_processed_references(char_name, selected_faces, reference_images) console.print(f'[green]✓ Processed {char_name}[/green]') + def _validate_args(self, args: Dict[str, Any]) -> None: + required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] + for key in required: + if key not in args: + raise ValueError(f'Missing required argument: {key}') + + def __ask_user_to_select_candidate( + self, + candidates: List[CandidateFace], + char_name: str, + ) -> Optional[List[FaceData]]: + console.print(f'[yellow]Character: {char_name}[/yellow]') + console.print(f'[yellow]Found {len(candidates)} possible matches across all reference images.[/yellow]') + for idx, candidate in enumerate(candidates, 1): + console.print(f'Candidate {idx}: avg similarity = {candidate.avg_similarity:.2f}') + grid_path = self.__create_selection_grid(candidates, 'candidates', char_name) + 
console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') + while True: + prompt = f'Select the correct character (1-{len(candidates)}) or skip (s): ' + user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin + if user_input == 's': + return None + try: + selection = int(user_input) + if 1 <= selection <= len(candidates): + return candidates[selection - 1].faces + console.print(f"[red]Invalid selection. Please enter 1-{len(candidates)} or 's'[/red]") + except ValueError: + console.print("[red]Invalid input. Please enter a number or 's'[/red]") + + def __ask_user_to_select_initial_face( + self, + first_image_faces: List[FaceData], + all_faces: List[List[FaceData]], + char_name: str, + reference_images: List[Path], + ) -> Optional[List[FaceData]]: + console.print(f'[yellow]Character: {char_name}[/yellow]') + console.print('[yellow]No common face found across all reference images.[/yellow]') + console.print( + '[yellow]Manual selection mode: Please select the correct face ' + 'from the first image.[/yellow]', + ) + console.print( + f'[yellow]Found {len(first_image_faces)} faces in ' + 'first reference image.[/yellow]', + ) + grid_path = self.__create_selection_grid(first_image_faces, 'manual', char_name) + console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') + while True: + prompt = f'Select the correct face (1-{len(first_image_faces)}) or skip (s): ' + user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin + if user_input == 's': + return None + try: + selection = int(user_input) + if 1 <= selection <= len(first_image_faces): + selected_face = first_image_faces[selection - 1] + return self.__find_matching_faces_for_reference( + selected_face.face_vector, + all_faces[1:], + [selected_face], + reference_images, + ) + console.print( + f"[red]Invalid selection. Please enter 1-{len(first_image_faces)} or 's'[/red]", + ) + except ValueError: + console.print("[red]Invalid input. 
Please enter a number or 's'[/red]") + + def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: + if mode == 'candidates': + grid = self.__create_candidates_grid(data) + else: + grid = self.__create_manual_selection_grid(data) + + selection_grids_dir = self.output_dir.parent / 'character_selection_grids' + selection_grids_dir.mkdir(parents=True, exist_ok=True) + output_path = selection_grids_dir / f"{char_name.replace(' ', '_').lower()}_selection.jpg" + cv2.imwrite(str(output_path), grid) + return output_path + + def __create_candidates_grid(self, candidates: List[CandidateFace]) -> np.ndarray: + num_refs = len(candidates[0].faces) + num_candidates = len(candidates) + face_size = 150 + padding = 10 + label_height = 30 + grid_width = num_refs * (face_size + padding) + padding + grid_height = num_candidates * (face_size + label_height + padding) + padding + label_height + grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 + + for col_idx in range(num_refs): + label = f'Ref {col_idx + 1}' + x = padding + col_idx * (face_size + padding) + cv2.putText(grid, label, (x + 10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) + + for cand_idx, candidate in enumerate(candidates): + y_base = label_height + padding + cand_idx * (face_size + label_height + padding) + for face_idx, face_data in enumerate(candidate.faces): + x = padding + face_idx * (face_size + padding) + face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) + if face_resized is not None: + grid[y_base:y_base + face_size, x:x + face_size] = face_resized + + label = f'Candidate {cand_idx + 1}' + cv2.putText(grid, label, (5, y_base + face_size // 2), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1) + + return grid + + def __create_manual_selection_grid(self, faces_data: List[FaceData]) -> np.ndarray: + num_faces = len(faces_data) + cols = min(3, num_faces) + rows = (num_faces + cols - 1) // cols + face_size = 150 + padding = 10 + grid_width = cols * 
(face_size + padding) + padding + grid_height = rows * (face_size + padding) + padding + grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 + + for idx, face_data in enumerate(faces_data): + row = idx // cols + col = idx % cols + x = padding + col * (face_size + padding) + y = padding + row * (face_size + padding) + face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) + if face_resized is not None: + grid[y:y + face_size, x:x + face_size] = face_resized + + label = str(idx + 1) + cv2.putText(grid, label, (x + 5, y + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2) + + return grid + def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[FaceData]]: all_faces = [] for idx, img_path in enumerate(image_paths): - img = cv2.imread(str(img_path)) # pylint: disable=no-member + img = cv2.imread(str(img_path)) if img is None: console.print(f'[yellow]Warning: Could not read {img_path}[/yellow]') all_faces.append([]) @@ -187,81 +389,17 @@ def __find_common_face( candidates.sort(key=lambda c: c.avg_similarity, reverse=True) return candidates[0].faces - def __ask_user_to_select_candidate( - self, - candidates: List[CandidateFace], - char_name: str, - ) -> Optional[List[FaceData]]: - console.print(f'[yellow]Character: {char_name}[/yellow]') - console.print(f'[yellow]Found {len(candidates)} possible matches across all reference images.[/yellow]') - for idx, candidate in enumerate(candidates, 1): - console.print(f'Candidate {idx}: avg similarity = {candidate.avg_similarity:.2f}') - grid_path = self.__create_selection_grid(candidates, 'candidates', char_name) - console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') - while True: - prompt = f'Select the correct character (1-{len(candidates)}) or skip (s): ' - user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin - if user_input == 's': - return None - try: - selection = int(user_input) - if 1 <= selection <= len(candidates): - return 
candidates[selection - 1].faces - console.print(f"[red]Invalid selection. Please enter 1-{len(candidates)} or 's'[/red]") - except ValueError: - console.print("[red]Invalid input. Please enter a number or 's'[/red]") - - def __ask_user_to_select_initial_face( + def __find_matching_faces_for_reference( self, - first_image_faces: List[FaceData], - all_faces: List[List[FaceData]], - char_name: str, + reference_vector: np.ndarray, + remaining_images: List[List[FaceData]], + matched_faces: List[FaceData], reference_images: List[Path], ) -> Optional[List[FaceData]]: - console.print(f'[yellow]Character: {char_name}[/yellow]') - console.print('[yellow]No common face found across all reference images.[/yellow]') - console.print( - '[yellow]Manual selection mode: Please select the correct face ' - 'from the first image.[/yellow]', - ) - console.print( - f'[yellow]Found {len(first_image_faces)} faces in ' - 'first reference image.[/yellow]', - ) - grid_path = self.__create_selection_grid(first_image_faces, 'manual', char_name) - console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') - while True: - prompt = f'Select the correct face (1-{len(first_image_faces)}) or skip (s): ' - user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin - if user_input == 's': - return None - try: - selection = int(user_input) - if 1 <= selection <= len(first_image_faces): - selected_face = first_image_faces[selection - 1] - return self.__find_matching_faces_for_reference( - selected_face.face_vector, - all_faces[1:], - [selected_face], - reference_images, - ) - console.print( - f"[red]Invalid selection. Please enter 1-{len(first_image_faces)} or 's'[/red]", - ) - except ValueError: - console.print("[red]Invalid input. 
Please enter a number or 's'[/red]") - - def __find_matching_faces_for_reference( - self, - reference_vector: np.ndarray, - remaining_images: List[List[FaceData]], - matched_faces: List[FaceData], - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - for img_idx, other_image_faces in enumerate(remaining_images, 1): - if not other_image_faces: - img_path = reference_images[img_idx] - console.print(f'[red]No faces found in image {img_idx + 1}: {img_path}[/red]') + for img_idx, other_image_faces in enumerate(remaining_images, 1): + if not other_image_faces: + img_path = reference_images[img_idx] + console.print(f'[red]No faces found in image {img_idx + 1}: {img_path}[/red]') return None best_match = None best_sim: float = -1.0 @@ -287,159 +425,6 @@ def __find_matching_faces_for_reference( return None return matched_faces - def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: # pylint: disable=too-many-locals - if mode == 'candidates': - candidates = data - num_refs = len(candidates[0].faces) - num_candidates = len(candidates) - face_size = 150 - padding = 10 - label_height = 30 - grid_width = num_refs * (face_size + padding) + padding - grid_height = num_candidates * (face_size + label_height + padding) + padding + label_height - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - for col_idx in range(num_refs): - label = f'Ref {col_idx + 1}' - x = padding + col_idx * (face_size + padding) - cv2.putText( # pylint: disable=no-member - grid, label, (x + 10, 20), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.5, (0, 0, 0), 1, - ) - for cand_idx, candidate in enumerate(candidates): - y_base = label_height + padding + cand_idx * (face_size + label_height + padding) - for face_idx, face_data in enumerate(candidate.faces): - x = padding + face_idx * (face_size + padding) - y = y_base - face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) - if face_resized is not None: - grid[y:y + 
face_size, x:x + face_size] = face_resized - label = f'Candidate {cand_idx + 1}' - cv2.putText( # pylint: disable=no-member - grid, label, (5, y_base + face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1, # pylint: disable=no-member - ) - else: - faces_data = data - num_faces = len(faces_data) - cols = min(3, num_faces) - rows = (num_faces + cols - 1) // cols - face_size = 150 - padding = 10 - grid_width = cols * (face_size + padding) + padding - grid_height = rows * (face_size + padding) + padding - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - for idx, face_data in enumerate(faces_data): - row = idx // cols - col = idx % cols - x = padding + col * (face_size + padding) - y = padding + row * (face_size + padding) - face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) - if face_resized is not None: - grid[y:y + face_size, x:x + face_size] = face_resized - label = str(idx + 1) - cv2.putText( # pylint: disable=no-member - grid, - label, - (x + 5, y + 20), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.7, - (0, 0, 255), - 2, - ) - selection_grids_dir = self.output_dir.parent / 'character_selection_grids' - selection_grids_dir.mkdir(parents=True, exist_ok=True) - output_path = selection_grids_dir / f"{char_name.replace(' ', '_').lower()}_selection.jpg" - cv2.imwrite(str(output_path), grid) # pylint: disable=no-member - return output_path - - def __save_processed_references( # pylint: disable=too-many-locals - self, - char_name: str, - selected_faces: List[FaceData], - reference_images: List[Path], - ) -> None: - char_output_dir = self.output_dir / char_name - char_output_dir.mkdir(parents=True, exist_ok=True) - face_vectors = [] - for idx, face_data in enumerate(selected_faces): - face_normalized = self.__safe_resize(face_data.face_img, settings.character.normalized_face_size) - if face_normalized is None: - self.logger.warning(f'Skipping face {idx} for {char_name}: failed to resize (invalid 
dimensions)') - continue - face_output_path = char_output_dir / f'face_{idx:02d}.jpg' - cv2.imwrite(str(face_output_path), face_normalized) # pylint: disable=no-member - face_vectors.append(face_data.face_vector) - mean_vector = np.mean(face_vectors, axis=0) - vector_path = char_output_dir / 'face_vector.npy' - np.save(vector_path, mean_vector) - total_faces_detected = [] - for faces_list in self.__detect_faces_in_references(reference_images): - total_faces_detected.append(len(faces_list)) - similarities = [] - if len(selected_faces) > 1: - for i in range(len(selected_faces) - 1): - similarity = np.dot(selected_faces[i].face_vector, selected_faces[i + 1].face_vector) - similarities.append(similarity) - metadata = { - 'character_name': char_name.replace('_', ' ').title(), - 'source_images': [str(img) for img in reference_images], - 'processed_at': datetime.now().isoformat(), - 'processing_params': { - 'similarity_threshold': self.similarity_threshold, - 'face_model': settings.face_recognition.model_name, - 'normalized_face_size': list(settings.character.normalized_face_size), - }, - 'detection_stats': { - 'total_faces_detected': total_faces_detected, - 'candidates_found': 1, - 'selection_method': 'automatic' if len(selected_faces) == len(reference_images) else 'manual', - }, - 'selected_face_indices': [face.source_image_idx for face in selected_faces], - 'average_similarity': float(np.mean(similarities)) if similarities else 1.0, - 'face_vector_dim': int(mean_vector.shape[0]), - } - metadata_path = char_output_dir / 'metadata.json' - with open(metadata_path, 'w', encoding='utf-8') as f: - json.dump(metadata, f, indent=2, ensure_ascii=False) - - def _get_progress_description(self) -> str: - return 'Processing character references' - - def _execute(self) -> None: - super()._execute() - self.generate_validation_grid() - - @dataclass - class _GridDimensions: - face_size: int = 280 - padding: int = 15 - header_height: int = 180 - footer_height: int = 80 - 
label_col_width: int = 350 - stats_col_width: int = 200 - header_row_height: int = 40 - faces_per_char: int = 3 - - @property - def row_height(self) -> int: - return self.face_size + self.padding * 2 - - @property - def face_col_width(self) -> int: - return self.face_size + self.padding - - def total_width(self) -> int: - return ( - self.label_col_width - + self.stats_col_width - + self.faces_per_char * self.face_col_width - + self.padding * 2 - ) - - def total_height(self, num_chars: int) -> int: - return self.header_height + num_chars * self.row_height + self.footer_height - @staticmethod def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: metadata_all = [] @@ -450,96 +435,7 @@ def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: metadata_all.append(json.load(f)) return metadata_all - @staticmethod - def __render_header( - grid: np.ndarray, - dims: _GridDimensions, - total_chars: int, - avg_similarity: float, - threshold: float, - ) -> None: - header_bg_color = (45, 55, 72) - cv2.rectangle(grid, (0, 0), (dims.total_width(), dims.header_height), header_bg_color, -1) # pylint: disable=no-member - - title_text = 'FACIAL REFERENCE VALIDATION REPORT' - cv2.putText( # pylint: disable=no-member - grid, - title_text, - (dims.padding * 3, 50), - cv2.FONT_HERSHEY_DUPLEX, # pylint: disable=no-member - 1.1, - (255, 255, 255), - 2, - cv2.LINE_AA, # pylint: disable=no-member - ) - - subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' - cv2.putText( # pylint: disable=no-member - grid, - subtitle, - (dims.padding * 3, 85), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.55, - (200, 210, 220), - 1, - cv2.LINE_AA, # pylint: disable=no-member - ) - - stats_y = 115 - stats_items = [ - f'Total Subjects: {total_chars}', - f'Avg Similarity: {avg_similarity:.4f}', - f'Threshold: {threshold:.2f}', - ] - for idx, stat in enumerate(stats_items): - x_pos = dims.padding * 3 + idx * 280 - 
cv2.putText( # pylint: disable=no-member - grid, - stat, - (x_pos, stats_y), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.5, - (180, 200, 220), - 1, - cv2.LINE_AA, # pylint: disable=no-member - ) - - @staticmethod - def __render_table_headers(grid: np.ndarray, dims: _GridDimensions) -> None: - table_header_y = dims.header_height + 1 - cv2.line(grid, (0, table_header_y), (dims.total_width(), table_header_y), (180, 190, 200), 2) # pylint: disable=no-member - - col_headers = [ - ('CHARACTER NAME', dims.label_col_width // 2, 0), - ('STATISTICS', dims.label_col_width + dims.stats_col_width // 2, 0), - ('REFERENCE IMAGE 1', dims.label_col_width + dims.stats_col_width + dims.face_col_width // 2, 0), - ('REFERENCE IMAGE 2', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 3 // 2, 0), - ('REFERENCE IMAGE 3', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 5 // 2, 0), - ] - - for text, x_center, _ in col_headers: - text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] # pylint: disable=no-member - text_x = x_center - text_size[0] // 2 - cv2.putText( # pylint: disable=no-member - grid, - text, - (text_x, table_header_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.42, - (60, 70, 85), - 1, - cv2.LINE_AA, # pylint: disable=no-member - ) - - cv2.line( # pylint: disable=no-member - grid, - (0, table_header_y + dims.header_row_height), - (dims.total_width(), table_header_y + dims.header_row_height), - (200, 210, 220), - 1, - ) - - def __render_character_row( # pylint: disable=too-many-locals + def __render_character_row( self, grid: np.ndarray, dims: _GridDimensions, @@ -549,10 +445,9 @@ def __render_character_row( # pylint: disable=too-many-locals bg_color: Tuple[int, int, int], ) -> None: char_name = char_dir.name.replace('_', ' ').title() - metadata_file = char_dir / 'metadata.json' - row_bg = (245, 248, 252) if row_idx % 2 == 0 else bg_color - cv2.rectangle( # pylint: disable=no-member 
+ + cv2.rectangle( grid, (0, y_offset - dims.padding), (dims.total_width(), y_offset + dims.face_size + dims.padding), @@ -560,67 +455,61 @@ def __render_character_row( # pylint: disable=too-many-locals -1, ) - cv2.putText( # pylint: disable=no-member + cv2.putText( grid, char_name, (dims.padding * 2, y_offset + dims.face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + cv2.FONT_HERSHEY_SIMPLEX, 0.55, (30, 40, 50), 1, - cv2.LINE_AA, # pylint: disable=no-member + cv2.LINE_AA, ) - if metadata_file.exists(): - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata = json.load(f) + self.__render_character_stats(grid, dims, char_dir, y_offset) + self.__render_character_faces(grid, dims, char_dir, y_offset) - similarity = metadata.get('average_similarity', 0.0) - method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') - faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) + def __render_character_stats( + self, grid: np.ndarray, dims: _GridDimensions, char_dir: Path, y_offset: int, + ) -> None: + metadata_file = char_dir / 'metadata.json' + if not metadata_file.exists(): + return - stats_x = dims.label_col_width + dims.padding - stats_y_base = y_offset + dims.face_size // 2 - 30 + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) - sim_color = (0, 150, 0) if similarity >= self.similarity_threshold else (180, 100, 0) - cv2.putText( # pylint: disable=no-member - grid, - f'Similarity: {similarity:.4f}', - (stats_x, stats_y_base), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.45, - sim_color, - 1, - cv2.LINE_AA, # pylint: disable=no-member - ) + similarity = metadata.get('average_similarity', 0.0) + method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') + faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) - method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) - 
cv2.putText( # pylint: disable=no-member - grid, - f'Method: {method}', - (stats_x, stats_y_base + 25), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.42, - method_color, - 1, - cv2.LINE_AA, # pylint: disable=no-member - ) + stats_x = dims.label_col_width + dims.padding + stats_y_base = y_offset + dims.face_size // 2 - 30 - faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' - cv2.putText( # pylint: disable=no-member - grid, - f'Detected: {faces_str}', - (stats_x, stats_y_base + 50), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, # pylint: disable=no-member - ) + sim_color = (0, 150, 0) if similarity >= self.similarity_threshold else (180, 100, 0) + cv2.putText( + grid, f'Similarity: {similarity:.4f}', (stats_x, stats_y_base), + cv2.FONT_HERSHEY_SIMPLEX, 0.45, sim_color, 1, cv2.LINE_AA, + ) + + method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) + cv2.putText( + grid, f'Method: {method}', (stats_x, stats_y_base + 25), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, method_color, 1, cv2.LINE_AA, + ) + faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' + cv2.putText( + grid, f'Detected: {faces_str}', (stats_x, stats_y_base + 50), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, cv2.LINE_AA, + ) + + def __render_character_faces( + self, grid: np.ndarray, dims: _GridDimensions, char_dir: Path, y_offset: int, + ) -> None: face_files = sorted(char_dir.glob('face_*.jpg')) for face_idx, face_file in enumerate(face_files[:dims.faces_per_char]): - face_img = cv2.imread(str(face_file)) # pylint: disable=no-member + face_img = cv2.imread(str(face_file)) if face_img is None: continue @@ -629,22 +518,18 @@ def __render_character_row( # pylint: disable=too-many-locals continue x = dims.label_col_width + dims.stats_col_width + face_idx * dims.face_col_width + dims.padding - y = y_offset - grid[y:y + 
dims.face_size, x:x + dims.face_size] = face_resized + grid[y_offset:y_offset + dims.face_size, x:x + dims.face_size] = face_resized - border_color = (180, 190, 200) - cv2.rectangle( # pylint: disable=no-member - grid, - (x - 1, y - 1), - (x + dims.face_size + 1, y + dims.face_size + 1), - border_color, - 1, + cv2.rectangle( + grid, (x - 1, y_offset - 1), + (x + dims.face_size + 1, y_offset + dims.face_size + 1), + (180, 190, 200), 1, ) @staticmethod def __render_footer(grid: np.ndarray, dims: _GridDimensions, grid_height: int) -> None: footer_y = grid_height - dims.footer_height + 20 - cv2.line(grid, (0, footer_y - 20), (dims.total_width(), footer_y - 20), (200, 210, 220), 1) # pylint: disable=no-member + cv2.line(grid, (0, footer_y - 20), (dims.total_width(), footer_y - 20), (200, 210, 220), 1) footer_text = ( f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " @@ -652,15 +537,15 @@ def __render_footer(grid: np.ndarray, dims: _GridDimensions, grid_height: int) - f"Normalized Size: {settings.character.normalized_face_size[0]}x" f"{settings.character.normalized_face_size[1]}px" ) - cv2.putText( # pylint: disable=no-member + cv2.putText( grid, footer_text, (dims.padding * 3, footer_y), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + cv2.FONT_HERSHEY_SIMPLEX, 0.4, (120, 130, 140), 1, - cv2.LINE_AA, # pylint: disable=no-member + cv2.LINE_AA, ) legend_y = footer_y + 30 @@ -670,63 +555,179 @@ def __render_footer(grid: np.ndarray, dims: _GridDimensions, grid_height: int) - ] for idx, (text, color) in enumerate(legend_items): x_pos = dims.padding * 3 + idx * 380 - cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) # pylint: disable=no-member - cv2.putText( # pylint: disable=no-member + cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) + cv2.putText( grid, text, (x_pos + 15, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, # pylint: disable=no-member + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, - cv2.LINE_AA, # pylint: disable=no-member + 
cv2.LINE_AA, ) - def generate_validation_grid(self) -> None: - output_path = self.output_dir / 'validation_grid.png' - if output_path.exists(): - console.print(f'[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]') - return + @staticmethod + def __render_header( + grid: np.ndarray, + dims: _GridDimensions, + total_chars: int, + avg_similarity: float, + threshold: float, + ) -> None: + header_bg_color = (45, 55, 72) + cv2.rectangle(grid, (0, 0), (dims.total_width(), dims.header_height), header_bg_color, -1) - console.print('\n[blue]Generating validation grid...[/blue]') + title_text = 'FACIAL REFERENCE VALIDATION REPORT' + cv2.putText( + grid, + title_text, + (dims.padding * 3, 50), + cv2.FONT_HERSHEY_DUPLEX, + 1.1, + (255, 255, 255), + 2, + cv2.LINE_AA, + ) - if not self.output_dir.exists(): - console.print('[yellow]No processed references found, skipping validation grid[/yellow]') - return + subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' + cv2.putText( + grid, + subtitle, + (dims.padding * 3, 85), + cv2.FONT_HERSHEY_SIMPLEX, + 0.55, + (200, 210, 220), + 1, + cv2.LINE_AA, + ) - processed_chars = sorted([d for d in self.output_dir.iterdir() if d.is_dir()]) - if not processed_chars: - console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') - return + stats_y = 115 + stats_items = [ + f'Total Subjects: {total_chars}', + f'Avg Similarity: {avg_similarity:.4f}', + f'Threshold: {threshold:.2f}', + ] + for idx, stat in enumerate(stats_items): + x_pos = dims.padding * 3 + idx * 280 + cv2.putText( + grid, + stat, + (x_pos, stats_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (180, 200, 220), + 1, + cv2.LINE_AA, + ) - dims = self._GridDimensions() - grid_width = dims.total_width() - grid_height = dims.total_height(len(processed_chars)) - bg_color = (250, 252, 255) - grid = np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) + @staticmethod + def __render_table_headers(grid: 
np.ndarray, dims: _GridDimensions) -> None: + table_header_y = dims.header_height + 1 + cv2.line(grid, (0, table_header_y), (dims.total_width(), table_header_y), (180, 190, 200), 2) - metadata_all = self.__load_all_metadata(processed_chars) - avg_similarity = ( - np.mean([m.get('average_similarity', 0) for m in metadata_all]) if metadata_all else 0 + col_headers = [ + ('CHARACTER NAME', dims.label_col_width // 2, 0), + ('STATISTICS', dims.label_col_width + dims.stats_col_width // 2, 0), + ('REFERENCE IMAGE 1', dims.label_col_width + dims.stats_col_width + dims.face_col_width // 2, 0), + ('REFERENCE IMAGE 2', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 3 // 2, 0), + ('REFERENCE IMAGE 3', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 5 // 2, 0), + ] + + for text, x_center, _ in col_headers: + text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] + text_x = x_center - text_size[0] // 2 + cv2.putText( + grid, + text, + (text_x, table_header_y + 25), + cv2.FONT_HERSHEY_SIMPLEX, + 0.42, + (60, 70, 85), + 1, + cv2.LINE_AA, + ) + + cv2.line( + grid, + (0, table_header_y + dims.header_row_height), + (dims.total_width(), table_header_y + dims.header_row_height), + (200, 210, 220), + 1, ) - self.__render_header(grid, dims, len(processed_chars), avg_similarity, self.similarity_threshold) - self.__render_table_headers(grid, dims) + @staticmethod + def __safe_resize(img: np.ndarray, target_size: tuple) -> Optional[np.ndarray]: + if img is None or img.size == 0: + return None + if img.shape[0] == 0 or img.shape[1] == 0: + return None + try: + return cv2.resize(img, target_size) + except cv2.error as e: + logging.error(f'OpenCV resize error: {e}') + return None - y_offset = dims.header_height + dims.header_row_height + dims.padding - for idx, char_dir in enumerate(processed_chars): - self.__render_character_row(grid, dims, char_dir, idx, y_offset, bg_color) - y_offset += dims.row_height + def 
__save_processed_references( + self, + char_name: str, + selected_faces: List[FaceData], + reference_images: List[Path], + ) -> None: + char_output_dir = self.output_dir / char_name + char_output_dir.mkdir(parents=True, exist_ok=True) - self.__render_footer(grid, dims, grid_height) + face_vectors = [] + for idx, face_data in enumerate(selected_faces): + face_normalized = self.__safe_resize(face_data.face_img, settings.character.normalized_face_size) + if face_normalized is None: + self.logger.warning(f'Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)') + continue + face_output_path = char_output_dir / f'face_{idx:02d}.jpg' + cv2.imwrite(str(face_output_path), face_normalized) + face_vectors.append(face_data.face_vector) - cv2.imwrite( # pylint: disable=no-member - str(output_path), - grid, - [cv2.IMWRITE_PNG_COMPRESSION, 6], # pylint: disable=no-member + mean_vector = np.mean(face_vectors, axis=0) + np.save(char_output_dir / 'face_vector.npy', mean_vector) + + metadata = self.__create_reference_metadata( + char_name, selected_faces, reference_images, mean_vector, ) + with open(char_output_dir / 'metadata.json', 'w', encoding='utf-8') as f: + json.dump(metadata, f, indent=2, ensure_ascii=False) - console.print(f'[green]✓ Validation grid saved to: {output_path}[/green]') - console.print(f'[green] Grid size: {grid_width}x{grid_height}px[/green]') - console.print(f'[green] Characters: {len(processed_chars)}[/green]') - console.print(f'[green] Average similarity: {avg_similarity:.4f}[/green]') + def __create_reference_metadata( + self, + char_name: str, + selected_faces: List[FaceData], + reference_images: List[Path], + mean_vector: np.ndarray, + ) -> Dict[str, Any]: + total_faces_detected = [ + len(faces_list) for faces_list in self.__detect_faces_in_references(reference_images) + ] + + similarities = [] + if len(selected_faces) > 1: + for i in range(len(selected_faces) - 1): + similarity = np.dot(selected_faces[i].face_vector, 
selected_faces[i + 1].face_vector) + similarities.append(similarity) + + return { + 'character_name': char_name.replace('_', ' ').title(), + 'source_images': [str(img) for img in reference_images], + 'processed_at': datetime.now().isoformat(), + 'processing_params': { + 'similarity_threshold': self.similarity_threshold, + 'face_model': settings.face_recognition.model_name, + 'normalized_face_size': list(settings.character.normalized_face_size), + }, + 'detection_stats': { + 'total_faces_detected': total_faces_detected, + 'candidates_found': 1, + 'selection_method': 'automatic' if len(selected_faces) == len(reference_images) else 'manual', + }, + 'selected_face_indices': [face.source_image_idx for face in selected_faces], + 'average_similarity': float(np.mean(similarities)) if similarities else 1.0, + 'face_vector_dim': int(mean_vector.shape[0]), + } diff --git a/preprocessor/modules/scraping/reference_processor_step.py b/preprocessor/modules/scraping/reference_processor_step.py index 7ab632766..1dbff41f0 100644 --- a/preprocessor/modules/scraping/reference_processor_step.py +++ b/preprocessor/modules/scraping/reference_processor_step.py @@ -15,10 +15,6 @@ def __init__(self, config: CharacterReferenceConfig) -> None: super().__init__(config) self._executed = False - @property - def name(self) -> str: - return "process_character_references" - def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> Optional[SourceVideo]: @@ -61,3 +57,7 @@ def execute( self._executed = True return input_data + + @property + def name(self) -> str: + return "process_character_references" diff --git a/preprocessor/modules/search/clients/elasticsearch_queries.py b/preprocessor/modules/search/clients/elasticsearch_queries.py index bbea17e6d..c856ef1a9 100644 --- a/preprocessor/modules/search/clients/elasticsearch_queries.py +++ b/preprocessor/modules/search/clients/elasticsearch_queries.py @@ -17,153 +17,49 @@ def __init__(self, embedding_service: EmbeddingService, 
index_base: str) -> None self._embedding_service = embedding_service self._index_base = index_base - @property - def __segments_index(self) -> str: - return f'{self._index_base}_text_segments' - - @property - def __text_embeddings_index(self) -> str: - return f'{self._index_base}_text_embeddings' - - @property - def __video_frames_index(self) -> str: - return f'{self._index_base}_video_frames' - - @property - def __episode_names_index(self) -> str: - return f'{self._index_base}_episode_names' - - @staticmethod - def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: - filters = [] - if season is not None: - filters.append({'term': {'episode_metadata.season': season}}) - if episode is not None: - filters.append({'term': {'episode_metadata.episode_number': episode}}) - return filters - - async def search_text_query( - self, - es_client: AsyncElasticsearch, - query: str, - season: Optional[int]=None, - episode: Optional[int]=None, - limit: int=20, - ) -> Dict[str, Any]: - must_clauses = [ - {'multi_match': {'query': query, 'fields': ['text^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, - ] - must_clauses.extend(self.__build_episode_filters(season, episode)) - query_body = {'bool': {'must': must_clauses}} - return await es_client.search( - index=self.__segments_index, - query=query_body, - size=limit, - _source=[ - 'episode_id', 'segment_id', 'text', 'start_time', 'end_time', 'speaker', - 'video_path', 'episode_metadata', 'scene_info', - ], - ) - - async def search_text_semantic( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int]=None, - episode: Optional[int]=None, - limit: int=10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = self.__build_episode_filters(season, episode) - knn_query: Dict[str, Any] = { - 'field': 'text_embedding', - 'query_vector': embedding, - 'k': limit, - 'num_candidates': limit * 10, + async def get_stats(self, 
es_client: AsyncElasticsearch) -> Dict[str, int]: + return { + 'segments': (await es_client.count(index=self.__segments_index))['count'], + 'text_embeddings': (await es_client.count(index=self.__text_embeddings_index))['count'], + 'video_embeddings': (await es_client.count(index=self.__video_frames_index))['count'], + 'episode_names': (await es_client.count(index=self.__episode_names_index))['count'], } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( - index=self.__text_embeddings_index, - knn=knn_query, - size=limit, - _source=[ - 'episode_id', 'embedding_id', 'text', 'segment_range', - 'video_path', 'episode_metadata', 'scene_info', - ], - ) - async def search_video_semantic( - self, - es_client: AsyncElasticsearch, - image_path: str, - season: Optional[int]=None, - episode: Optional[int]=None, - character: Optional[str]=None, - limit: int=10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_image_embedding(image_path) - filter_clauses = self.__build_episode_filters(season, episode) - if character: - filter_clauses.append({ - 'nested': { - 'path': 'character_appearances', - 'query': {'term': {'character_appearances.name': character}}, - }, - }) - knn_query: Dict[str, Any] = { - 'field': 'video_embedding', - 'query_vector': embedding, - 'k': limit, - 'num_candidates': limit * 10, - } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( + async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + result = await es_client.search( index=self.__video_frames_index, - knn=knn_query, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', - 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', - ], + size=0, + aggs={ + 'characters_nested': { + 'nested': {'path': 'character_appearances'}, + 'aggs': { + 'character_names': { + 'terms': {'field': 
'character_appearances.name', 'size': 1000}, + }, + }, + }, + }, ) + buckets = result['aggregations']['characters_nested']['character_names']['buckets'] + return [(b['key'], b['doc_count']) for b in buckets] - async def search_text_to_video( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int]=None, - episode: Optional[int]=None, - character: Optional[str]=None, - limit: int=10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = self.__build_episode_filters(season, episode) - if character: - filter_clauses.append({ - 'nested': { - 'path': 'character_appearances', - 'query': {'term': {'character_appearances.name': character}}, - }, - }) - knn_query: Dict[str, Any] = { - 'field': 'video_embedding', - 'query_vector': embedding, - 'k': limit, - 'num_candidates': limit * 10, - } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( + async def list_objects(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: + result = await es_client.search( index=self.__video_frames_index, - knn=knn_query, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', - 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', - ], + size=0, + aggs={ + 'objects_nested': { + 'nested': {'path': 'detected_objects'}, + 'aggs': { + 'object_classes': { + 'terms': {'field': 'detected_objects.class', 'size': 1000}, + }, + }, + }, + }, ) + buckets = result['aggregations']['objects_nested']['object_classes']['buckets'] + return [(b['key'], b['doc_count']) for b in buckets] async def search_by_character( self, @@ -318,6 +214,52 @@ async def search_by_object( ], ) + async def search_episode_name( + self, + es_client: AsyncElasticsearch, + query: str, + season: Optional[int]=None, + limit: int=20, + ) -> Dict[str, Any]: + must_clauses = [ + {'multi_match': {'query': query, 'fields': ['title^2', 
'episode_metadata.title'], 'fuzziness': 'AUTO'}}, + ] + if season is not None: + must_clauses.append({'term': {'episode_metadata.season': season}}) + query_body = {'bool': {'must': must_clauses}} + return await es_client.search( + index=self.__episode_names_index, + query=query_body, + size=limit, + _source=['episode_id', 'title', 'video_path', 'episode_metadata'], + ) + + async def search_episode_name_semantic( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + filter_clauses = [] + if season is not None: + filter_clauses.append({'term': {'episode_metadata.season': season}}) + knn_query: Dict[str, Any] = { + 'field': 'title_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, + } + if filter_clauses: + knn_query['filter'] = filter_clauses + return await es_client.search( + index=self.__episode_names_index, + knn=knn_query, + size=limit, + _source=['episode_id', 'title', 'video_path', 'episode_metadata'], + ) + async def search_perceptual_hash( self, es_client: AsyncElasticsearch, @@ -334,75 +276,41 @@ async def search_perceptual_hash( ], ) - async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: - result = await es_client.search( - index=self.__video_frames_index, - size=0, - aggs={ - 'characters_nested': { - 'nested': {'path': 'character_appearances'}, - 'aggs': { - 'character_names': { - 'terms': {'field': 'character_appearances.name', 'size': 1000}, - }, - }, - }, - }, - ) - buckets = result['aggregations']['characters_nested']['character_names']['buckets'] - return [(b['key'], b['doc_count']) for b in buckets] - - async def list_objects(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: - result = await es_client.search( - index=self.__video_frames_index, - size=0, - aggs={ - 'objects_nested': { - 'nested': {'path': 'detected_objects'}, - 'aggs': 
{ - 'object_classes': { - 'terms': {'field': 'detected_objects.class', 'size': 1000}, - }, - }, - }, - }, - ) - buckets = result['aggregations']['objects_nested']['object_classes']['buckets'] - return [(b['key'], b['doc_count']) for b in buckets] - - async def search_episode_name( + async def search_text_query( self, es_client: AsyncElasticsearch, query: str, season: Optional[int]=None, + episode: Optional[int]=None, limit: int=20, ) -> Dict[str, Any]: must_clauses = [ - {'multi_match': {'query': query, 'fields': ['title^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, + {'multi_match': {'query': query, 'fields': ['text^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, ] - if season is not None: - must_clauses.append({'term': {'episode_metadata.season': season}}) + must_clauses.extend(self.__build_episode_filters(season, episode)) query_body = {'bool': {'must': must_clauses}} return await es_client.search( - index=self.__episode_names_index, + index=self.__segments_index, query=query_body, size=limit, - _source=['episode_id', 'title', 'video_path', 'episode_metadata'], + _source=[ + 'episode_id', 'segment_id', 'text', 'start_time', 'end_time', 'speaker', + 'video_path', 'episode_metadata', 'scene_info', + ], ) - async def search_episode_name_semantic( + async def search_text_semantic( self, es_client: AsyncElasticsearch, text: str, season: Optional[int]=None, + episode: Optional[int]=None, limit: int=10, ) -> Dict[str, Any]: embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = [] - if season is not None: - filter_clauses.append({'term': {'episode_metadata.season': season}}) + filter_clauses = self.__build_episode_filters(season, episode) knn_query: Dict[str, Any] = { - 'field': 'title_embedding', + 'field': 'text_embedding', 'query_vector': embedding, 'k': limit, 'num_candidates': limit * 10, @@ -410,16 +318,108 @@ async def search_episode_name_semantic( if filter_clauses: knn_query['filter'] = filter_clauses return await 
es_client.search( - index=self.__episode_names_index, + index=self.__text_embeddings_index, knn=knn_query, size=limit, - _source=['episode_id', 'title', 'video_path', 'episode_metadata'], + _source=[ + 'episode_id', 'embedding_id', 'text', 'segment_range', + 'video_path', 'episode_metadata', 'scene_info', + ], ) - async def get_stats(self, es_client: AsyncElasticsearch) -> Dict[str, int]: - return { - 'segments': (await es_client.count(index=self.__segments_index))['count'], - 'text_embeddings': (await es_client.count(index=self.__text_embeddings_index))['count'], - 'video_embeddings': (await es_client.count(index=self.__video_frames_index))['count'], - 'episode_names': (await es_client.count(index=self.__episode_names_index))['count'], + async def search_text_to_video( + self, + es_client: AsyncElasticsearch, + text: str, + season: Optional[int]=None, + episode: Optional[int]=None, + character: Optional[str]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = self._embedding_service.get_text_embedding(text) + filter_clauses = self.__build_episode_filters(season, episode) + if character: + filter_clauses.append({ + 'nested': { + 'path': 'character_appearances', + 'query': {'term': {'character_appearances.name': character}}, + }, + }) + knn_query: Dict[str, Any] = { + 'field': 'video_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, } + if filter_clauses: + knn_query['filter'] = filter_clauses + return await es_client.search( + index=self.__video_frames_index, + knn=knn_query, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', + 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', + ], + ) + + async def search_video_semantic( + self, + es_client: AsyncElasticsearch, + image_path: str, + season: Optional[int]=None, + episode: Optional[int]=None, + character: Optional[str]=None, + limit: int=10, + ) -> Dict[str, Any]: + embedding = 
self._embedding_service.get_image_embedding(image_path) + filter_clauses = self.__build_episode_filters(season, episode) + if character: + filter_clauses.append({ + 'nested': { + 'path': 'character_appearances', + 'query': {'term': {'character_appearances.name': character}}, + }, + }) + knn_query: Dict[str, Any] = { + 'field': 'video_embedding', + 'query_vector': embedding, + 'k': limit, + 'num_candidates': limit * 10, + } + if filter_clauses: + knn_query['filter'] = filter_clauses + return await es_client.search( + index=self.__video_frames_index, + knn=knn_query, + size=limit, + _source=[ + 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', + 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', + ], + ) + + @staticmethod + def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: + filters = [] + if season is not None: + filters.append({'term': {'episode_metadata.season': season}}) + if episode is not None: + filters.append({'term': {'episode_metadata.episode_number': episode}}) + return filters + + @property + def __episode_names_index(self) -> str: + return f'{self._index_base}_episode_names' + + @property + def __segments_index(self) -> str: + return f'{self._index_base}_text_segments' + + @property + def __text_embeddings_index(self) -> str: + return f'{self._index_base}_text_embeddings' + + @property + def __video_frames_index(self) -> str: + return f'{self._index_base}_video_frames' diff --git a/preprocessor/modules/search/clients/embedding_service.py b/preprocessor/modules/search/clients/embedding_service.py index 0cab1a81a..fed8c47f0 100644 --- a/preprocessor/modules/search/clients/embedding_service.py +++ b/preprocessor/modules/search/clients/embedding_service.py @@ -24,28 +24,14 @@ def __init__(self) -> None: self._processor: Optional[AutoProcessor] = None self._device: Optional[str] = None - def _load_model(self) -> Tuple[AutoModelForVision2Seq, 
AutoProcessor, str]: + def cleanup(self) -> None: if self._model is not None: - return (self._model, self._processor, self._device) - click.echo('Loading embedding model...', err=True) - if not torch.cuda.is_available(): - raise RuntimeError('CUDA is required but not available. This pipeline requires GPU.') - model_name = settings.embedding_model.model_name - self._device = 'cuda' - self._model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') - self._processor = AutoProcessor.from_pretrained(model_name) - click.echo(f'Model loaded on {self._device}', err=True) - return (self._model, self._processor, self._device) - - def get_text_embedding(self, text: str) -> List[float]: - model, processor, device = self._load_model() - messages = [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] - text_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors='pt').to(device) - with torch.no_grad(): - output = model(input_ids=text_inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - return embedding.float().cpu().numpy().tolist() + del self._model + del self._processor + self._model = None + self._processor = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: model, processor, device = self._load_model() @@ -60,11 +46,25 @@ def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) return embedding.float().cpu().numpy().tolist() - def cleanup(self) -> None: + def get_text_embedding(self, text: str) -> List[float]: + model, processor, device = self._load_model() + messages = [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] + text_inputs = processor.apply_chat_template(messages, 
add_generation_prompt=True, tokenize=True, return_tensors='pt').to(device) + with torch.no_grad(): + output = model(input_ids=text_inputs, output_hidden_states=True) + embedding = output.hidden_states[-1][:, -1, :].squeeze(0) + embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) + return embedding.float().cpu().numpy().tolist() + + def _load_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: if self._model is not None: - del self._model - del self._processor - self._model = None - self._processor = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() + return (self._model, self._processor, self._device) + click.echo('Loading embedding model...', err=True) + if not torch.cuda.is_available(): + raise RuntimeError('CUDA is required but not available. This pipeline requires GPU.') + model_name = settings.embedding_model.model_name + self._device = 'cuda' + self._model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') + self._processor = AutoProcessor.from_pretrained(model_name) + click.echo(f'Model loaded on {self._device}', err=True) + return (self._model, self._processor, self._device) diff --git a/preprocessor/modules/search/clients/hash_service.py b/preprocessor/modules/search/clients/hash_service.py index 6c927019b..3ccc06ecb 100644 --- a/preprocessor/modules/search/clients/hash_service.py +++ b/preprocessor/modules/search/clients/hash_service.py @@ -16,15 +16,12 @@ class HashService: def __init__(self) -> None: self._hasher: Optional[PerceptualHasher] = None - def __load_hasher(self) -> PerceptualHasher: + def cleanup(self) -> None: if self._hasher is not None: - return self._hasher - click.echo('Loading perceptual hasher...', err=True) - if not torch.cuda.is_available(): - raise RuntimeError('CUDA is required but not available. 
This pipeline requires GPU.') - self._hasher = PerceptualHasher(device='cuda', hash_size=8) # pylint: disable=unexpected-keyword-arg - click.echo('Hasher loaded on cuda', err=True) - return self._hasher + del self._hasher + self._hasher = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() def get_perceptual_hash(self, image_path: Union[str, Path]) -> Optional[str]: hasher = self.__load_hasher() @@ -32,9 +29,12 @@ def get_perceptual_hash(self, image_path: Union[str, Path]) -> Optional[str]: hashes = hasher.compute_phash_batch([image]) # pylint: disable=no-member return hashes[0] if hashes else None - def cleanup(self) -> None: + def __load_hasher(self) -> PerceptualHasher: if self._hasher is not None: - del self._hasher - self._hasher = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() + return self._hasher + click.echo('Loading perceptual hasher...', err=True) + if not torch.cuda.is_available(): + raise RuntimeError('CUDA is required but not available. This pipeline requires GPU.') + self._hasher = PerceptualHasher(device='cuda', hash_size=8) # pylint: disable=unexpected-keyword-arg + click.echo('Hasher loaded on cuda', err=True) + return self._hasher diff --git a/preprocessor/modules/search/clients/result_formatters.py b/preprocessor/modules/search/clients/result_formatters.py index 440a847d7..ba5382ee7 100644 --- a/preprocessor/modules/search/clients/result_formatters.py +++ b/preprocessor/modules/search/clients/result_formatters.py @@ -22,12 +22,30 @@ def format_timestamp(seconds: float) -> str: return f'{minutes}m {secs:.1f}s' @staticmethod - def __format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: - if not scene_info: - return '' - start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0)) - end = ResultFormatter.format_timestamp(scene_info.get('scene_end_time', 0)) - return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" + def print_results(result: Dict[str, Any], result_type: 
str='text') -> None: + total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] + hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] + click.echo(f'\nZnaleziono: {total} wynikow') + click.echo('=' * 80) + for i, hit in enumerate(hits, 1): + source = hit[ElasticsearchKeys.SOURCE] + score = hit[ElasticsearchKeys.SCORE] + meta = source[EpisodeMetadataKeys.EPISODE_METADATA] + scene_ctx = ResultFormatter.__format_scene_context(source.get('scene_info')) + click.echo(f'\n[{i}] Score: {score:.2f}') + season_code = 'S00' if meta['season'] == 0 else f"S{meta['season']:02d}" + click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") + if result_type == 'text': + ResultFormatter.__print_text_result(source, scene_ctx) + elif result_type == 'text_semantic': + click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") + click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") + click.echo(f"Text: {source['text']}") + elif result_type == 'episode_name': + click.echo(f"Episode Title: {source.get('title', 'N/A')}") + else: + ResultFormatter.__print_video_result(source, scene_ctx) + click.echo(f"Path: {source['video_path']}") @staticmethod def __format_character_appearances(appearances: list) -> str: @@ -45,6 +63,14 @@ def __format_character_appearances(appearances: list) -> str: def __format_detected_objects(objects: list) -> str: return ', '.join([f"{obj['class']}:{obj['count']}" for obj in objects]) + @staticmethod + def __format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: + if not scene_info: + return '' + start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0)) + end = ResultFormatter.format_timestamp(scene_info.get('scene_end_time', 0)) + return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" + @staticmethod def __print_text_result(source: Dict[str, Any], scene_ctx: str) -> None: 
click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") @@ -70,29 +96,3 @@ def __print_video_result(source: Dict[str, Any], scene_ctx: str) -> None: if source.get('detected_objects'): objects = ResultFormatter.__format_detected_objects(source['detected_objects']) click.echo(f'Objects: {objects}') - - @staticmethod - def print_results(result: Dict[str, Any], result_type: str='text') -> None: - total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] - hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] - click.echo(f'\nZnaleziono: {total} wynikow') - click.echo('=' * 80) - for i, hit in enumerate(hits, 1): - source = hit[ElasticsearchKeys.SOURCE] - score = hit[ElasticsearchKeys.SCORE] - meta = source[EpisodeMetadataKeys.EPISODE_METADATA] - scene_ctx = ResultFormatter.__format_scene_context(source.get('scene_info')) - click.echo(f'\n[{i}] Score: {score:.2f}') - season_code = 'S00' if meta['season'] == 0 else f"S{meta['season']:02d}" - click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") - if result_type == 'text': - ResultFormatter.__print_text_result(source, scene_ctx) - elif result_type == 'text_semantic': - click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") - click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == 'episode_name': - click.echo(f"Episode Title: {source.get('title', 'N/A')}") - else: - ResultFormatter.__print_video_result(source, scene_ctx) - click.echo(f"Path: {source['video_path']}") diff --git a/preprocessor/modules/search/document_generation.py b/preprocessor/modules/search/document_generation.py index d472ad13f..05e042b2a 100644 --- a/preprocessor/modules/search/document_generation.py +++ b/preprocessor/modules/search/document_generation.py @@ -17,10 +17,6 @@ class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, 
DocumentGenerationConfig]): - @property - def name(self) -> str: - return 'document_generation' - def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDocuments: if not hasattr(input_data, 'episode_info'): raise ValueError('Input artifact must have episode_info') @@ -42,6 +38,14 @@ def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDoc context.mark_step_completed(self.name, episode_id) return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=total_docs) + @property + def name(self) -> str: + return 'document_generation' + + @staticmethod + def __build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: + return {'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name} + @staticmethod def __gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: data = {} @@ -82,7 +86,3 @@ def __generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, con f.write(json.dumps(doc, ensure_ascii=False) + '\n') count += 1 return (output_path, count) - - @staticmethod - def __build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: - return {'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name} diff --git a/preprocessor/modules/search/indexing.py b/preprocessor/modules/search/indexing.py index 741277ba8..43083d9f1 100644 --- a/preprocessor/modules/search/indexing.py +++ b/preprocessor/modules/search/indexing.py @@ -24,13 +24,18 @@ def __init__(self, config: ElasticsearchConfig) -> None: super().__init__(config) self._es: Optional[ElasticsearchWrapper] = None - @property - def name(self) -> str: - return 'elasticsearch_indexing' + def cleanup(self) -> None: + if self._es: + asyncio.run(self._es.close()) + self._es = None def execute(self, input_data: List[ElasticDocuments], 
context: ExecutionContext) -> IndexingResult: return asyncio.run(self._execute_async(input_data, context)) + @property + def name(self) -> str: + return 'elasticsearch_indexing' + async def _execute_async( self, input_data: List[ElasticDocuments], @@ -107,8 +112,3 @@ def __get_mapping_for_type( doc_type: str, # pylint: disable=unused-argument ) -> Optional[Dict[str, Any]]: return None - - def cleanup(self) -> None: - if self._es: - asyncio.run(self._es.close()) - self._es = None diff --git a/preprocessor/modules/text/analysis.py b/preprocessor/modules/text/analysis.py index 88a28a941..923ef4758 100644 --- a/preprocessor/modules/text/analysis.py +++ b/preprocessor/modules/text/analysis.py @@ -16,10 +16,6 @@ class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): - @property - def name(self) -> str: - return 'text_analysis' - def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> TextAnalysisResults: output_filename = input_data.path.stem + '_text_stats.json' output_path = input_data.path.parent / output_filename @@ -48,3 +44,7 @@ def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> T atomic_write_json(output_path, result_data) context.mark_step_completed(self.name, input_data.episode_id) return TextAnalysisResults(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path, statistics=result_data) + + @property + def name(self) -> str: + return 'text_analysis' diff --git a/preprocessor/modules/text/embeddings.py b/preprocessor/modules/text/embeddings.py index c7462c0c6..12ca67d5a 100644 --- a/preprocessor/modules/text/embeddings.py +++ b/preprocessor/modules/text/embeddings.py @@ -28,24 +28,9 @@ def __init__(self, config: TextEmbeddingConfig) -> None: super().__init__(config) self._model: Optional[EmbeddingModelWrapper] = None - @property - def name(self) -> str: - return 'text_embedding' - - def _create_embedding_collection( # pylint: 
disable=duplicate-code - self, - input_data: TranscriptionData, - output_path: Path, - embedding_count: int, - ) -> EmbeddingCollection: - return MetadataBuilder.create_embedding_collection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - model_name=self.config.model_name, - embedding_count=embedding_count, - embedding_type='text', - ) + def cleanup(self) -> None: + if self._model: + self._model = None def execute( # pylint: disable=too-many-locals self, @@ -122,6 +107,35 @@ def execute( # pylint: disable=too-many-locals context.mark_step_completed(self.name, input_data.episode_id) return self._create_embedding_collection(input_data, output_path, len(results)) + @property + def name(self) -> str: + return 'text_embedding' + + def _create_embedding_collection( # pylint: disable=duplicate-code + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='text', + ) + + @staticmethod + def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: + cumulative_length: int = 0 + for idx, seg in enumerate(segments): + seg_length: int = len(seg.get('text', '')) + 1 + if cumulative_length <= char_pos < cumulative_length + seg_length: + return idx + cumulative_length += seg_length + return len(segments) - 1 if segments else 0 + @staticmethod def __load_clean_transcription( input_data: TranscriptionData, @@ -148,17 +162,3 @@ def __split_into_sentences(text: str) -> List[str]: if len(sentences) % 2 == 1 and sentences[-1].strip(): result.append(sentences[-1].strip()) return result - - @staticmethod - def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: - cumulative_length: int = 0 - 
for idx, seg in enumerate(segments): - seg_length: int = len(seg.get('text', '')) + 1 - if cumulative_length <= char_pos < cumulative_length + seg_length: - return idx - cumulative_length += seg_length - return len(segments) - 1 if segments else 0 - - def cleanup(self) -> None: - if self._model: - self._model = None diff --git a/preprocessor/modules/text/import_step.py b/preprocessor/modules/text/import_step.py index 45aee8fcc..2577ed90f 100644 --- a/preprocessor/modules/text/import_step.py +++ b/preprocessor/modules/text/import_step.py @@ -25,10 +25,6 @@ def __init__(self, config: TranscriptionImportConfig) -> None: super().__init__(config) self._episode_manager: Optional[EpisodeManager] = None - @property - def name(self) -> str: - return 'transcription_import' - def execute(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: if self._episode_manager is None: self._episode_manager = EpisodeManager(None, context.series_name, context.logger) @@ -47,6 +43,70 @@ def execute(self, input_data: None, context: ExecutionContext) -> List[Transcrip context.logger.error(f'Failed to import {json_file.name}: {e}') return results + @property + def name(self) -> str: + return 'transcription_import' + + @staticmethod + def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + segments: List[Dict[str, Any]] = [] + words: List[Dict[str, Any]] = data.get('words', []) + current_segment: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} + for word in words: + if current_segment['start'] is None: + current_segment['start'] = word.get('start') + current_segment['words'].append(word) + current_segment['end'] = word.get('end') + if word.get('text', '').endswith(('.', '!', '?')) or len(current_segment['words']) >= 20: + current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) + segments.append(dict(current_segment)) + current_segment = {'words': [], 'start': 
None, 'end': None, 'text': '', 'speaker': word.get('speaker_id', 'unknown')} + if current_segment['words']: + current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) + segments.append(current_segment) + for i, seg in enumerate(segments): + seg['id'] = i + return { + 'transcription': { + 'format': '11labs', + 'source_file': source_file.name, + 'language_code': data.get('language_code', 'pol'), + 'language_probability': data.get('language_probability', 1.0), + }, + 'segments': segments, + } + + @staticmethod + def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + segments: List[Dict[str, Any]] = [] + for i, segment in enumerate(data.get('segments', [])): + converted_segment: Dict[str, Any] = { + 'id': i, + 'start': segment.get('start'), + 'end': segment.get('end'), + 'text': segment.get('text', ''), + 'speaker': segment.get('speaker', 'unknown'), + 'words': segment.get('words', []), + } + segments.append(converted_segment) + return { + 'transcription': {'format': '11labs_segmented', 'source_file': source_file.name, 'segments': segments}, + 'segments': segments, + } + + @staticmethod + def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: + match: Optional[re.Match] = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) + if match: + return (int(match.group(1)), int(match.group(2))) + parent_match: Optional[re.Match] = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) + if parent_match: + season: int = int(parent_match.group(1)) + episode_match: Optional[re.Match] = re.search('E(\\d+)', file_path.name, re.IGNORECASE) + if episode_match: + return (season, int(episode_match.group(1))) + return (1, 1) + def __find_transcription_files(self) -> List[Path]: pattern: str = '*.json' if self.config.format_type == '11labs_segmented': @@ -93,63 +153,3 @@ def __import_single_file(self, json_file: Path, context: ExecutionContext) -> Op 
model=converted_data.get('transcription', {}).get('format', '11labs'), format='json', ) - - @staticmethod - def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments: List[Dict[str, Any]] = [] - for i, segment in enumerate(data.get('segments', [])): - converted_segment: Dict[str, Any] = { - 'id': i, - 'start': segment.get('start'), - 'end': segment.get('end'), - 'text': segment.get('text', ''), - 'speaker': segment.get('speaker', 'unknown'), - 'words': segment.get('words', []), - } - segments.append(converted_segment) - return { - 'transcription': {'format': '11labs_segmented', 'source_file': source_file.name, 'segments': segments}, - 'segments': segments, - } - - @staticmethod - def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments: List[Dict[str, Any]] = [] - words: List[Dict[str, Any]] = data.get('words', []) - current_segment: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} - for word in words: - if current_segment['start'] is None: - current_segment['start'] = word.get('start') - current_segment['words'].append(word) - current_segment['end'] = word.get('end') - if word.get('text', '').endswith(('.', '!', '?')) or len(current_segment['words']) >= 20: - current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) - segments.append(dict(current_segment)) - current_segment = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': word.get('speaker_id', 'unknown')} - if current_segment['words']: - current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) - segments.append(current_segment) - for i, seg in enumerate(segments): - seg['id'] = i - return { - 'transcription': { - 'format': '11labs', - 'source_file': source_file.name, - 'language_code': data.get('language_code', 'pol'), - 'language_probability': data.get('language_probability', 1.0), - }, - 'segments': 
segments, - } - - @staticmethod - def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: - match: Optional[re.Match] = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) - if match: - return (int(match.group(1)), int(match.group(2))) - parent_match: Optional[re.Match] = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) - if parent_match: - season: int = int(parent_match.group(1)) - episode_match: Optional[re.Match] = re.search('E(\\d+)', file_path.name, re.IGNORECASE) - if episode_match: - return (season, int(episode_match.group(1))) - return (1, 1) diff --git a/preprocessor/modules/text/transcription.py b/preprocessor/modules/text/transcription.py index ed2a976ed..fc75e0510 100644 --- a/preprocessor/modules/text/transcription.py +++ b/preprocessor/modules/text/transcription.py @@ -23,9 +23,10 @@ def __init__(self, config: WhisperTranscriptionConfig) -> None: super().__init__(config) self._whisper: Optional[Whisper] = None - @property - def name(self) -> str: - return 'transcription' + def cleanup(self) -> None: + if self._whisper: + self._whisper.cleanup() + self._whisper = None def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData: output_filename: str = ( @@ -84,7 +85,6 @@ def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> Trans format='json', ) - def cleanup(self) -> None: - if self._whisper: - self._whisper.cleanup() - self._whisper = None + @property + def name(self) -> str: + return 'transcription' diff --git a/preprocessor/modules/validation/episode_stats.py b/preprocessor/modules/validation/episode_stats.py index 59fef7be9..a10b58570 100644 --- a/preprocessor/modules/validation/episode_stats.py +++ b/preprocessor/modules/validation/episode_stats.py @@ -32,26 +32,26 @@ class EpisodeStats(ValidationStatusMixin): # pylint: disable=too-many-instance-attributes episode_info: EpisodeInfo series_name: str + character_visualizations_count: Optional[int] = None 
errors: List[str] = field(default_factory=list) - warnings: List[str] = field(default_factory=list) - transcription_chars: Optional[int] = None - transcription_duration: Optional[float] = None - transcription_words: Optional[int] = None + exported_frames_avg_resolution: Optional[Tuple[int, int]] = None exported_frames_count: Optional[int] = None exported_frames_total_size_mb: Optional[float] = None - exported_frames_avg_resolution: Optional[Tuple[int, int]] = None - video_size_mb: Optional[float] = None - video_duration: Optional[float] = None - video_codec: Optional[str] = None - video_resolution: Optional[Tuple[int, int]] = None - scenes_count: Optional[int] = None - scenes_avg_duration: Optional[float] = None + face_clusters_count: Optional[int] = None + face_clusters_total_faces: Optional[int] = None image_hashes_count: Optional[int] = None object_detections_count: Optional[int] = None object_visualizations_count: Optional[int] = None - character_visualizations_count: Optional[int] = None - face_clusters_count: Optional[int] = None - face_clusters_total_faces: Optional[int] = None + scenes_avg_duration: Optional[float] = None + scenes_count: Optional[int] = None + transcription_chars: Optional[int] = None + transcription_duration: Optional[float] = None + transcription_words: Optional[int] = None + video_codec: Optional[str] = None + video_duration: Optional[float] = None + video_resolution: Optional[Tuple[int, int]] = None + video_size_mb: Optional[float] = None + warnings: List[str] = field(default_factory=list) def collect_stats(self): self.__validate_transcription() @@ -65,42 +65,49 @@ def collect_stats(self): self.__validate_object_visualizations() self.__validate_other_files() - def __validate_transcription(self): - transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.transcriptions) - base_name = f'{self.series_name}_{self.episode_info.episode_code()}' - raw_dir = transcriptions_dir / 
settings.output_subdirs.transcription_subdirs.raw - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events - transcription_files = { - 'main': raw_dir / f'{base_name}.json', - 'segmented': raw_dir / f'{base_name}_segmented.json', - 'simple': raw_dir / f'{base_name}_simple.json', - 'clean': clean_dir / f'{base_name}_clean_transcription.json', - 'clean_txt': clean_dir / f'{base_name}_clean_transcription.txt', - 'sound_events': sound_events_dir / f'{base_name}_sound_events.json', + def to_dict(self) -> Dict[str, Any]: + return { + 'status': self.status, + 'errors': self.errors, + 'warnings': self.warnings, + 'stats': { + 'transcription_chars': self.transcription_chars, + 'transcription_duration': self.transcription_duration, + 'transcription_words': self.transcription_words, + 'exported_frames_count': self.exported_frames_count, + 'exported_frames_total_size_mb': self.exported_frames_total_size_mb, + 'exported_frames_avg_resolution': self.exported_frames_avg_resolution, + 'video_size_mb': self.video_size_mb, + 'video_duration': self.video_duration, + 'video_codec': self.video_codec, + 'video_resolution': self.video_resolution, + 'scenes_count': self.scenes_count, + 'scenes_avg_duration': self.scenes_avg_duration, + 'image_hashes_count': self.image_hashes_count, + 'character_visualizations_count': self.character_visualizations_count, + 'face_clusters_count': self.face_clusters_count, + 'face_clusters_total_faces': self.face_clusters_total_faces, + 'object_detections_count': self.object_detections_count, + 'object_visualizations_count': self.object_visualizations_count, + }, } - if not any((f.exists() for f in transcription_files.values())): - self.errors.append('No transcription files found in any format') - return - self.__validate_raw_transcription(transcription_files) - self.__validate_clean_transcription(transcription_files['clean']) - 
self.__validate_clean_txt(transcription_files['clean_txt']) - self.__validate_sound_events(transcription_files['sound_events']) - def __validate_raw_transcription(self, transcription_files: Dict[str, Path]): - raw_transcription = None - for key in ('main', 'segmented', 'simple'): - if transcription_files[key].exists(): - raw_transcription = transcription_files[key] - break - if not raw_transcription: - self.warnings.append('Missing raw transcription file (checked: .json, _segmented.json, _simple.json)') + def __check_size_anomalies( + self, sizes: List[int], folder_name: str, threshold: float = 0.2, + ): + if len(sizes) < 2: return - result = FileValidator.validate_json_file(raw_transcription) - if not result.is_valid: - self.errors.append(f'Invalid transcription JSON: {result.error_message}') + avg_size = sum(sizes) / len(sizes) + if avg_size == 0: return - self.__extract_transcription_stats(raw_transcription) + for i, size in enumerate(sizes): + deviation = abs(size - avg_size) / avg_size + if deviation > threshold: + warning_msg = ( + f'{folder_name} file #{i + 1} size deviation: ' + f'{deviation * 100:.1f}% from average' + ) + self.warnings.append(warning_msg) def __extract_transcription_stats(self, raw_transcription: Path): data = self.__load_json_safely(raw_transcription) @@ -122,6 +129,17 @@ def __extract_transcription_stats(self, raw_transcription: Path): if segments and segments[-1].get('end'): self.transcription_duration = segments[-1].get('end', 0.0) + @staticmethod + def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None + + def __validate_character_visualizations(self): + self.__validate_visualizations(settings.output_subdirs.character_visualizations, 'character_visualizations_count', 'character visualization') + def __validate_clean_transcription(self, clean_transcription_file): if not clean_transcription_file.exists(): 
self.warnings.append(f'Missing clean transcription file: {clean_transcription_file.name}') @@ -134,13 +152,38 @@ def __validate_clean_txt(self, clean_txt_file): if not clean_txt_file.exists(): self.warnings.append(f'Missing clean transcription txt: {clean_txt_file.name}') - def __validate_sound_events(self, sound_events_file): - if not sound_events_file.exists(): - self.warnings.append(f'Missing sound events file: {sound_events_file.name}') + def __validate_embedding_dimensions(self, jsonl_file, subdir: str): + embedding_fields = { + ELASTIC_SUBDIRS.text_embeddings: 'text_embedding', + ELASTIC_SUBDIRS.video_frames: 'video_embedding', + ELASTIC_SUBDIRS.episode_names: 'title_embedding', + ELASTIC_SUBDIRS.full_episode_embeddings: 'full_episode_embedding', + ELASTIC_SUBDIRS.sound_event_embeddings: 'sound_event_embedding', + } + if subdir not in embedding_fields: return - result = FileValidator.validate_json_file(sound_events_file) - if not result.is_valid: - self.warnings.append(f'Invalid sound events JSON: {result.error_message}') + embedding_field = embedding_fields[subdir] + expected_dim = settings.embedding_model.embedding_dim + try: + with open(jsonl_file, 'r', encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if not line.strip(): + continue + doc = json.loads(line) + if embedding_field in doc: + embedding = doc[embedding_field] + if isinstance(embedding, list): + actual_dim = len(embedding) + if actual_dim != expected_dim: + error_msg = ( + f'{jsonl_file.name} line {line_num}: ' + f'{embedding_field} has {actual_dim} dimensions, ' + f'expected {expected_dim}' + ) + self.errors.append(error_msg) + return + except Exception as e: + self.errors.append(f'Error validating embeddings in {jsonl_file.name}: {e}') def __validate_exported_frames(self): frames_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.frames) @@ -170,82 +213,6 @@ def __validate_exported_frames(self): most_common_res = 
max(set(resolutions), key=resolutions.count) self.exported_frames_avg_resolution = most_common_res - def __validate_video(self): - filename = f'{self.series_name.lower()}_{self.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' - season_dir = get_base_output_dir(self.series_name) / settings.output_subdirs.video / self.episode_info.season_code() - video_file = season_dir / filename - if not video_file.exists(): - self.warnings.append(f'Missing video file: {video_file}') - return - result = FileValidator.validate_video_file(video_file) - if not result.is_valid: - self.errors.append(f'Invalid video: {result.error_message}') - return - self.video_size_mb = result.metadata['size_mb'] - self.video_duration = result.metadata['duration'] - self.video_codec = result.metadata['codec'] - self.video_resolution = (result.metadata['width'], result.metadata['height']) - - def __validate_scenes(self): - scenes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.scenes) - scenes_file = scenes_dir / f"{self.series_name}_{self.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" - if not scenes_file.exists(): - self.errors.append(f'Missing scenes file: {scenes_file}') - return - result = FileValidator.validate_json_file(scenes_file) - if not result.is_valid: - self.errors.append(f'Invalid scenes JSON: {result.error_message}') - return - data = self.__load_json_safely(scenes_file) - if not data: - self.errors.append(f'Error reading scenes: {scenes_file}') - return - self.scenes_count = data.get('total_scenes', 0) - scenes = data.get('scenes', []) - if scenes: - durations = [scene.get('duration', 0) for scene in scenes] - self.scenes_avg_duration = round(sum(durations) / len(durations), 2) - - def __validate_json_directory( - self, - subdir: str, - count_attr: Optional[str], - context_name: str, - exclude_pattern: Optional[str] = None, - check_anomalies: bool = True, - ): - dir_path = 
PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) - count, sizes, errors = self.__validate_json_files_in_directory(dir_path, exclude_pattern) - if not dir_path.exists(): - self.warnings.append(f'Missing {subdir} directory') - return - if count == 0: - self.warnings.append(f'No JSON files in {subdir}/') - return - if count_attr: - setattr(self, count_attr, count) - self.errors.extend(errors) - if check_anomalies: - self.__check_size_anomalies(sizes, context_name) - - def __validate_image_hashes(self): - self.__validate_json_directory(settings.output_subdirs.image_hashes, 'image_hashes_count', 'image_hashes') - - def __validate_visualizations(self, subdir: str, count_attr: str, context_name: str): - viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) - total_count, invalid_count, errors = self.__validate_images_in_directory(viz_dir) - if total_count == 0 and viz_dir.exists(): - self.warnings.append(f'No visualization images in {subdir}/') - return - if total_count > 0: - setattr(self, count_attr, total_count) - self.errors.extend(errors) - if invalid_count > 0: - self.warnings.append(f'{invalid_count} invalid {context_name} images found') - - def __validate_character_visualizations(self): - self.__validate_visualizations(settings.output_subdirs.character_visualizations, 'character_visualizations_count', 'character visualization') - def __validate_face_clusters(self): clusters_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.face_clusters) if not clusters_dir.exists(): @@ -278,6 +245,74 @@ def __validate_face_clusters(self): total_faces += noise_info.get('face_count', 0) self.face_clusters_total_faces = total_faces + def __validate_image_hashes(self): + self.__validate_json_directory(settings.output_subdirs.image_hashes, 'image_hashes_count', 'image_hashes') + + @staticmethod + def __validate_images_in_directory( + directory: Path, + extensions: Tuple[str, ...] 
= ('*.jpg', '*.png'), + ) -> Tuple[int, int, List[str]]: + if not directory.exists(): + return (0, 0, []) + image_files = [] + for ext in extensions: + image_files.extend(directory.glob(ext)) + if not image_files: + return (0, 0, []) + invalid_count = 0 + errors = [] + for img_file in image_files: + result = FileValidator.validate_image_file(img_file) + if not result.is_valid: + invalid_count += 1 + errors.append(f'Invalid image {img_file.name}: {result.error_message}') + return (len(image_files), invalid_count, errors) + + def __validate_json_directory( + self, + subdir: str, + count_attr: Optional[str], + context_name: str, + exclude_pattern: Optional[str] = None, + check_anomalies: bool = True, + ): + dir_path = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) + count, sizes, errors = self.__validate_json_files_in_directory(dir_path, exclude_pattern) + if not dir_path.exists(): + self.warnings.append(f'Missing {subdir} directory') + return + if count == 0: + self.warnings.append(f'No JSON files in {subdir}/') + return + if count_attr: + setattr(self, count_attr, count) + self.errors.extend(errors) + if check_anomalies: + self.__check_size_anomalies(sizes, context_name) + + @staticmethod + def __validate_json_files_in_directory( + directory: Path, exclude_pattern: Optional[str] = None, + ) -> Tuple[int, List[int], List[str]]: + if not directory.exists(): + return (0, [], []) + json_files = [ + f for f in directory.glob('*.json') + if not exclude_pattern or exclude_pattern not in str(f) + ] + if not json_files: + return (0, [], []) + sizes = [] + errors = [] + for json_file in json_files: + result = FileValidator.validate_json_file(json_file) + if not result.is_valid: + errors.append(f'Invalid JSON {json_file.name}: {result.error_message}') + else: + sizes.append(json_file.stat().st_size) + return (len(json_files), sizes, errors) + def __validate_object_detections(self): self.__validate_json_directory( 
settings.output_subdirs.object_detections, @@ -289,56 +324,6 @@ def __validate_object_detections(self): def __validate_object_visualizations(self): self.__validate_visualizations(settings.output_subdirs.object_visualizations, 'object_visualizations_count', 'visualization') - def __validate_embedding_dimensions(self, jsonl_file, subdir: str): - embedding_fields = { - ELASTIC_SUBDIRS.text_embeddings: 'text_embedding', - ELASTIC_SUBDIRS.video_frames: 'video_embedding', - ELASTIC_SUBDIRS.episode_names: 'title_embedding', - ELASTIC_SUBDIRS.full_episode_embeddings: 'full_episode_embedding', - ELASTIC_SUBDIRS.sound_event_embeddings: 'sound_event_embedding', - } - if subdir not in embedding_fields: - return - embedding_field = embedding_fields[subdir] - expected_dim = settings.embedding_model.embedding_dim - try: - with open(jsonl_file, 'r', encoding='utf-8') as f: - for line_num, line in enumerate(f, 1): - if not line.strip(): - continue - doc = json.loads(line) - if embedding_field in doc: - embedding = doc[embedding_field] - if isinstance(embedding, list): - actual_dim = len(embedding) - if actual_dim != expected_dim: - error_msg = ( - f'{jsonl_file.name} line {line_num}: ' - f'{embedding_field} has {actual_dim} dimensions, ' - f'expected {expected_dim}' - ) - self.errors.append(error_msg) - return - except Exception as e: - self.errors.append(f'Error validating embeddings in {jsonl_file.name}: {e}') - - def __check_size_anomalies( - self, sizes: List[int], folder_name: str, threshold: float = 0.2, - ): - if len(sizes) < 2: - return - avg_size = sum(sizes) / len(sizes) - if avg_size == 0: - return - for i, size in enumerate(sizes): - deviation = abs(size - avg_size) / avg_size - if deviation > threshold: - warning_msg = ( - f'{folder_name} file #{i + 1} size deviation: ' - f'{deviation * 100:.1f}% from average' - ) - self.warnings.append(warning_msg) - def __validate_other_files(self): char_detections_dir = 
PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.character_detections) detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] @@ -392,80 +377,95 @@ def __validate_other_files(self): else: self.warnings.append(f'Missing text statistics file: {text_stats_file.name}') - def to_dict(self) -> Dict[str, Any]: - return { - 'status': self.status, - 'errors': self.errors, - 'warnings': self.warnings, - 'stats': { - 'transcription_chars': self.transcription_chars, - 'transcription_duration': self.transcription_duration, - 'transcription_words': self.transcription_words, - 'exported_frames_count': self.exported_frames_count, - 'exported_frames_total_size_mb': self.exported_frames_total_size_mb, - 'exported_frames_avg_resolution': self.exported_frames_avg_resolution, - 'video_size_mb': self.video_size_mb, - 'video_duration': self.video_duration, - 'video_codec': self.video_codec, - 'video_resolution': self.video_resolution, - 'scenes_count': self.scenes_count, - 'scenes_avg_duration': self.scenes_avg_duration, - 'image_hashes_count': self.image_hashes_count, - 'character_visualizations_count': self.character_visualizations_count, - 'face_clusters_count': self.face_clusters_count, - 'face_clusters_total_faces': self.face_clusters_total_faces, - 'object_detections_count': self.object_detections_count, - 'object_visualizations_count': self.object_visualizations_count, - }, - } + def __validate_raw_transcription(self, transcription_files: Dict[str, Path]): + raw_transcription = None + for key in ('main', 'segmented', 'simple'): + if transcription_files[key].exists(): + raw_transcription = transcription_files[key] + break + if not raw_transcription: + self.warnings.append('Missing raw transcription file (checked: .json, _segmented.json, _simple.json)') + return + result = FileValidator.validate_json_file(raw_transcription) + if not result.is_valid: + self.errors.append(f'Invalid transcription JSON: {result.error_message}') + 
return + self.__extract_transcription_stats(raw_transcription) - @staticmethod - def __validate_images_in_directory( - directory: Path, - extensions: Tuple[str, ...] = ('*.jpg', '*.png'), - ) -> Tuple[int, int, List[str]]: - if not directory.exists(): - return (0, 0, []) - image_files = [] - for ext in extensions: - image_files.extend(directory.glob(ext)) - if not image_files: - return (0, 0, []) - invalid_count = 0 - errors = [] - for img_file in image_files: - result = FileValidator.validate_image_file(img_file) - if not result.is_valid: - invalid_count += 1 - errors.append(f'Invalid image {img_file.name}: {result.error_message}') - return (len(image_files), invalid_count, errors) + def __validate_scenes(self): + scenes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.scenes) + scenes_file = scenes_dir / f"{self.series_name}_{self.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" + if not scenes_file.exists(): + self.errors.append(f'Missing scenes file: {scenes_file}') + return + result = FileValidator.validate_json_file(scenes_file) + if not result.is_valid: + self.errors.append(f'Invalid scenes JSON: {result.error_message}') + return + data = self.__load_json_safely(scenes_file) + if not data: + self.errors.append(f'Error reading scenes: {scenes_file}') + return + self.scenes_count = data.get('total_scenes', 0) + scenes = data.get('scenes', []) + if scenes: + durations = [scene.get('duration', 0) for scene in scenes] + self.scenes_avg_duration = round(sum(durations) / len(durations), 2) - @staticmethod - def __validate_json_files_in_directory( - directory: Path, exclude_pattern: Optional[str] = None, - ) -> Tuple[int, List[int], List[str]]: - if not directory.exists(): - return (0, [], []) - json_files = [ - f for f in directory.glob('*.json') - if not exclude_pattern or exclude_pattern not in str(f) - ] - if not json_files: - return (0, [], []) - sizes = [] - errors = [] - for json_file in 
json_files: - result = FileValidator.validate_json_file(json_file) - if not result.is_valid: - errors.append(f'Invalid JSON {json_file.name}: {result.error_message}') - else: - sizes.append(json_file.stat().st_size) - return (len(json_files), sizes, errors) + def __validate_sound_events(self, sound_events_file): + if not sound_events_file.exists(): + self.warnings.append(f'Missing sound events file: {sound_events_file.name}') + return + result = FileValidator.validate_json_file(sound_events_file) + if not result.is_valid: + self.warnings.append(f'Invalid sound events JSON: {result.error_message}') - @staticmethod - def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: - try: - with open(file_path, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return None + def __validate_transcription(self): + transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.transcriptions) + base_name = f'{self.series_name}_{self.episode_info.episode_code()}' + raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw + clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean + sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events + transcription_files = { + 'main': raw_dir / f'{base_name}.json', + 'segmented': raw_dir / f'{base_name}_segmented.json', + 'simple': raw_dir / f'{base_name}_simple.json', + 'clean': clean_dir / f'{base_name}_clean_transcription.json', + 'clean_txt': clean_dir / f'{base_name}_clean_transcription.txt', + 'sound_events': sound_events_dir / f'{base_name}_sound_events.json', + } + if not any((f.exists() for f in transcription_files.values())): + self.errors.append('No transcription files found in any format') + return + self.__validate_raw_transcription(transcription_files) + self.__validate_clean_transcription(transcription_files['clean']) + 
self.__validate_clean_txt(transcription_files['clean_txt']) + self.__validate_sound_events(transcription_files['sound_events']) + + def __validate_video(self): + filename = f'{self.series_name.lower()}_{self.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' + season_dir = get_base_output_dir(self.series_name) / settings.output_subdirs.video / self.episode_info.season_code() + video_file = season_dir / filename + if not video_file.exists(): + self.warnings.append(f'Missing video file: {video_file}') + return + result = FileValidator.validate_video_file(video_file) + if not result.is_valid: + self.errors.append(f'Invalid video: {result.error_message}') + return + self.video_size_mb = result.metadata['size_mb'] + self.video_duration = result.metadata['duration'] + self.video_codec = result.metadata['codec'] + self.video_resolution = (result.metadata['width'], result.metadata['height']) + + def __validate_visualizations(self, subdir: str, count_attr: str, context_name: str): + viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) + total_count, invalid_count, errors = self.__validate_images_in_directory(viz_dir) + if total_count == 0 and viz_dir.exists(): + self.warnings.append(f'No visualization images in {subdir}/') + return + if total_count > 0: + setattr(self, count_attr, total_count) + self.errors.extend(errors) + if invalid_count > 0: + self.warnings.append(f'{invalid_count} invalid {context_name} images found') diff --git a/preprocessor/modules/validation/global_validator.py b/preprocessor/modules/validation/global_validator.py index 2db842626..fe8986eb5 100644 --- a/preprocessor/modules/validation/global_validator.py +++ b/preprocessor/modules/validation/global_validator.py @@ -21,21 +21,13 @@ def validate(self) -> GlobalValidationResult: self.__validate_processing_metadata() return self.result - def __validate_json_file(self, file_path: Path, stats_key: str): - if file_path.exists(): - result = 
FileValidator.validate_json_file(file_path) - if not result.is_valid: - self.result.errors.append(f'Invalid {file_path.name}: {result.error_message}') - else: - self.result.stats[stats_key] = True - else: - self.result.warnings.append(f'Missing {file_path.name}') - - def __validate_main_json_files(self): - episodes_file = self.base_output_dir / f'{self.series_name}_episodes.json' - self.__validate_json_file(episodes_file, 'episodes_json_valid') - characters_file = self.base_output_dir / f'{self.series_name}_characters.json' - self.__validate_json_file(characters_file, 'characters_json_valid') + @staticmethod + def __get_character_images(char_folder: Path) -> List[Path]: + extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp'] + image_files = [] + for ext in extensions: + image_files.extend(char_folder.glob(ext)) + return image_files def __validate_characters_folder(self): characters_dir = self.base_output_dir / 'characters' @@ -66,6 +58,22 @@ def __validate_characters_folder(self): if characters_without_images: self.result.warnings.append(f'{len(characters_without_images)} characters without reference images') + def __validate_json_file(self, file_path: Path, stats_key: str): + if file_path.exists(): + result = FileValidator.validate_json_file(file_path) + if not result.is_valid: + self.result.errors.append(f'Invalid {file_path.name}: {result.error_message}') + else: + self.result.stats[stats_key] = True + else: + self.result.warnings.append(f'Missing {file_path.name}') + + def __validate_main_json_files(self): + episodes_file = self.base_output_dir / f'{self.series_name}_episodes.json' + self.__validate_json_file(episodes_file, 'episodes_json_valid') + characters_file = self.base_output_dir / f'{self.series_name}_characters.json' + self.__validate_json_file(characters_file, 'characters_json_valid') + def __validate_processing_metadata(self): metadata_dir = self.base_output_dir / 'processing_metadata' if not metadata_dir.exists(): @@ -80,11 +88,3 @@ def 
__validate_processing_metadata(self): result = FileValidator.validate_json_file(json_file) if not result.is_valid: self.result.errors.append(f'Invalid processing metadata {json_file.name}: {result.error_message}') - - @staticmethod - def __get_character_images(char_folder: Path) -> List[Path]: - extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp'] - image_files = [] - for ext in extensions: - image_files.extend(char_folder.glob(ext)) - return image_files diff --git a/preprocessor/modules/validation/season_comparator.py b/preprocessor/modules/validation/season_comparator.py index 28d7c419a..985a48fe5 100644 --- a/preprocessor/modules/validation/season_comparator.py +++ b/preprocessor/modules/validation/season_comparator.py @@ -14,27 +14,27 @@ @dataclass class MetricComparison: - metric_name: str - min_value: Optional[float] - max_value: Optional[float] avg_value: Optional[float] difference_percent: Optional[float] + max_value: Optional[float] + metric_name: str + min_value: Optional[float] @dataclass class Anomaly: - episode: str - metric: str - value: float avg: float deviation_percent: float + episode: str + metric: str severity: str + value: float @dataclass class SeasonComparison: - season: str anomaly_threshold: float - metrics: Dict[str, MetricComparison] = field(default_factory=dict) + season: str anomalies: List[Anomaly] = field(default_factory=list) + metrics: Dict[str, MetricComparison] = field(default_factory=dict) def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]): metric_keys = [ @@ -50,6 +50,30 @@ def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]): for metric_key in metric_keys: self.__compare_metric(metric_key, episodes_stats) + def to_dict(self) -> Dict[str, Any]: + return { + 'metrics': { + metric_name: { + 'min': metric.min_value, + 'max': metric.max_value, + 'avg': metric.avg_value, + 'difference_percent': metric.difference_percent, + } + for metric_name, metric in self.metrics.items() + }, + 'anomalies': [ + { + 
'episode': anomaly.episode, + 'metric': anomaly.metric, + 'value': anomaly.value, + 'avg': anomaly.avg, + 'deviation_percent': anomaly.deviation_percent, + 'severity': anomaly.severity, + } + for anomaly in self.anomalies + ], + } + def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeStats]): values = [] episode_values = {} @@ -92,27 +116,3 @@ def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeSta severity=severity, ), ) - - def to_dict(self) -> Dict[str, Any]: - return { - 'metrics': { - metric_name: { - 'min': metric.min_value, - 'max': metric.max_value, - 'avg': metric.avg_value, - 'difference_percent': metric.difference_percent, - } - for metric_name, metric in self.metrics.items() - }, - 'anomalies': [ - { - 'episode': anomaly.episode, - 'metric': anomaly.metric, - 'value': anomaly.value, - 'avg': anomaly.avg, - 'deviation_percent': anomaly.deviation_percent, - 'severity': anomaly.severity, - } - for anomaly in self.anomalies - ], - } diff --git a/preprocessor/modules/video/frame_export.py b/preprocessor/modules/video/frame_export.py index 95d224d3c..60bc8f679 100644 --- a/preprocessor/modules/video/frame_export.py +++ b/preprocessor/modules/video/frame_export.py @@ -31,10 +31,6 @@ def __init__(self, config: FrameExportConfig): decord.bridge.set_bridge('native') self.strategy = KeyframeStrategyFactory.create(self.config.keyframe_strategy, self.config.frames_per_scene) - @property - def name(self) -> str: - return 'frame_export' - def execute(self, input_data: SceneCollection, context: ExecutionContext) -> FrameCollection: episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' @@ -87,21 +83,25 @@ def execute(self, input_data: SceneCollection, context: ExecutionContext) -> Fra metadata_path=metadata_file, ) - def __extract_frames( - self, - video_file: Path, - frame_requests: 
List[FrameRequest], - episode_dir: Path, - episode_info, - context: ExecutionContext, - ) -> None: - video_metadata = self.__get_video_metadata(video_file) - dar = self.__calculate_display_aspect_ratio(video_metadata) - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - for req in frame_requests: - frame_num = req['frame_number'] - self.__extract_and_save_frame(vr, frame_num, episode_dir, episode_info, dar, context.series_name) - del vr + @property + def name(self) -> str: + return 'frame_export' + + @staticmethod + def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: + width = metadata.get('width', 0) + height = metadata.get('height', 0) + if width == 0 or height == 0: + raise ValueError('Invalid video dimensions') + sar_str = metadata.get('sample_aspect_ratio', '1:1') + if sar_str == 'N/A' or not sar_str: + sar_str = '1:1' + try: + sar_num, sar_denom = [int(x) for x in sar_str.split(':')] + sar = sar_num / sar_denom if sar_denom != 0 else 1.0 + except (ValueError, ZeroDivisionError): + sar = 1.0 + return width / height * sar def __extract_and_save_frame( self, @@ -119,6 +119,22 @@ def __extract_and_save_frame( filename = f'{base_filename}_frame_{frame_num:06d}.jpg' resized.save(episode_dir / filename, quality=90) + def __extract_frames( + self, + video_file: Path, + frame_requests: List[FrameRequest], + episode_dir: Path, + episode_info, + context: ExecutionContext, + ) -> None: + video_metadata = self.__get_video_metadata(video_file) + dar = self.__calculate_display_aspect_ratio(video_metadata) + vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) + for req in frame_requests: + frame_num = req['frame_number'] + self.__extract_and_save_frame(vr, frame_num, episode_dir, episode_info, dar, context.series_name) + del vr + @staticmethod def __get_video_metadata(video_path: Path) -> Dict[str, Any]: cmd = [ @@ -133,22 +149,6 @@ def __get_video_metadata(video_path: Path) -> Dict[str, Any]: raise ValueError(f'No video streams found 
in {video_path}') return streams[0] - @staticmethod - def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: - width = metadata.get('width', 0) - height = metadata.get('height', 0) - if width == 0 or height == 0: - raise ValueError('Invalid video dimensions') - sar_str = metadata.get('sample_aspect_ratio', '1:1') - if sar_str == 'N/A' or not sar_str: - sar_str = '1:1' - try: - sar_num, sar_denom = [int(x) for x in sar_str.split(':')] - sar = sar_num / sar_denom if sar_denom != 0 else 1.0 - except (ValueError, ZeroDivisionError): - sar = 1.0 - return width / height * sar - def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: target_width = self.config.resolution.width target_height = self.config.resolution.height diff --git a/preprocessor/modules/video/scene_detection.py b/preprocessor/modules/video/scene_detection.py index 82368211b..c60570b48 100644 --- a/preprocessor/modules/video/scene_detection.py +++ b/preprocessor/modules/video/scene_detection.py @@ -19,9 +19,10 @@ def __init__(self, config: SceneDetectionConfig): self.transnet = TransNetWrapper() self._model_loaded = False - @property - def name(self) -> str: - return 'scene_detection' + def cleanup(self) -> None: + if self._model_loaded: + self.transnet.cleanup() + self._model_loaded = False def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> SceneCollection: output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' @@ -73,7 +74,6 @@ def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> Sce min_scene_len=self.config.min_scene_len, ) - def cleanup(self) -> None: - if self._model_loaded: - self.transnet.cleanup() - self._model_loaded = False + @property + def name(self) -> str: + return 'scene_detection' diff --git a/preprocessor/modules/video/transcoding.py b/preprocessor/modules/video/transcoding.py index f456f53e3..d14c641bc 100644 --- 
a/preprocessor/modules/video/transcoding.py +++ b/preprocessor/modules/video/transcoding.py @@ -10,10 +10,6 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): - @property - def name(self) -> str: - return 'video_transcode' - def execute( # pylint: disable=too-many-locals,too-many-statements self, input_data: SourceVideo, context: ExecutionContext, ) -> TranscodedVideo: @@ -134,3 +130,7 @@ def execute( # pylint: disable=too-many-locals,too-many-statements resolution=resolution_str, codec=self.config.codec, ) + + @property + def name(self) -> str: + return 'video_transcode' diff --git a/preprocessor/modules/vision/character_detection.py b/preprocessor/modules/vision/character_detection.py index 8f5eaa89b..3caa24e7f 100644 --- a/preprocessor/modules/vision/character_detection.py +++ b/preprocessor/modules/vision/character_detection.py @@ -29,9 +29,9 @@ def __init__(self, config: CharacterDetectionConfig) -> None: self._face_app = None self._character_vectors: Dict[str, np.ndarray] = {} - @property - def name(self) -> str: - return 'character_detection' + def cleanup(self) -> None: + self._face_app = None + self._character_vectors = {} def execute( self, input_data: FrameCollection, context: ExecutionContext, @@ -104,6 +104,10 @@ def execute( detection_count=len(results), ) + @property + def name(self) -> str: + return 'character_detection' + @staticmethod def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: counts: Dict[str, int] = {} @@ -112,7 +116,3 @@ def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: name: str = face.get('character_name', 'unknown') counts[name] = counts.get(name, 0) + 1 return counts - - def cleanup(self) -> None: - self._face_app = None - self._character_vectors = {} diff --git a/preprocessor/modules/vision/embeddings.py b/preprocessor/modules/vision/embeddings.py index 705d988b6..303436882 100644 --- a/preprocessor/modules/vision/embeddings.py +++ 
b/preprocessor/modules/vision/embeddings.py @@ -29,24 +29,10 @@ def __init__(self, config: VideoEmbeddingConfig) -> None: super().__init__(config) self._model: Optional[EmbeddingModelWrapper] = None - @property - def name(self) -> str: - return 'video_embedding' - - def _create_embedding_collection( # pylint: disable=duplicate-code - self, - input_data: FrameCollection, - output_path: Path, - embedding_count: int, - ) -> EmbeddingCollection: - return MetadataBuilder.create_embedding_collection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - model_name=self.config.model_name, - embedding_count=embedding_count, - embedding_type='video', - ) + def cleanup(self) -> None: + if self._model: + self._model.cleanup() # pylint: disable=no-member + self._model = None def execute( # pylint: disable=too-many-locals self, input_data: FrameCollection, context: ExecutionContext, @@ -106,6 +92,25 @@ def execute( # pylint: disable=too-many-locals context.mark_step_completed(self.name, input_data.episode_id) return self._create_embedding_collection(input_data, output_path, len(results)) + @property + def name(self) -> str: + return 'video_embedding' + + def _create_embedding_collection( # pylint: disable=duplicate-code + self, + input_data: FrameCollection, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='video', + ) + @staticmethod def __load_image_hashes( input_data: FrameCollection, context: ExecutionContext, @@ -121,8 +126,3 @@ def __load_image_hashes( except Exception as e: context.logger.warning(f'Could not load image hashes from {hash_path}: {e}') return {} - - def cleanup(self) -> None: - if self._model: - self._model.cleanup() # pylint: disable=no-member - 
self._model = None diff --git a/preprocessor/modules/vision/emotion_detection.py b/preprocessor/modules/vision/emotion_detection.py index 48a3d5bcb..8ec1bf245 100644 --- a/preprocessor/modules/vision/emotion_detection.py +++ b/preprocessor/modules/vision/emotion_detection.py @@ -11,10 +11,6 @@ class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]): - @property - def name(self) -> str: - return 'emotion_detection' - def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData: output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_emotions.json' output_path: Path = context.get_output_path(input_data.episode_info, 'emotion_detections', output_filename) @@ -26,3 +22,7 @@ def execute(self, input_data: FrameCollection, context: ExecutionContext) -> Emo context.mark_step_started(self.name, input_data.episode_id) context.mark_step_completed(self.name, input_data.episode_id) return EmotionData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + + @property + def name(self) -> str: + return 'emotion_detection' diff --git a/preprocessor/modules/vision/face_clustering.py b/preprocessor/modules/vision/face_clustering.py index 0f3ab7aab..1a55a11e5 100644 --- a/preprocessor/modules/vision/face_clustering.py +++ b/preprocessor/modules/vision/face_clustering.py @@ -11,10 +11,6 @@ class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): - @property - def name(self) -> str: - return 'face_clustering' - def execute(self, input_data: FrameCollection, context: ExecutionContext) -> ClusterData: output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_clusters.json' output_path: Path = context.get_output_path(input_data.episode_info, 'face_clusters', output_filename) @@ -26,3 +22,7 @@ def execute(self, input_data: FrameCollection, context: ExecutionContext) -> Clu 
context.mark_step_started(self.name, input_data.episode_id) context.mark_step_completed(self.name, input_data.episode_id) return ClusterData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + + @property + def name(self) -> str: + return 'face_clustering' diff --git a/preprocessor/modules/vision/image_hashing.py b/preprocessor/modules/vision/image_hashing.py index 4cce10dcb..0dea960f6 100644 --- a/preprocessor/modules/vision/image_hashing.py +++ b/preprocessor/modules/vision/image_hashing.py @@ -30,9 +30,9 @@ def __init__(self, config: ImageHashConfig) -> None: super().__init__(config) self._hasher: Optional[PerceptualHasher] = None - @property - def name(self) -> str: - return 'image_hashing' + def cleanup(self) -> None: + self._hasher = None + self.__cleanup_memory() def execute( # pylint: disable=too-many-locals self, input_data: FrameCollection, context: ExecutionContext, @@ -102,9 +102,9 @@ def execute( # pylint: disable=too-many-locals hash_count=len(hash_results), ) - def cleanup(self) -> None: - self._hasher = None - self.__cleanup_memory() + @property + def name(self) -> str: + return 'image_hashing' @staticmethod def __cleanup_memory() -> None: diff --git a/preprocessor/modules/vision/object_detection.py b/preprocessor/modules/vision/object_detection.py index 36ce94396..66cce8bb5 100644 --- a/preprocessor/modules/vision/object_detection.py +++ b/preprocessor/modules/vision/object_detection.py @@ -11,10 +11,6 @@ class ObjectDetectionStep(PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig]): - @property - def name(self) -> str: - return 'object_detection' - def execute(self, input_data: FrameCollection, context: ExecutionContext) -> ObjectDetectionData: output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_objects.json' output_path: Path = context.get_output_path(input_data.episode_info, 'object_detections', output_filename) @@ -26,3 +22,7 @@ def execute(self, 
input_data: FrameCollection, context: ExecutionContext) -> Obj context.mark_step_started(self.name, input_data.episode_id) context.mark_step_completed(self.name, input_data.episode_id) return ObjectDetectionData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + + @property + def name(self) -> str: + return 'object_detection' From 6e245e0ddf73f287c44d343cbe5a3b35dfb43ac2 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 18:37:36 +0100 Subject: [PATCH 17/89] Refactor BaseProcessor flow and defaults Simplify BaseProcessor by removing unused helpers and prints, and by adjusting defaults. Removed unused __get_processing_info and __get_temp_files methods and stopped printing processing info during resource loading. Default loglevel changed from logging.DEBUG to numeric 10 to avoid the logging import. When marking a step started, temp files are no longer collected and an empty list is passed to state_manager; several tuple-return syntaxes were simplified to plain returns. 
--- preprocessor/core/base_processor.py | 22 +++++-------------- preprocessor/lib/media/ffmpeg.py | 8 +++---- .../processors/episode_info_processor.py | 2 +- .../processors/sound_separator.py | 2 +- .../lib/validation/file_validators.py | 10 ++++----- preprocessor/lib/video/emotion_utils.py | 4 ++-- preprocessor/modules/audio/separation.py | 2 +- .../modules/scraping/episode_scraper.py | 6 ++--- .../search/clients/embedding_service.py | 4 ++-- .../modules/search/document_generation.py | 2 +- preprocessor/modules/text/import_step.py | 6 ++--- .../modules/validation/episode_stats.py | 12 +++++----- 12 files changed, 35 insertions(+), 45 deletions(-) diff --git a/preprocessor/core/base_processor.py b/preprocessor/core/base_processor.py index c8143f61f..b08604a7e 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/core/base_processor.py @@ -3,7 +3,6 @@ abstractmethod, ) from dataclasses import dataclass -import logging from pathlib import Path import re from typing import ( @@ -43,7 +42,7 @@ class BaseProcessor(ABC): REQUIRES: List[str] = [] SUPPORTED_VIDEO_EXTENSIONS = SUPPORTED_VIDEO_EXTENSIONS - def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, loglevel: int=logging.DEBUG) -> None: + def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, loglevel: int = 10) -> None: self._validate_args(args) self._args = args self.logger = ErrorHandlingLogger(class_name=class_name, loglevel=loglevel, error_exit_code=error_exit_code) @@ -137,8 +136,6 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: if not items: console.print('[yellow]No items to process, skipping resource loading[/yellow]') return - for info_line in self.__get_processing_info(): - console.print(info_line) if not self._load_resources(): return step_name = self.__get_step_name() @@ -148,8 +145,7 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: for item in items: try: if self.state_manager: - temp_files 
= self.__get_temp_files(item) - self.state_manager.mark_step_started(step_name, item.episode_id, temp_files) + self.state_manager.mark_step_started(step_name, item.episode_id, []) missing_outputs = item.metadata.get('missing_outputs', []) self._process_item(item, missing_outputs) if self.state_manager: @@ -162,24 +158,18 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: console.print('\n[yellow]Processing interrupted[/yellow]') raise - def __get_processing_info(self) -> List[str]: - return [] - def __get_step_name(self) -> str: class_name = self.__class__.__name__ name = class_name.replace('Processor', '').replace('Generator', '').replace('Detector', '') name = name.replace('Transcoder', '').replace('Importer', '').replace('Indexer', '') return self.__to_snake_case(name) - def __get_temp_files(self, item: ProcessingItem) -> List[str]: # pylint: disable=unused-argument - return [] - def __should_skip_item( self, item: ProcessingItem, ) -> Tuple[bool, List[OutputSpec], str]: expected_outputs = self._get_expected_outputs(item) if not expected_outputs: - return (False, [], '') + return False, [], '' missing_outputs = [ output for output in expected_outputs if not output.path.exists() or output.path.stat().st_size == 0 @@ -190,7 +180,7 @@ def __should_skip_item( and self.state_manager.is_step_completed(step_name, item.episode_id) ) if not missing_outputs and state_completed: - return (True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]') + return True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]' if not missing_outputs and (not state_completed): if self.state_manager: self.state_manager.mark_step_completed(step_name, item.episode_id) @@ -204,8 +194,8 @@ def __should_skip_item( f'[yellow]Warning: State marked complete but outputs missing ' f'for {item.episode_id}[/yellow]', ) - return (False, missing_outputs, '') - return (False, missing_outputs, '') + return False, missing_outputs, '' + return False, 
missing_outputs, '' @staticmethod def __to_snake_case(name: str) -> str: diff --git a/preprocessor/lib/media/ffmpeg.py b/preprocessor/lib/media/ffmpeg.py index 01f2259a4..a82405f52 100644 --- a/preprocessor/lib/media/ffmpeg.py +++ b/preprocessor/lib/media/ffmpeg.py @@ -54,22 +54,22 @@ def detect_interlacing( ) if result.returncode != 0: - return (False, None) + return False, None stats = FFmpegWrapper.__parse_idet_output(result.stderr) if stats is None: - return (False, None) + return False, None total_interlaced = stats['tff'] + stats['bff'] total_frames = total_interlaced + stats['progressive'] if total_frames == 0: - return (False, None) + return False, None ratio = total_interlaced / total_frames stats['ratio'] = ratio - return (ratio > threshold, stats) + return ratio > threshold, stats @staticmethod def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]: diff --git a/preprocessor/lib/transcription/processors/episode_info_processor.py b/preprocessor/lib/transcription/processors/episode_info_processor.py index 98ec39059..9be97a263 100644 --- a/preprocessor/lib/transcription/processors/episode_info_processor.py +++ b/preprocessor/lib/transcription/processors/episode_info_processor.py @@ -63,4 +63,4 @@ def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> T with output_path.open('w', encoding='utf-8') as f: json.dump(result, f, ensure_ascii=False, indent=4) self.__logger.info(f'Created episode info {output_path}.') - return (output_path, new_json_name) + return output_path, new_json_name diff --git a/preprocessor/lib/transcription/processors/sound_separator.py b/preprocessor/lib/transcription/processors/sound_separator.py index 448909fc8..d3fc1dc43 100644 --- a/preprocessor/lib/transcription/processors/sound_separator.py +++ b/preprocessor/lib/transcription/processors/sound_separator.py @@ -264,4 +264,4 @@ def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, current_words.append(word) if 
current_words: self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) - return (dialogue_sequences, sound_sequences) + return dialogue_sequences, sound_sequences diff --git a/preprocessor/lib/validation/file_validators.py b/preprocessor/lib/validation/file_validators.py index 206122042..75a9ea208 100644 --- a/preprocessor/lib/validation/file_validators.py +++ b/preprocessor/lib/validation/file_validators.py @@ -29,7 +29,7 @@ class FileValidator: @staticmethod def validate_image_file(path: Path) -> ValidationResult: - if (error := FileValidator.__check_file_exists(path)): + if error := FileValidator.__check_file_exists(path): return error try: with Image.open(path) as img: @@ -52,7 +52,7 @@ def validate_image_file(path: Path) -> ValidationResult: @staticmethod def validate_json_file(path: Path) -> ValidationResult: - if (error := FileValidator.__check_file_exists(path)): + if error := FileValidator.__check_file_exists(path): return error try: with open(path, 'r', encoding='utf-8') as f: @@ -68,7 +68,7 @@ def validate_json_file(path: Path) -> ValidationResult: @staticmethod def validate_jsonl_file(path: Path) -> ValidationResult: - if (error := FileValidator.__check_file_exists(path)): + if error := FileValidator.__check_file_exists(path): return error try: line_count = 0 @@ -97,7 +97,7 @@ def validate_jsonl_file(path: Path) -> ValidationResult: @staticmethod def validate_video_file(path: Path) -> ValidationResult: - if (error := FileValidator.__check_file_exists(path)): + if error := FileValidator.__check_file_exists(path): return error try: result = subprocess.run( @@ -142,7 +142,7 @@ def __check_file_exists(path: Path) -> Optional[ValidationResult]: @staticmethod def __validate_archive_file(path: Path) -> ValidationResult: # pylint: disable=unused-private-member - if (error := FileValidator.__check_file_exists(path)): + if error := FileValidator.__check_file_exists(path): return error try: with zipfile.ZipFile(path, 
'r') as zip_ref: diff --git a/preprocessor/lib/video/emotion_utils.py b/preprocessor/lib/video/emotion_utils.py index 9c833a459..547423902 100644 --- a/preprocessor/lib/video/emotion_utils.py +++ b/preprocessor/lib/video/emotion_utils.py @@ -39,7 +39,7 @@ def __clip_bbox( y1 = max(0, y1) x2 = min(width, x2) y2 = min(height, y2) - return (x1, y1, x2, y2) + return x1, y1, x2, y2 @staticmethod def __crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: # pylint: disable=unused-private-member @@ -109,4 +109,4 @@ def __process_emotion_result( } confidence = float(max(scores)) dominant_emotion = emotion.lower() - return (dominant_emotion, confidence, emotion_scores) + return dominant_emotion, confidence, emotion_scores diff --git a/preprocessor/modules/audio/separation.py b/preprocessor/modules/audio/separation.py index 8748073d6..76ed3dd47 100644 --- a/preprocessor/modules/audio/separation.py +++ b/preprocessor/modules/audio/separation.py @@ -241,4 +241,4 @@ def __split_mixed_segment( dialogue_parts, sound_parts, ) - return (dialogue_parts, sound_parts) + return dialogue_parts, sound_parts diff --git a/preprocessor/modules/scraping/episode_scraper.py b/preprocessor/modules/scraping/episode_scraper.py index 818b3d05c..cb95d1987 100644 --- a/preprocessor/modules/scraping/episode_scraper.py +++ b/preprocessor/modules/scraping/episode_scraper.py @@ -40,10 +40,10 @@ def __count_video_files(self, directory: Path) -> int: @staticmethod def __get_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: if scraped < expected: - return ('missing', f'Missing {expected - scraped} episodes') + return 'missing', f'Missing {expected - scraped} episodes' if scraped > expected: - return ('extra', f'Scraped {scraped - expected} more episodes than video files') - return ('perfect', 'Perfect coverage') + return 'extra', f'Scraped {scraped - expected} more episodes than video files' + return 'perfect', 'Perfect coverage' def __get_expected_episodes_count(self) 
-> Optional[int]: if self.expected_episodes_count is not None: diff --git a/preprocessor/modules/search/clients/embedding_service.py b/preprocessor/modules/search/clients/embedding_service.py index fed8c47f0..93c2c3d02 100644 --- a/preprocessor/modules/search/clients/embedding_service.py +++ b/preprocessor/modules/search/clients/embedding_service.py @@ -58,7 +58,7 @@ def get_text_embedding(self, text: str) -> List[float]: def _load_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: if self._model is not None: - return (self._model, self._processor, self._device) + return self._model, self._processor, self._device click.echo('Loading embedding model...', err=True) if not torch.cuda.is_available(): raise RuntimeError('CUDA is required but not available. This pipeline requires GPU.') @@ -67,4 +67,4 @@ def _load_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: self._model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') self._processor = AutoProcessor.from_pretrained(model_name) click.echo(f'Model loaded on {self._device}', err=True) - return (self._model, self._processor, self._device) + return self._model, self._processor, self._device diff --git a/preprocessor/modules/search/document_generation.py b/preprocessor/modules/search/document_generation.py index 05e042b2a..a750c6e49 100644 --- a/preprocessor/modules/search/document_generation.py +++ b/preprocessor/modules/search/document_generation.py @@ -85,4 +85,4 @@ def __generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, con } f.write(json.dumps(doc, ensure_ascii=False) + '\n') count += 1 - return (output_path, count) + return output_path, count diff --git a/preprocessor/modules/text/import_step.py b/preprocessor/modules/text/import_step.py index 2577ed90f..2904a888b 100644 --- a/preprocessor/modules/text/import_step.py +++ b/preprocessor/modules/text/import_step.py @@ -98,14 +98,14 @@ def __convert_11labs_segmented(data: 
Dict[str, Any], source_file: Path) -> Dict[ def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: match: Optional[re.Match] = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) if match: - return (int(match.group(1)), int(match.group(2))) + return int(match.group(1)), int(match.group(2)) parent_match: Optional[re.Match] = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) if parent_match: season: int = int(parent_match.group(1)) episode_match: Optional[re.Match] = re.search('E(\\d+)', file_path.name, re.IGNORECASE) if episode_match: - return (season, int(episode_match.group(1))) - return (1, 1) + return season, int(episode_match.group(1)) + return 1, 1 def __find_transcription_files(self) -> List[Path]: pattern: str = '*.json' diff --git a/preprocessor/modules/validation/episode_stats.py b/preprocessor/modules/validation/episode_stats.py index a10b58570..32ab0200b 100644 --- a/preprocessor/modules/validation/episode_stats.py +++ b/preprocessor/modules/validation/episode_stats.py @@ -254,12 +254,12 @@ def __validate_images_in_directory( extensions: Tuple[str, ...] 
= ('*.jpg', '*.png'), ) -> Tuple[int, int, List[str]]: if not directory.exists(): - return (0, 0, []) + return 0, 0, [] image_files = [] for ext in extensions: image_files.extend(directory.glob(ext)) if not image_files: - return (0, 0, []) + return 0, 0, [] invalid_count = 0 errors = [] for img_file in image_files: @@ -267,7 +267,7 @@ def __validate_images_in_directory( if not result.is_valid: invalid_count += 1 errors.append(f'Invalid image {img_file.name}: {result.error_message}') - return (len(image_files), invalid_count, errors) + return len(image_files), invalid_count, errors def __validate_json_directory( self, @@ -296,13 +296,13 @@ def __validate_json_files_in_directory( directory: Path, exclude_pattern: Optional[str] = None, ) -> Tuple[int, List[int], List[str]]: if not directory.exists(): - return (0, [], []) + return 0, [], [] json_files = [ f for f in directory.glob('*.json') if not exclude_pattern or exclude_pattern not in str(f) ] if not json_files: - return (0, [], []) + return 0, [], [] sizes = [] errors = [] for json_file in json_files: @@ -311,7 +311,7 @@ def __validate_json_files_in_directory( errors.append(f'Invalid JSON {json_file.name}: {result.error_message}') else: sizes.append(json_file.stat().st_size) - return (len(json_files), sizes, errors) + return len(json_files), sizes, errors def __validate_object_detections(self): self.__validate_json_directory( From 5b5599f8b08bd80f37c15d11696c18120b7126df Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 19:55:14 +0100 Subject: [PATCH 18/89] Restructure packages and update processors Move modules into a unified lib/ layout and update related imports; create preprocessor.lib.search.clients package and adjust search client imports. Simplify preprocessor.lib.io exports and relocate path_manager/path_resolver/path_service under lib/io. Remove several transcription processors and elevenlabs transcriber. 
Refactor BaseProcessor API: introduce _finalize, change abstract methods signatures to be implemented by subclasses, centralize missing-output and state sync logic, and adapt path manager import. Update CharacterReferenceDownloader to new BaseProcessor flow (cleanup, _get_expected_outputs, _get_processing_items, _load_resources) and switch to modules-based placement. Update README to add pipeline validation step and reflect new step order. Numerous file renames and import fixes to match the new package layout. --- preprocessor/README.md | 34 ++- preprocessor/app/pipeline_builder.py | 2 +- preprocessor/cli/cli_main.py | 10 +- preprocessor/cli/helpers.py | 2 +- .../step_defaults.py} | 0 .../{modules => lib}/audio/__init__.py | 0 .../{modules => lib}/audio/extraction.py | 0 .../{modules => lib}/audio/separation.py | 0 preprocessor/lib/characters/__init__.py | 3 +- preprocessor/lib/episodes/episode_manager.py | 2 +- preprocessor/lib/io/__init__.py | 12 +- preprocessor/lib/io/hashing.py | 2 +- preprocessor/{core => lib/io}/path_manager.py | 2 +- .../{core => lib/io}/path_resolver.py | 2 +- preprocessor/{core => lib/io}/path_service.py | 0 preprocessor/lib/search/__init__.py | 6 +- preprocessor/lib/search/clients/__init__.py | 6 + .../search/clients/elasticsearch_queries.py | 2 +- .../search/clients/embedding_service.py | 0 .../search/clients/hash_service.py | 0 .../search/clients/result_formatters.py | 0 preprocessor/lib/search/embedding_model.py | 2 +- .../{modules => lib}/text/import_step.py | 0 .../{modules => lib}/text/transcription.py | 0 preprocessor/lib/transcription/elevenlabs.py | 172 ----------- .../lib/transcription/processors/__init__.py | 4 +- .../processors/audio_normalizer.py | 2 +- .../processors/sound_separator.py | 267 ------------------ .../transcription/processors/unicode_fixer.py | 63 ----- preprocessor/lib/validation/__init__.py | 3 - preprocessor/lib/video/__init__.py | 10 +- .../video/discovery.py} | 0 
preprocessor/lib/video/strategies/__init__.py | 4 + .../video/strategies/base_strategy.py | 0 .../strategies/scene_changes_strategy.py | 2 +- .../video/strategies/strategy_factory.py | 4 +- .../{core => modules}/base_processor.py | 74 ++--- preprocessor/modules/characters/__init__.py | 3 + .../characters/reference_downloader.py | 183 ++++++------ preprocessor/modules/scraping/base_scraper.py | 2 +- .../modules/scraping/reference_processor.py | 12 +- .../modules/search/clients/__init__.py | 6 - preprocessor/modules/text/__init__.py | 4 +- .../modules/transcription/__init__.py | 1 + .../validation/base_result.py | 0 .../modules/validation/episode_stats.py | 6 +- .../validation/file_validators.py | 0 .../modules/validation/global_validator.py | 4 +- preprocessor/modules/validation/validator.py | 2 +- preprocessor/modules/video/frame_export.py | 2 +- .../modules/video/strategies/__init__.py | 4 - 51 files changed, 197 insertions(+), 724 deletions(-) rename preprocessor/{app/config_defaults.py => config/step_defaults.py} (100%) rename preprocessor/{modules => lib}/audio/__init__.py (100%) rename preprocessor/{modules => lib}/audio/extraction.py (100%) rename preprocessor/{modules => lib}/audio/separation.py (100%) rename preprocessor/{core => lib/io}/path_manager.py (91%) rename preprocessor/{core => lib/io}/path_resolver.py (85%) rename preprocessor/{core => lib/io}/path_service.py (100%) create mode 100644 preprocessor/lib/search/clients/__init__.py rename preprocessor/{modules => lib}/search/clients/elasticsearch_queries.py (99%) rename preprocessor/{modules => lib}/search/clients/embedding_service.py (100%) rename preprocessor/{modules => lib}/search/clients/hash_service.py (100%) rename preprocessor/{modules => lib}/search/clients/result_formatters.py (100%) rename preprocessor/{modules => lib}/text/import_step.py (100%) rename preprocessor/{modules => lib}/text/transcription.py (100%) delete mode 100644 preprocessor/lib/transcription/elevenlabs.py delete mode 
100644 preprocessor/lib/transcription/processors/sound_separator.py delete mode 100644 preprocessor/lib/transcription/processors/unicode_fixer.py delete mode 100644 preprocessor/lib/validation/__init__.py rename preprocessor/{app/video_discovery.py => lib/video/discovery.py} (100%) create mode 100644 preprocessor/lib/video/strategies/__init__.py rename preprocessor/{modules => lib}/video/strategies/base_strategy.py (100%) rename preprocessor/{modules => lib}/video/strategies/scene_changes_strategy.py (96%) rename preprocessor/{modules => lib}/video/strategies/strategy_factory.py (69%) rename preprocessor/{core => modules}/base_processor.py (81%) create mode 100644 preprocessor/modules/characters/__init__.py rename preprocessor/{lib => modules}/characters/reference_downloader.py (64%) delete mode 100644 preprocessor/modules/search/clients/__init__.py create mode 100644 preprocessor/modules/transcription/__init__.py rename preprocessor/{lib => modules}/validation/base_result.py (100%) rename preprocessor/{lib => modules}/validation/file_validators.py (100%) delete mode 100644 preprocessor/modules/video/strategies/__init__.py diff --git a/preprocessor/README.md b/preprocessor/README.md index ba2944865..2495233fc 100644 --- a/preprocessor/README.md +++ b/preprocessor/README.md @@ -93,26 +93,28 @@ series_configs/ --- -## Pipeline (19 kroków) +## Pipeline (20 kroków) ``` -SCRAPING PROCESSING INDEXING -───────────────────────────────────────────────────────────────────────────── +SCRAPING PROCESSING INDEXING VALIDATION +────────────────────────────────────────────────────────────────────────────────────────────────────── [1] scrape_episodes ──┬─→ [4] transcode ─→ [5] transcribe ─→ [6] separate_sounds -[2] scrape_characters │ [7] analyze_text -[3] process_references─┘ [8] detect_scenes ─→ [9] export_frames - [10] text_embeddings - [11] video_embeddings - [12] image_hashing - [13] detect_characters - [14] detect_emotions - [15] cluster_faces - [16] detect_objects - [17] 
generate_elastic_docs ─→ [18] generate_archives ─→ [19] index_to_elasticsearch +[2] scrape_characters │ [7] analyze_text ────┐ +[3] process_references─┘ [8] detect_scenes ─→ [9] export_frames │ + [10] text_embeddings │ + [11] video_embeddings ├─→ [20] validate + [12] image_hashing │ + [13] detect_characters │ + [14] detect_emotions │ + [15] cluster_faces │ + [16] detect_objects │ + [17] generate_elastic_docs ─→ [18] generate_archives ─→ [19] index_to_elasticsearch ─┘ ``` **Kroki są automatycznie wykonywane w poprawnej kolejności** - pipeline rozwiązuje zależności i tworzy plan wykonania. +**Validation (krok 20)** - uruchamiany na końcu, weryfikuje poprawność wszystkich poprzednich kroków pipeline. + --- ## Dostępne komendy @@ -152,6 +154,9 @@ SCRAPING PROCESSING INDEXING ./run-preprocessor.sh generate-archives --series NAZWA ./run-preprocessor.sh index-to-elasticsearch --series NAZWA +# Validation +./run-preprocessor.sh validate --series NAZWA + # Search (wymaga uruchomionego Elasticsearch) ./run-preprocessor.sh search --series NAZWA --text "query" ./run-preprocessor.sh search --series NAZWA --text-semantic "query" @@ -178,7 +183,8 @@ scrape_episodes, scrape_characters, process_references, transcode, transcribe, separate_sounds, analyze_text, detect_scenes, export_frames, text_embeddings, video_embeddings, image_hashing, detect_characters, detect_emotions, cluster_faces, detect_objects, -generate_elastic_docs, generate_archives, index_to_elasticsearch +generate_elastic_docs, generate_archives, index_to_elasticsearch, +validate ``` --- diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 07fc39bca..5c396d9f5 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -5,11 +5,11 @@ List, ) -from preprocessor.app.video_discovery import VideoDiscovery from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import 
ExecutionContext from preprocessor.lib.episodes.episode_manager import EpisodeManager +from preprocessor.lib.video.discovery import VideoDiscovery if TYPE_CHECKING: from preprocessor.app.pipeline import PipelineDefinition diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index ac1372d16..1291f49ca 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -14,7 +14,7 @@ from preprocessor.cli.helpers import setup_pipeline_context from preprocessor.cli.skip_list_builder import SkipListBuilder from preprocessor.config.series_config import SeriesConfig -from preprocessor.core.path_resolver import PathResolver +from preprocessor.lib.io.path_resolver import PathResolver @click.group() @@ -156,10 +156,10 @@ def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-state from elasticsearch import AsyncElasticsearch # pylint: disable=import-outside-toplevel - from preprocessor.modules.search.clients.elasticsearch_queries import ElasticsearchQueries # pylint: disable=import-outside-toplevel - from preprocessor.modules.search.clients.embedding_service import EmbeddingService # pylint: disable=import-outside-toplevel - from preprocessor.modules.search.clients.hash_service import HashService # pylint: disable=import-outside-toplevel - from preprocessor.modules.search.clients.result_formatters import ResultFormatter # pylint: disable=import-outside-toplevel + from preprocessor.lib.search.clients.elasticsearch_queries import ElasticsearchQueries # pylint: disable=import-outside-toplevel + from preprocessor.lib.search.clients.embedding_service import EmbeddingService # pylint: disable=import-outside-toplevel + from preprocessor.lib.search.clients.hash_service import HashService # pylint: disable=import-outside-toplevel + from preprocessor.lib.search.clients.result_formatters import ResultFormatter # pylint: disable=import-outside-toplevel if not any([ text, text_semantic, text_to_video, image, phash, character, emotion, 
diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index c0ae6d1b1..f18dd8fa9 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -4,10 +4,10 @@ from typing import Optional from preprocessor.core.context import ExecutionContext -from preprocessor.core.path_resolver import PathResolver from preprocessor.core.state_manager import StateManager from preprocessor.lib.core.logging import ErrorHandlingLogger from preprocessor.lib.episodes.episode_manager import EpisodeManager +from preprocessor.lib.io.path_resolver import PathResolver @dataclass diff --git a/preprocessor/app/config_defaults.py b/preprocessor/config/step_defaults.py similarity index 100% rename from preprocessor/app/config_defaults.py rename to preprocessor/config/step_defaults.py diff --git a/preprocessor/modules/audio/__init__.py b/preprocessor/lib/audio/__init__.py similarity index 100% rename from preprocessor/modules/audio/__init__.py rename to preprocessor/lib/audio/__init__.py diff --git a/preprocessor/modules/audio/extraction.py b/preprocessor/lib/audio/extraction.py similarity index 100% rename from preprocessor/modules/audio/extraction.py rename to preprocessor/lib/audio/extraction.py diff --git a/preprocessor/modules/audio/separation.py b/preprocessor/lib/audio/separation.py similarity index 100% rename from preprocessor/modules/audio/separation.py rename to preprocessor/lib/audio/separation.py diff --git a/preprocessor/lib/characters/__init__.py b/preprocessor/lib/characters/__init__.py index d1878f342..165cc8a68 100644 --- a/preprocessor/lib/characters/__init__.py +++ b/preprocessor/lib/characters/__init__.py @@ -4,6 +4,5 @@ DuckDuckGoImageSearch, GoogleImageSearch, ) -from preprocessor.lib.characters.reference_downloader import CharacterReferenceDownloader -__all__ = ['BaseImageSearch', 'CharacterReferenceDownloader', 'DuckDuckGoImageSearch', 'FaceDetector', 'GoogleImageSearch'] +__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 
'FaceDetector', 'GoogleImageSearch'] diff --git a/preprocessor/lib/episodes/episode_manager.py b/preprocessor/lib/episodes/episode_manager.py index 2d92974fa..5d6c306eb 100644 --- a/preprocessor/lib/episodes/episode_manager.py +++ b/preprocessor/lib/episodes/episode_manager.py @@ -14,8 +14,8 @@ EpisodeMetadataKeys, EpisodesDataKeys, ) -from preprocessor.core.path_manager import PathManager from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.io.path_manager import PathManager @dataclass diff --git a/preprocessor/lib/io/__init__.py b/preprocessor/lib/io/__init__.py index bf8a647c2..c595480b1 100644 --- a/preprocessor/lib/io/__init__.py +++ b/preprocessor/lib/io/__init__.py @@ -1,9 +1,5 @@ -from preprocessor.lib.io.files import ( - FileOperations, - atomic_write_json, - load_json, -) -from preprocessor.lib.io.hashing import HashStorage -from preprocessor.lib.io.metadata import MetadataBuilder +from preprocessor.lib.io.path_manager import PathManager +from preprocessor.lib.io.path_resolver import PathResolver +from preprocessor.lib.io.path_service import PathService -__all__ = ['FileOperations', 'HashStorage', 'MetadataBuilder', 'atomic_write_json', 'load_json'] +__all__ = ['PathManager', 'PathResolver', 'PathService'] diff --git a/preprocessor/lib/io/hashing.py b/preprocessor/lib/io/hashing.py index cc040a9ad..3efca6928 100644 --- a/preprocessor/lib/io/hashing.py +++ b/preprocessor/lib/io/hashing.py @@ -6,10 +6,10 @@ ) from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager from preprocessor.lib.episodes import EpisodeInfo from preprocessor.lib.io.files import FileOperations from preprocessor.lib.io.metadata import MetadataBuilder +from preprocessor.lib.io.path_manager import PathManager class HashStorage: diff --git a/preprocessor/core/path_manager.py b/preprocessor/lib/io/path_manager.py similarity index 91% rename from preprocessor/core/path_manager.py rename to 
preprocessor/lib/io/path_manager.py index a9e48acc4..6f3a73356 100644 --- a/preprocessor/core/path_manager.py +++ b/preprocessor/lib/io/path_manager.py @@ -1,7 +1,7 @@ from pathlib import Path from typing import TYPE_CHECKING -from preprocessor.core.path_service import PathService +from preprocessor.lib.io.path_service import PathService if TYPE_CHECKING: from preprocessor.lib.episodes.episode_manager import EpisodeInfo diff --git a/preprocessor/core/path_resolver.py b/preprocessor/lib/io/path_resolver.py similarity index 85% rename from preprocessor/core/path_resolver.py rename to preprocessor/lib/io/path_resolver.py index a505531fa..64910371e 100644 --- a/preprocessor/core/path_resolver.py +++ b/preprocessor/lib/io/path_resolver.py @@ -1,6 +1,6 @@ from pathlib import Path -from preprocessor.core.path_service import PathService +from preprocessor.lib.io.path_service import PathService class PathResolver: diff --git a/preprocessor/core/path_service.py b/preprocessor/lib/io/path_service.py similarity index 100% rename from preprocessor/core/path_service.py rename to preprocessor/lib/io/path_service.py diff --git a/preprocessor/lib/search/__init__.py b/preprocessor/lib/search/__init__.py index 865a9becb..a69f19443 100644 --- a/preprocessor/lib/search/__init__.py +++ b/preprocessor/lib/search/__init__.py @@ -1,8 +1,4 @@ from preprocessor.lib.search.elasticsearch import ElasticsearchWrapper from preprocessor.lib.search.embedding_model import EmbeddingModelWrapper -from preprocessor.modules.search.clients.elasticsearch_queries import ElasticsearchQueries -from preprocessor.modules.search.clients.embedding_service import EmbeddingService -from preprocessor.modules.search.clients.hash_service import HashService -from preprocessor.modules.search.clients.result_formatters import ResultFormatter -__all__ = ['ElasticsearchWrapper', 'EmbeddingModelWrapper', 'ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] +__all__ = ['ElasticsearchWrapper', 
'EmbeddingModelWrapper'] diff --git a/preprocessor/lib/search/clients/__init__.py b/preprocessor/lib/search/clients/__init__.py new file mode 100644 index 000000000..fc66f8dfe --- /dev/null +++ b/preprocessor/lib/search/clients/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.lib.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.lib.search.clients.embedding_service import EmbeddingService +from preprocessor.lib.search.clients.hash_service import HashService +from preprocessor.lib.search.clients.result_formatters import ResultFormatter + +__all__ = ['ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] diff --git a/preprocessor/modules/search/clients/elasticsearch_queries.py b/preprocessor/lib/search/clients/elasticsearch_queries.py similarity index 99% rename from preprocessor/modules/search/clients/elasticsearch_queries.py rename to preprocessor/lib/search/clients/elasticsearch_queries.py index c856ef1a9..64d4b1f39 100644 --- a/preprocessor/modules/search/clients/elasticsearch_queries.py +++ b/preprocessor/lib/search/clients/elasticsearch_queries.py @@ -8,7 +8,7 @@ from elasticsearch import AsyncElasticsearch -from preprocessor.modules.search.clients.embedding_service import EmbeddingService +from preprocessor.lib.search.clients.embedding_service import EmbeddingService class ElasticsearchQueries: diff --git a/preprocessor/modules/search/clients/embedding_service.py b/preprocessor/lib/search/clients/embedding_service.py similarity index 100% rename from preprocessor/modules/search/clients/embedding_service.py rename to preprocessor/lib/search/clients/embedding_service.py diff --git a/preprocessor/modules/search/clients/hash_service.py b/preprocessor/lib/search/clients/hash_service.py similarity index 100% rename from preprocessor/modules/search/clients/hash_service.py rename to preprocessor/lib/search/clients/hash_service.py diff --git a/preprocessor/modules/search/clients/result_formatters.py 
b/preprocessor/lib/search/clients/result_formatters.py similarity index 100% rename from preprocessor/modules/search/clients/result_formatters.py rename to preprocessor/lib/search/clients/result_formatters.py diff --git a/preprocessor/lib/search/embedding_model.py b/preprocessor/lib/search/embedding_model.py index da0a7b000..fc4caf637 100644 --- a/preprocessor/lib/search/embedding_model.py +++ b/preprocessor/lib/search/embedding_model.py @@ -3,7 +3,7 @@ Union, ) -from preprocessor.modules.search.clients.embedding_service import EmbeddingService +from preprocessor.lib.search.clients.embedding_service import EmbeddingService class EmbeddingModelWrapper: diff --git a/preprocessor/modules/text/import_step.py b/preprocessor/lib/text/import_step.py similarity index 100% rename from preprocessor/modules/text/import_step.py rename to preprocessor/lib/text/import_step.py diff --git a/preprocessor/modules/text/transcription.py b/preprocessor/lib/text/transcription.py similarity index 100% rename from preprocessor/modules/text/transcription.py rename to preprocessor/lib/text/transcription.py diff --git a/preprocessor/lib/transcription/elevenlabs.py b/preprocessor/lib/transcription/elevenlabs.py deleted file mode 100644 index 53930e7a6..000000000 --- a/preprocessor/lib/transcription/elevenlabs.py +++ /dev/null @@ -1,172 +0,0 @@ -import json -import logging -from pathlib import Path -import subprocess -import tempfile -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor -from preprocessor.lib.episodes import EpisodeManager -from preprocessor.lib.transcription.engines.elevenlabs_engine import ElevenLabsEngine -from preprocessor.lib.transcription.generators.multi_format_generator import MultiFormatGenerator -from preprocessor.lib.ui.console import ( - SimpleProgress, - console, -) - - -class ElevenLabsTranscriber(BaseProcessor): - - def __init__(self, args: 
Dict[str, Any]): - super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=5, loglevel=logging.DEBUG) - self.input_videos: Path = Path(self._args['videos']) - self.output_dir: Path = Path(self._args['output_dir']) - self.output_dir.mkdir(parents=True, exist_ok=True) - self.episodes_info_json: Optional[Path] = self._args.get('episodes_info_json') - self.model_id: str = self._args.get('model_id', 'scribe_v1') - self.language_code: str = self._args.get('language_code', 'pol') - self.diarize: bool = self._args.get('diarize', True) - self.episode_manager = EpisodeManager(self.episodes_info_json, self.series_name, self.logger) - self.engine = ElevenLabsEngine(logger=self.logger, model_id=self.model_id, language_code=self.language_code, diarize=self.diarize) - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def _execute(self) -> None: - video_files: List[Path] = [] - for ext in self.SUPPORTED_VIDEO_EXTENSIONS: - video_files.extend(self.input_videos.rglob(f'*{ext}')) - video_files = sorted(video_files) - if not video_files: - self.logger.warning('No video files found') - return - console.print(f'[blue]Found {len(video_files)} videos to transcribe with 11labs[/blue]') - try: - with SimpleProgress() as progress: - task = progress.add_task('Transcribing with 11labs...', total=len(video_files)) - for video_file in video_files: - episode_id = video_file.stem - if self.state_manager and self.state_manager.is_step_completed('transcribe_11labs', episode_id): - console.print(f'[yellow]Skipping (already done): {episode_id}[/yellow]') - progress.advance(task) - continue - audio_path = None - try: - if self.state_manager: - audio_path = self.__extract_audio(video_file) - self.state_manager.mark_step_started('transcribe_11labs', episode_id, [str(audio_path)]) - audio_path = audio_path or self.__extract_audio(video_file) - transcription_data = self.engine.transcribe(audio_path) - 
self.__save_transcription(transcription_data, video_file) - if self.state_manager: - self.state_manager.mark_step_completed('transcribe_11labs', episode_id) - except Exception as e: - self.logger.error(f'Failed to transcribe {video_file.name}: {e}') - finally: - if audio_path and audio_path.exists(): - audio_path.unlink() - progress.advance(task) - except KeyboardInterrupt: - console.print('\n[yellow]Transcription interrupted[/yellow]') - raise - console.print('[blue]Generating multi-format outputs (SRT, TXT, etc.)...[/blue]') - if self.episodes_info_json: - jsons_source_dir = self.output_dir / 'json' - multi_format_gen = MultiFormatGenerator( - jsons_dir=jsons_source_dir, - episodes_info_json=self.episodes_info_json, - output_base_path=self.output_dir, - logger=self.logger, - series_name=self.series_name, - ) - multi_format_gen.generate() - - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'videos' not in args: - raise ValueError('videos is required') - if 'output_dir' not in args: - raise ValueError('output_dir is required') - if 'series_name' not in args: - raise ValueError('series_name is required') - videos_path = Path(args['videos']) - if not videos_path.is_dir(): - raise NotADirectoryError(f"Input videos is not a directory: '{videos_path}'") - - @staticmethod - def __create_segments_from_words(words: List[Dict]) -> List[Dict]: - if not words: - return [] - segments = [] - current_segment_words = [] - current_speaker = None - for word in words: - speaker_id = word.get('speaker_id', 'speaker_unknown') - if current_speaker is None: - current_speaker = speaker_id - current_segment_words = [word] - elif speaker_id == current_speaker: - current_segment_words.append(word) - else: - segment_text = ' '.join((w.get('text', '') for w in current_segment_words)).strip() - segments.append({'text': segment_text, 'words': current_segment_words}) - current_speaker = speaker_id - current_segment_words = [word] - if current_segment_words: - segment_text = ' 
'.join((w.get('text', '') for w in current_segment_words)).strip() - segments.append({'text': segment_text, 'words': current_segment_words}) - return segments - - @staticmethod - def __extract_audio(video_file: Path) -> Path: - temp_dir = Path(tempfile.gettempdir()) - audio_path = temp_dir / f'{video_file.stem}_audio.mp3' - command = [ - 'ffmpeg', '-v', 'error', '-hide_banner', '-y', - '-i', str(video_file), - '-vn', '-acodec', 'libmp3lame', - '-ar', '16000', '-ac', '1', '-b:a', '64k', - str(audio_path), - ] - subprocess.run(command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE) - return audio_path - - def __save_transcription(self, data: Dict[str, Any], video_file: Path) -> None: - episode_info = self.episode_manager.parse_filename(video_file) - if not episode_info: - self.logger.error(f'Cannot parse episode info from {video_file.name}') - return - api_segments = data.get('segments', []) - api_words = data.get('words', []) - if api_segments: - segments = api_segments - words = [] - for segment in segments: - segment_words = segment.get('words', []) - for word in segment_words: - if 'speaker_id' not in word and 'speaker' in segment: - word['speaker_id'] = segment['speaker'] - words.extend(segment_words) - else: - words = api_words - segments = self.__create_segments_from_words(words) - output_data = { - 'text': data.get('text', ''), - 'language_code': data.get('language_code', 'pol'), - 'segments': segments, - 'words': words, - 'episode_info': EpisodeManager.get_metadata(episode_info), - } - json_dir = self.output_dir / 'json' - filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json') - season_dir = json_dir / episode_info.season_code() - output_file = season_dir / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(output_data, f, indent=2, ensure_ascii=False) - self.logger.info(f'Saved transcription: {output_file.name}') diff --git 
a/preprocessor/lib/transcription/processors/__init__.py b/preprocessor/lib/transcription/processors/__init__.py index 2f74aca49..c29f35718 100644 --- a/preprocessor/lib/transcription/processors/__init__.py +++ b/preprocessor/lib/transcription/processors/__init__.py @@ -1,7 +1,5 @@ from preprocessor.lib.transcription.processors.audio_normalizer import AudioNormalizer from preprocessor.lib.transcription.processors.episode_info_processor import EpisodeInfoProcessor from preprocessor.lib.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor -from preprocessor.lib.transcription.processors.sound_separator import SoundEventSeparator -from preprocessor.lib.transcription.processors.unicode_fixer import TranscriptionUnicodeFixer -__all__ = ['AudioNormalizer', 'EpisodeInfoProcessor', 'NormalizedAudioProcessor', 'SoundEventSeparator', 'TranscriptionUnicodeFixer'] +__all__ = ['AudioNormalizer', 'EpisodeInfoProcessor', 'NormalizedAudioProcessor'] diff --git a/preprocessor/lib/transcription/processors/audio_normalizer.py b/preprocessor/lib/transcription/processors/audio_normalizer.py index 2e7715973..65dcf4180 100644 --- a/preprocessor/lib/transcription/processors/audio_normalizer.py +++ b/preprocessor/lib/transcription/processors/audio_normalizer.py @@ -6,8 +6,8 @@ Optional, ) -from preprocessor.core.base_processor import BaseProcessor from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.modules.base_processor import BaseProcessor class AudioNormalizer: diff --git a/preprocessor/lib/transcription/processors/sound_separator.py b/preprocessor/lib/transcription/processors/sound_separator.py deleted file mode 100644 index d3fc1dc43..000000000 --- a/preprocessor/lib/transcription/processors/sound_separator.py +++ /dev/null @@ -1,267 +0,0 @@ -import json -from pathlib import Path -import re -from typing import ( - Any, - Dict, - List, - Tuple, -) - -from preprocessor.config.config import settings -from 
preprocessor.config.constants import ( - FILE_EXTENSIONS, - FILE_SUFFIXES, -) -from preprocessor.config.types import ( - WordKeys, - WordTypeValues, -) -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.lib.episodes import EpisodeManager -from preprocessor.lib.transcription.sound_classification import ( - classify_segment, - is_sound_event, -) - - -class SoundEventSeparator(BaseProcessor): - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=2, loglevel=args.get('loglevel', 20)) - self.transcription_dir = Path(self._args.get('transcription_dir', settings.transcription.get_output_dir(self.series_name))) - episodes_info_json = self._args.get('episodes_info_json') - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, self.logger) - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - base_name = item.input_path.stem.replace(FILE_SUFFIXES['segmented'], '') - episode_dir = item.input_path.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - clean_json = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" - sound_json = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}" - clean_segmented_json = clean_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}" - sound_segmented_json = sound_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}" - clean_txt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}" - sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}" - clean_srt = clean_dir / 
f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}" - sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}" - return [ - OutputSpec(path=clean_json, required=True), - OutputSpec(path=sound_json, required=True), - OutputSpec(path=clean_segmented_json, required=True), - OutputSpec(path=sound_segmented_json, required=True), - OutputSpec(path=clean_txt, required=True), - OutputSpec(path=sound_txt, required=True), - OutputSpec(path=clean_srt, required=True), - OutputSpec(path=sound_srt, required=True), - ] - - def _get_processing_items(self) -> List[ProcessingItem]: - segmented_files = list(self.transcription_dir.rglob('**/raw/*_segmented.json')) - items = [] - for trans_file in segmented_files: - episode_info = self.episode_manager.parse_filename(trans_file) - if not episode_info: - self.logger.warning(f'Cannot parse episode info from {trans_file.name}') - continue - episode_id = EpisodeManager.get_episode_id_for_state(episode_info) - items.append(ProcessingItem(episode_id=episode_id, input_path=trans_file, metadata={'episode_info': episode_info})) - return items - - def _get_progress_description(self) -> str: - return 'Separating sound events from dialogues' - - def _process_item( # pylint: disable=too-many-locals - self, item, missing_outputs: List, - ) -> None: - with open(item.input_path, 'r', encoding='utf-8') as f: - data = json.load(f) - episode_info = data.get('episode_info', {}) - segments = data.get('segments', []) - dialogue_segments = [] - sound_event_segments = [] - for segment in segments: - classification = classify_segment(segment) - if classification == 'dialogue': - dialogue_segments.append(self.__clean_segment_text(segment)) - elif classification == 'sound_event': - sound_event_segments.append(self.__enrich_sound_event(self.__clean_segment_text(segment))) - elif classification == 'mixed': - dialogue_parts, sound_parts = self.__split_mixed_segment(segment) - dialogue_segments.extend(dialogue_parts) - 
sound_event_segments.extend([self.__enrich_sound_event(s) for s in sound_parts]) - dialogue_segments = self.__renumber_segments(dialogue_segments) - sound_event_segments = self.__renumber_segments(sound_event_segments) - base_name = item.input_path.stem.replace(FILE_SUFFIXES['segmented'], '') - episode_dir = item.input_path.parent.parent - clean_dir = episode_dir / settings.output_subdirs.transcription_subdirs.clean - sound_dir = episode_dir / settings.output_subdirs.transcription_subdirs.sound_events - clean_dir.mkdir(parents=True, exist_ok=True) - sound_dir.mkdir(parents=True, exist_ok=True) - clean_json = clean_dir / f'{base_name}_clean_transcription.json' - sound_json = sound_dir / f'{base_name}_sound_events.json' - clean_segmented_json = clean_dir / f'{base_name}_segmented_clean.json' - sound_segmented_json = sound_dir / f'{base_name}_segmented_sound_events.json' - clean_txt = clean_dir / f'{base_name}_clean_transcription.txt' - sound_txt = sound_dir / f'{base_name}_sound_events.txt' - clean_srt = clean_dir / f'{base_name}_clean_transcription.srt' - sound_srt = sound_dir / f'{base_name}_sound_events.srt' - raw_txt = episode_dir / settings.output_subdirs.transcription_subdirs.raw / f'{base_name}.txt' - dialogue_segments_simple = self.__convert_to_simple_format(dialogue_segments) - sound_event_segments_simple = self.__convert_to_simple_format(sound_event_segments) - with open(clean_json, 'w', encoding='utf-8') as f: - json.dump({'episode_info': episode_info, 'segments': dialogue_segments_simple}, f, ensure_ascii=False, indent=4) - with open(sound_json, 'w', encoding='utf-8') as f: - json.dump({'episode_info': episode_info, 'segments': sound_event_segments_simple}, f, ensure_ascii=False, indent=4) - with open(clean_segmented_json, 'w', encoding='utf-8') as f: - json.dump({'episode_info': episode_info, 'segments': dialogue_segments}, f, ensure_ascii=False, indent=4) - with open(sound_segmented_json, 'w', encoding='utf-8') as f: - json.dump({'episode_info': 
episode_info, 'segments': sound_event_segments}, f, ensure_ascii=False, indent=4) - self.__generate_txt_files(raw_txt, clean_txt, sound_txt) - self.__generate_srt_files(dialogue_segments, sound_event_segments, clean_srt, sound_srt) - self.logger.info(f'Separated {item.episode_id}: {len(dialogue_segments)} dialogue, {len(sound_event_segments)} sound events') - - def _validate_args(self, args: Dict[str, Any]) -> None: - ... - - @staticmethod - def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: - cleaned = segment.copy() - if 'text' in cleaned: - text = cleaned['text'] - text = re.sub('\\s+', ' ', text).strip() - cleaned['text'] = text - if cleaned.get('start') is None or cleaned.get('end') is None: - words = cleaned.get('words', []) - if words: - starts = [w.get('start') or 0 for w in words if w.get('start') is not None] - ends = [w.get('end') or 0 for w in words if w.get('end') is not None] - if starts: - cleaned['start'] = min(starts) - if ends: - cleaned['end'] = max(ends) - return cleaned - - @staticmethod - def __convert_to_simple_format(segments: List[Dict]) -> List[Dict]: - simple_segments = [] - for seg in segments: - simple_seg = {'id': seg.get('id'), 'text': seg.get('text', ''), 'start': seg.get('start') or 0.0, 'end': seg.get('end') or 0.0} - if 'sound_type' in seg: - simple_seg['sound_type'] = seg['sound_type'] - simple_segments.append(simple_seg) - return simple_segments - - @staticmethod - def __enrich_sound_event(segment: Dict[str, Any]) -> Dict[str, Any]: - enriched = segment.copy() - enriched['sound_type'] = 'sound' - return enriched - - @staticmethod - def __finalize_sequence( - seq_type: str, - words: List[Dict], - dialogue_sequences: List[Dict], - sound_sequences: List[Dict], - original_segment: Dict[str, Any], - ) -> None: - if not words: - return - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - return - text = ''.join([w.get('text', '') for w in words]) - 
text = re.sub('\\s+', ' ', text).strip() - start_time = min((w.get('start') or 0 for w in words)) - end_time = max((w.get('end') or 0 for w in words)) - new_segment = {'text': text, 'start': start_time, 'end': end_time, 'words': words} - for key in original_segment: - if key not in ['text', 'start', 'end', 'words']: - new_segment[key] = original_segment[key] - if seq_type == 'dialogue': - dialogue_sequences.append(new_segment) - else: - sound_sequences.append(new_segment) - - @staticmethod - def __generate_srt_files(dialogue_segments: List[Dict], sound_segments: List[Dict], clean_srt: Path, sound_srt: Path) -> None: - - def format_timestamp(seconds: float) -> str: - hours = int(seconds // 3600) - minutes = int(seconds % 3600 // 60) - secs = int(seconds % 60) - millis = int(seconds % 1 * 1000) - return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' - - def __write_srt(segments: List[Dict], output_path: Path) -> None: - with open(output_path, 'w', encoding='utf-8') as f: - for idx, seg in enumerate(segments, start=1): - words = seg.get('words', []) - text = seg.get('text', '').strip() - if not text or not words: - continue - non_spacing_words = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing_words: - continue - start_time = min((w.get('start') or 0.0 for w in non_spacing_words)) - end_time = max((w.get('end') or 0.0 for w in non_spacing_words)) - f.write(f'{idx}\n') - f.write(f'{format_timestamp(start_time)} --> {format_timestamp(end_time)}\n') - f.write(f'{text}\n\n') - __write_srt(dialogue_segments, clean_srt) - __write_srt(sound_segments, sound_srt) - - def __generate_txt_files(self, original_txt: Path, clean_txt: Path, sound_txt: Path) -> None: - if not original_txt.exists(): - self.logger.warning(f'Original TXT file not found: {original_txt}') - return - with open(original_txt, 'r', encoding='utf-8') as f: - original_content = f.read() - clean_content = re.sub('\\([^)]*\\)', '', original_content) - clean_content = 
re.sub('\\s+', ' ', clean_content).strip() - sound_matches = re.findall('\\([^)]*\\)', original_content) - sound_content = ' '.join(sound_matches) - with open(clean_txt, 'w', encoding='utf-8') as f: - f.write(clean_content) - with open(sound_txt, 'w', encoding='utf-8') as f: - f.write(sound_content) - - @staticmethod - def __renumber_segments(segments: List[Dict]) -> List[Dict]: - for i, segment in enumerate(segments): - segment['id'] = i - return segments - - - def __split_mixed_segment(self, segment: Dict[str, Any]) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - words = segment.get('words', []) - dialogue_sequences = [] - sound_sequences = [] - current_type = None - current_words = [] - for word in words: - if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: - if current_words: - current_words.append(word) - continue - is_sound = is_sound_event(word) - word_type = 'sound' if is_sound else 'dialogue' - if word_type != current_type: - if current_words: - self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) - current_type = word_type - current_words = [word] - else: - current_words.append(word) - if current_words: - self.__finalize_sequence(current_type, current_words, dialogue_sequences, sound_sequences, segment) - return dialogue_sequences, sound_sequences diff --git a/preprocessor/lib/transcription/processors/unicode_fixer.py b/preprocessor/lib/transcription/processors/unicode_fixer.py deleted file mode 100644 index 18d3b81d8..000000000 --- a/preprocessor/lib/transcription/processors/unicode_fixer.py +++ /dev/null @@ -1,63 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) -from preprocessor.lib.episodes import EpisodeManager -from preprocessor.lib.transcription.utils import TranscriptionUtils - - -class 
TranscriptionUnicodeFixer(BaseProcessor): - - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__( - args=args, - class_name=self.__class__.__name__, - error_exit_code=2, - loglevel=args.get('loglevel', 20), - ) - default_dir = settings.transcription.get_output_dir(self.series_name) - self.transcription_jsons = Path( - self._args.get('transcription_jsons', default_dir), - ) - episodes_info_json = self._args.get('episodes_info_json') - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, self.logger) - - def get_output_subdir(self) -> str: - return settings.output_subdirs.transcriptions - - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - return [OutputSpec(path=item.input_path, required=True)] - - def _get_processing_items(self) -> List[ProcessingItem]: - transcription_files = list(self.transcription_jsons.rglob('*.json')) - return [ - ProcessingItem( - episode_id=f'unicode_fix_{i}', - input_path=trans_file, - metadata={'file': trans_file}, - ) - for i, trans_file in enumerate(transcription_files) - ] - - def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: - trans_file = item.metadata['file'] - try: - was_fixed = TranscriptionUtils.fix_transcription_file_unicode(trans_file) - if was_fixed: - self.logger.info(f'Fixed unicode escapes in: {trans_file.name}') - else: - self.logger.debug(f'No unicode escapes found in: {trans_file.name}') - except Exception as e: - self.logger.error(f'Error fixing unicode in {trans_file.name}: {e}') - - def _validate_args(self, args: Dict[str, Any]) -> None: - ... 
diff --git a/preprocessor/lib/validation/__init__.py b/preprocessor/lib/validation/__init__.py deleted file mode 100644 index 154276d32..000000000 --- a/preprocessor/lib/validation/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from preprocessor.lib.validation.file_validators import FileValidator - -__all__ = ['FileValidator'] diff --git a/preprocessor/lib/video/__init__.py b/preprocessor/lib/video/__init__.py index 66833a64e..0a854ca1a 100644 --- a/preprocessor/lib/video/__init__.py +++ b/preprocessor/lib/video/__init__.py @@ -1,9 +1,3 @@ -from preprocessor.lib.video.emotion_utils import EmotionDetector -from preprocessor.lib.video.frame_utils import FrameLoader +from preprocessor.lib.video.discovery import VideoDiscovery -__all__ = ['EmotionDetector', 'FrameLoader'] -try: - from preprocessor.lib.video.image_hasher import PerceptualHasher - __all__.append('PerceptualHasher') -except (ImportError, RuntimeError): - pass +__all__ = ['VideoDiscovery'] diff --git a/preprocessor/app/video_discovery.py b/preprocessor/lib/video/discovery.py similarity index 100% rename from preprocessor/app/video_discovery.py rename to preprocessor/lib/video/discovery.py diff --git a/preprocessor/lib/video/strategies/__init__.py b/preprocessor/lib/video/strategies/__init__.py new file mode 100644 index 000000000..e5b6b0f54 --- /dev/null +++ b/preprocessor/lib/video/strategies/__init__.py @@ -0,0 +1,4 @@ +from preprocessor.lib.video.strategies.base_strategy import BaseKeyframeStrategy +from preprocessor.lib.video.strategies.scene_changes_strategy import SceneChangesStrategy + +__all__ = ['BaseKeyframeStrategy', 'SceneChangesStrategy'] diff --git a/preprocessor/modules/video/strategies/base_strategy.py b/preprocessor/lib/video/strategies/base_strategy.py similarity index 100% rename from preprocessor/modules/video/strategies/base_strategy.py rename to preprocessor/lib/video/strategies/base_strategy.py diff --git a/preprocessor/modules/video/strategies/scene_changes_strategy.py 
b/preprocessor/lib/video/strategies/scene_changes_strategy.py similarity index 96% rename from preprocessor/modules/video/strategies/scene_changes_strategy.py rename to preprocessor/lib/video/strategies/scene_changes_strategy.py index ce248841f..26e5950c4 100644 --- a/preprocessor/modules/video/strategies/scene_changes_strategy.py +++ b/preprocessor/lib/video/strategies/scene_changes_strategy.py @@ -7,7 +7,7 @@ from preprocessor.config.enums import FrameType from preprocessor.lib.ui.console import console -from preprocessor.modules.video.strategies.base_strategy import BaseKeyframeStrategy +from preprocessor.lib.video.strategies.base_strategy import BaseKeyframeStrategy class SceneChangesStrategy(BaseKeyframeStrategy): diff --git a/preprocessor/modules/video/strategies/strategy_factory.py b/preprocessor/lib/video/strategies/strategy_factory.py similarity index 69% rename from preprocessor/modules/video/strategies/strategy_factory.py rename to preprocessor/lib/video/strategies/strategy_factory.py index e26b1075d..c6e41ce15 100644 --- a/preprocessor/modules/video/strategies/strategy_factory.py +++ b/preprocessor/lib/video/strategies/strategy_factory.py @@ -1,6 +1,6 @@ from preprocessor.config.enums import KeyframeStrategy -from preprocessor.modules.video.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.modules.video.strategies.scene_changes_strategy import SceneChangesStrategy +from preprocessor.lib.video.strategies.base_strategy import BaseKeyframeStrategy +from preprocessor.lib.video.strategies.scene_changes_strategy import SceneChangesStrategy class KeyframeStrategyFactory: diff --git a/preprocessor/core/base_processor.py b/preprocessor/modules/base_processor.py similarity index 81% rename from preprocessor/core/base_processor.py rename to preprocessor/modules/base_processor.py index b08604a7e..2fe0e44da 100644 --- a/preprocessor/core/base_processor.py +++ b/preprocessor/modules/base_processor.py @@ -14,9 +14,9 @@ ) from 
preprocessor.config.constants import SUPPORTED_VIDEO_EXTENSIONS -from preprocessor.core.path_manager import PathManager from preprocessor.core.state_manager import StateManager from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.lib.io.path_manager import PathManager from preprocessor.lib.ui.console import ( SimpleProgress, console, @@ -54,6 +54,9 @@ def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, def cleanup(self) -> None: pass + def _finalize(self) -> None: + pass + @abstractmethod def get_output_subdir(self) -> str: pass @@ -101,18 +104,15 @@ def _execute(self) -> None: f'(of {len(all_items)} total, {skipped_count} skipped)[/blue]', ) self.__execute_processing(items_to_process) + self._finalize() + @abstractmethod def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - raise NotImplementedError( - f'{self.__class__.__name__} must implement _get_expected_outputs() ' - 'or override _execute() directly (legacy mode)', - ) + pass + @abstractmethod def _get_processing_items(self) -> List[ProcessingItem]: - raise NotImplementedError( - f'{self.__class__.__name__} must implement _get_processing_items() ' - 'or override _execute() directly (legacy mode)', - ) + pass def _get_progress_description(self) -> str: return f'Processing {self.__class__.__name__}' @@ -120,13 +120,11 @@ def _get_progress_description(self) -> str: def _load_resources(self) -> bool: return True + @abstractmethod def _process_item( self, item: ProcessingItem, missing_outputs: List[OutputSpec], ) -> None: - raise NotImplementedError( - f'{self.__class__.__name__} must implement _process_item() ' - 'or override _execute() directly (legacy mode)', - ) + pass @abstractmethod def _validate_args(self, args: Dict[str, Any]) -> None: @@ -160,8 +158,10 @@ def __execute_processing(self, items: List[ProcessingItem]) -> None: def __get_step_name(self) -> str: class_name = self.__class__.__name__ - name = 
class_name.replace('Processor', '').replace('Generator', '').replace('Detector', '') - name = name.replace('Transcoder', '').replace('Importer', '').replace('Indexer', '') + suffixes_to_remove = ['Processor', 'Generator', 'Detector', 'Transcoder', 'Importer', 'Indexer'] + name = class_name + for suffix in suffixes_to_remove: + name = name.replace(suffix, '') return self.__to_snake_case(name) def __should_skip_item( @@ -170,33 +170,39 @@ def __should_skip_item( expected_outputs = self._get_expected_outputs(item) if not expected_outputs: return False, [], '' - missing_outputs = [ - output for output in expected_outputs - if not output.path.exists() or output.path.stat().st_size == 0 - ] + missing_outputs = self.__get_missing_outputs(expected_outputs) step_name = self.__get_step_name() - state_completed = ( - self.state_manager - and self.state_manager.is_step_completed(step_name, item.episode_id) - ) - if not missing_outputs and state_completed: + state_completed = self.__is_step_completed_in_state(step_name, item.episode_id) + has_all_outputs = len(missing_outputs) == 0 + if has_all_outputs and state_completed: return True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]' - if not missing_outputs and (not state_completed): - if self.state_manager: - self.state_manager.mark_step_completed(step_name, item.episode_id) - return ( - True, - [], - f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]', - ) - if missing_outputs and state_completed: + if has_all_outputs and not state_completed: + self.__sync_state_completed(step_name, item.episode_id) + return True, [], f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]' + if not has_all_outputs and state_completed: console.print( f'[yellow]Warning: State marked complete but outputs missing ' f'for {item.episode_id}[/yellow]', ) - return False, missing_outputs, '' return False, missing_outputs, '' + @staticmethod + def __get_missing_outputs(expected_outputs: 
List[OutputSpec]) -> List[OutputSpec]: + return [ + output for output in expected_outputs + if not output.path.exists() or output.path.stat().st_size == 0 + ] + + def __is_step_completed_in_state(self, step_name: str, episode_id: str) -> bool: + return bool( + self.state_manager + and self.state_manager.is_step_completed(step_name, episode_id), + ) + + def __sync_state_completed(self, step_name: str, episode_id: str) -> None: + if self.state_manager: + self.state_manager.mark_step_completed(step_name, episode_id) + @staticmethod def __to_snake_case(name: str) -> str: name = re.sub('(.)([A-Z][a-z]+)', '\\1_\\2', name) diff --git a/preprocessor/modules/characters/__init__.py b/preprocessor/modules/characters/__init__.py new file mode 100644 index 000000000..c2c472d6a --- /dev/null +++ b/preprocessor/modules/characters/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.modules.characters.reference_downloader import CharacterReferenceDownloader + +__all__ = ['CharacterReferenceDownloader'] diff --git a/preprocessor/lib/characters/reference_downloader.py b/preprocessor/modules/characters/reference_downloader.py similarity index 64% rename from preprocessor/lib/characters/reference_downloader.py rename to preprocessor/modules/characters/reference_downloader.py index 5997d9f2c..6bc7f950b 100644 --- a/preprocessor/lib/characters/reference_downloader.py +++ b/preprocessor/modules/characters/reference_downloader.py @@ -18,20 +18,22 @@ from patchright.sync_api import ( BrowserContext, Page, + Playwright, sync_playwright, ) from preprocessor.config.config import settings -from preprocessor.core.base_processor import BaseProcessor from preprocessor.lib.characters.face_detection import FaceDetector from preprocessor.lib.characters.image_search import ( BaseImageSearch, DuckDuckGoImageSearch, GoogleImageSearch, ) -from preprocessor.lib.ui.console import ( - console, - create_progress, +from preprocessor.lib.ui.console import console +from preprocessor.modules.base_processor import ( 
+ BaseProcessor, + OutputSpec, + ProcessingItem, ) @@ -49,119 +51,102 @@ def __init__(self, args: Dict[str, Any]): self.use_gpu: bool = True self.search_mode: str = self._args.get('search_mode', 'normal') self.search_engine: BaseImageSearch = self.__create_search_engine() - self.face_app: FaceAnalysis = None + self.face_app: Optional[FaceAnalysis] = None + self.playwright: Optional[Playwright] = None self.browser_context: Optional[BrowserContext] = None - def get_output_subdir(self, item: Optional['ProcessingItem'] = None) -> str: # pylint: disable=unused-argument + def cleanup(self) -> None: + if self.browser_context: + self.browser_context.close() + if self.playwright: + self.playwright.stop() + + def get_output_subdir(self) -> str: return 'character_references' - def _execute(self) -> None: + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + char_name = item.metadata['char_name'] + output_folder = self.output_dir / char_name.replace(' ', '_').lower() + expected_files = [ + OutputSpec(path=output_folder / f'{i:02d}.jpg', required=True) + for i in range(self.images_per_character) + ] + return expected_files + + def _get_processing_items(self) -> List[ProcessingItem]: if not self.characters_json.exists(): console.print(f'[red]Characters JSON not found: {self.characters_json}[/red]') - return + return [] with open(self.characters_json, encoding='utf-8') as f: data = json.load(f) characters = data.get('characters', []) - if not characters: - console.print('[yellow]No characters found in JSON[/yellow]') - return - if self.__all_references_exist(characters): - console.print(f'[green]✓ All reference images already exist for {len(characters)} characters (skipping)[/green]') - return - self.face_app = FaceDetector.init() - console.print(f'[blue]Downloading reference images for {len(characters)} characters...[/blue]') - with sync_playwright() as p: - self.browser_context = p.chromium.launch_persistent_context( - 
user_data_dir='/tmp/patchright_profile', - headless=True, - args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], - ignore_default_args=['--enable-automation'], + return [ + ProcessingItem( + episode_id=f"char_{char['name']}", + input_path=self.characters_json, + metadata={'char_name': char['name']}, ) - with create_progress() as progress: - task = progress.add_task('Downloading references', total=len(characters)) - for i, char in enumerate(characters): - char_name = char['name'] - downloaded = False - try: - downloaded = self.__download_character_references(char_name, progress) - except Exception as e: - self.logger.error(f'Failed to download references for {char_name}: {e}') - finally: - progress.advance(task) - if downloaded and i < len(characters) - 1: - delay = random.uniform(settings.image_scraper.request_delay_min, settings.image_scraper.request_delay_max) - time.sleep(delay) - self.browser_context.close() - console.print('[green]✓ Reference download completed[/green]') + for char in characters + ] - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'characters_json' not in args: - raise ValueError('characters_json is required') - - def __all_references_exist(self, characters: List[Dict[str, Any]]) -> bool: - for char in characters: - char_name = char['name'] - output_folder = self.output_dir / char_name.replace(' ', '_').lower() - existing_images = list(output_folder.glob('*.jpg')) - if len(existing_images) < self.images_per_character: - return False + def _load_resources(self) -> bool: + self.face_app = FaceDetector.init() + self.playwright = sync_playwright().start() + self.browser_context = self.playwright.chromium.launch_persistent_context( + user_data_dir='/tmp/patchright_profile', + headless=True, + args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], + ignore_default_args=['--enable-automation'], + ) return True - def __check_existing_images( - self, output_folder: Path, char_name: str, progress, - ) -> 
Optional[int]: - existing_images = list(output_folder.glob('*.jpg')) - if len(existing_images) >= self.images_per_character: - progress.console.print( - f'[green]✓ {char_name}: {len(existing_images)} images ' - f'already exist (skipping)[/green]', - ) - return None - return len(existing_images) - - def __count_faces(self, img) -> int: - faces = self.face_app.get(img) - return len(faces) - - def __create_search_engine(self) -> BaseImageSearch: - if self.search_mode == 'premium': - serpapi_key = settings.image_scraper.serpapi_key - return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) - return DuckDuckGoImageSearch(max_results=self.max_results) - - def __download_character_references(self, char_name: str, progress) -> bool: + def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: + char_name = item.metadata['char_name'] output_folder = self.__prepare_output_folder(char_name) - saved_count = self.__check_existing_images(output_folder, char_name, progress) - if saved_count is None: - return False + existing_images = list(output_folder.glob('*.jpg')) + saved_count = len(existing_images) + if saved_count >= self.images_per_character: + return search_query = f'Serial {self.series_name} {char_name} postać' - progress.console.print( - f'[cyan]Searching [{self.search_engine.name}]: {search_query}[/cyan]', - ) + self.logger.info(f'Searching [{self.search_engine.name}]: {search_query}') for attempt in range(settings.image_scraper.retry_attempts): try: results = self.search_engine.search(search_query) - saved_count = self.__process_search_results( - results, output_folder, saved_count, - ) + saved_count = self.__process_search_results(results, output_folder, saved_count) break - except KeyboardInterrupt: - progress.console.print('\n[yellow]Download interrupted[/yellow]') + except KeyboardInterrupt: # pylint: disable=try-except-raise raise except Exception as e: if attempt < settings.image_scraper.retry_attempts - 1: delay 
= settings.image_scraper.retry_delay * 2 ** attempt self.logger.warning( - f'Attempt {attempt + 1} failed for {char_name}, ' - f'retrying in {delay}s: {e}', + f'Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {e}', ) time.sleep(delay) else: - self.logger.error( - f'All retry attempts failed for {char_name}: {e}', - ) - self.__print_results(char_name, saved_count, progress) - return True + self.logger.error(f'All retry attempts failed for {char_name}: {e}') + self.__log_results(char_name, saved_count) + delay = random.uniform( + settings.image_scraper.request_delay_min, + settings.image_scraper.request_delay_max, + ) + time.sleep(delay) + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'characters_json' not in args: + raise ValueError('characters_json is required') + + + def __count_faces(self, img) -> int: + faces = self.face_app.get(img) + return len(faces) + + def __create_search_engine(self) -> BaseImageSearch: + if self.search_mode == 'premium': + serpapi_key = settings.image_scraper.serpapi_key + return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) + return DuckDuckGoImageSearch(max_results=self.max_results) + def __download_image_with_browser( self, img_url: str, page: Page, @@ -196,23 +181,17 @@ def __prepare_output_folder(self, char_name: str) -> Path: output_folder.mkdir(parents=True, exist_ok=True) return output_folder - def __print_results( - self, char_name: str, saved_count: int, progress, - ) -> None: + def __log_results(self, char_name: str, saved_count: int) -> None: if saved_count >= self.images_per_character: - progress.console.print( - f'[green]✓[/green] {char_name}: ' - f'{saved_count}/{self.images_per_character} images', + self.logger.info( + f'{char_name}: {saved_count}/{self.images_per_character} images', ) elif saved_count > 0: - progress.console.print( - f'[yellow]⚠[/yellow] {char_name}: ' - f'{saved_count}/{self.images_per_character} images (incomplete)', + self.logger.warning( + 
f'{char_name}: {saved_count}/{self.images_per_character} images (incomplete)', ) else: - progress.console.print( - f'[red]✗[/red] {char_name}: No suitable images found', - ) + self.logger.error(f'{char_name}: No suitable images found') def __process_search_results( self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int, diff --git a/preprocessor/modules/scraping/base_scraper.py b/preprocessor/modules/scraping/base_scraper.py index 7eb2b6362..53f07efd1 100644 --- a/preprocessor/modules/scraping/base_scraper.py +++ b/preprocessor/modules/scraping/base_scraper.py @@ -14,11 +14,11 @@ ParserMode, ScraperMethod, ) -from preprocessor.core.base_processor import BaseProcessor from preprocessor.lib.ai import LLMProvider from preprocessor.lib.scraping.clipboard import ScraperClipboard from preprocessor.lib.scraping.crawl4ai import ScraperCrawl4AI from preprocessor.lib.ui.console import console +from preprocessor.modules.base_processor import BaseProcessor class BaseScraper(BaseProcessor): diff --git a/preprocessor/modules/scraping/reference_processor.py b/preprocessor/modules/scraping/reference_processor.py index dbd900cd7..58bf1a62b 100644 --- a/preprocessor/modules/scraping/reference_processor.py +++ b/preprocessor/modules/scraping/reference_processor.py @@ -17,17 +17,17 @@ import numpy as np from preprocessor.config.config import settings -from preprocessor.core.base_processor import ( - BaseProcessor, - OutputSpec, - ProcessingItem, -) from preprocessor.lib.characters.face_detection import FaceDetector from preprocessor.lib.characters.models import ( CandidateFace, FaceData, ) from preprocessor.lib.ui.console import console +from preprocessor.modules.base_processor import ( + BaseProcessor, + OutputSpec, + ProcessingItem, +) warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') @@ -152,7 +152,7 @@ def _load_resources(self) -> bool: self.face_app = FaceDetector.init() return True - def 
_process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: + def _process_item(self, item: ProcessingItem, _missing_outputs: List[OutputSpec]) -> None: char_dir = item.input_path char_name = item.metadata['char_name'] console.print(f'[blue]Processing character: {char_name}[/blue]') diff --git a/preprocessor/modules/search/clients/__init__.py b/preprocessor/modules/search/clients/__init__.py deleted file mode 100644 index a927c8764..000000000 --- a/preprocessor/modules/search/clients/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from preprocessor.modules.search.clients.elasticsearch_queries import ElasticsearchQueries -from preprocessor.modules.search.clients.embedding_service import EmbeddingService -from preprocessor.modules.search.clients.hash_service import HashService -from preprocessor.modules.search.clients.result_formatters import ResultFormatter - -__all__ = ['ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] diff --git a/preprocessor/modules/text/__init__.py b/preprocessor/modules/text/__init__.py index 551160158..9c8daf43f 100644 --- a/preprocessor/modules/text/__init__.py +++ b/preprocessor/modules/text/__init__.py @@ -1,6 +1,6 @@ +from preprocessor.lib.text.import_step import TranscriptionImportStep +from preprocessor.lib.text.transcription import TranscriptionStep from preprocessor.modules.text.analysis import TextAnalysisStep from preprocessor.modules.text.embeddings import TextEmbeddingStep -from preprocessor.modules.text.import_step import TranscriptionImportStep -from preprocessor.modules.text.transcription import TranscriptionStep __all__ = ['TextAnalysisStep', 'TextEmbeddingStep', 'TranscriptionImportStep', 'TranscriptionStep'] diff --git a/preprocessor/modules/transcription/__init__.py b/preprocessor/modules/transcription/__init__.py new file mode 100644 index 000000000..de1657d26 --- /dev/null +++ b/preprocessor/modules/transcription/__init__.py @@ -0,0 +1 @@ +__all__: list = [] diff --git 
a/preprocessor/lib/validation/base_result.py b/preprocessor/modules/validation/base_result.py similarity index 100% rename from preprocessor/lib/validation/base_result.py rename to preprocessor/modules/validation/base_result.py diff --git a/preprocessor/modules/validation/episode_stats.py b/preprocessor/modules/validation/episode_stats.py index 32ab0200b..ed1c64ed5 100644 --- a/preprocessor/modules/validation/episode_stats.py +++ b/preprocessor/modules/validation/episode_stats.py @@ -21,10 +21,10 @@ OUTPUT_FILE_NAMES, OUTPUT_FILE_PATTERNS, ) -from preprocessor.core.path_manager import PathManager from preprocessor.lib.episodes import EpisodeInfo -from preprocessor.lib.validation.base_result import ValidationStatusMixin -from preprocessor.lib.validation.file_validators import FileValidator +from preprocessor.lib.io.path_manager import PathManager +from preprocessor.modules.validation.base_result import ValidationStatusMixin +from preprocessor.modules.validation.file_validators import FileValidator ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs diff --git a/preprocessor/lib/validation/file_validators.py b/preprocessor/modules/validation/file_validators.py similarity index 100% rename from preprocessor/lib/validation/file_validators.py rename to preprocessor/modules/validation/file_validators.py diff --git a/preprocessor/modules/validation/global_validator.py b/preprocessor/modules/validation/global_validator.py index fe8986eb5..f8e3d0b21 100644 --- a/preprocessor/modules/validation/global_validator.py +++ b/preprocessor/modules/validation/global_validator.py @@ -1,8 +1,8 @@ from pathlib import Path from typing import List -from preprocessor.lib.validation.base_result import BaseValidationResult -from preprocessor.lib.validation.file_validators import FileValidator +from preprocessor.modules.validation.base_result import BaseValidationResult +from preprocessor.modules.validation.file_validators import FileValidator class 
GlobalValidationResult(BaseValidationResult): diff --git a/preprocessor/modules/validation/validator.py b/preprocessor/modules/validation/validator.py index 98d79a978..ae84b0a4d 100644 --- a/preprocessor/modules/validation/validator.py +++ b/preprocessor/modules/validation/validator.py @@ -9,9 +9,9 @@ from rich.progress import track from preprocessor.config.config import settings -from preprocessor.core.path_manager import PathManager from preprocessor.lib.episodes import EpisodeManager from preprocessor.lib.io.files import FileOperations +from preprocessor.lib.io.path_manager import PathManager from preprocessor.modules.validation.episode_stats import EpisodeStats from preprocessor.modules.validation.report_generator import ReportGenerator from preprocessor.modules.validation.season_comparator import SeasonComparison diff --git a/preprocessor/modules/video/frame_export.py b/preprocessor/modules/video/frame_export.py index 60bc8f679..19c111533 100644 --- a/preprocessor/modules/video/frame_export.py +++ b/preprocessor/modules/video/frame_export.py @@ -21,7 +21,7 @@ from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.lib.io.files import atomic_write_json -from preprocessor.modules.video.strategies.strategy_factory import KeyframeStrategyFactory +from preprocessor.lib.video.strategies.strategy_factory import KeyframeStrategyFactory class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExportConfig]): diff --git a/preprocessor/modules/video/strategies/__init__.py b/preprocessor/modules/video/strategies/__init__.py deleted file mode 100644 index 99c7a0e38..000000000 --- a/preprocessor/modules/video/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from preprocessor.modules.video.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.modules.video.strategies.scene_changes_strategy import SceneChangesStrategy - -__all__ = ['BaseKeyframeStrategy', 
'SceneChangesStrategy'] From 641bf2f939c500ef06477aef462cdb1a8fc73c68 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Feb 2026 21:33:11 +0100 Subject: [PATCH 19/89] Move lib to services and add validation step Refactor codebase namespaces and add validation phase. - Move many modules from preprocessor.lib / preprocessor.modules to preprocessor.services and preprocessor.steps and update imports accordingly (UI, IO, media, episodes, search, transcription, video, etc.). - Remove legacy package __init__ files under preprocessor/lib and preprocessor/modules. - Add a VALIDATION phase and register a Validation step in the pipeline (pipeline_factory) using new ValidationConfig. - Introduce ValidationConfig (anomaly_threshold, episodes_info_json) and ValidationResult dataclass in core.artifacts. - Add PipelineStep._check_cache_validity helper to centralize cache/skip logic. - Update various files to use services.* and steps.* paths and adjust typing imports where necessary. This commit is primarily a namespace reorganization plus the addition of validation plumbing and small core helpers to support it. 
--- preprocessor/__main__.py | 2 +- preprocessor/app/pipeline.py | 2 +- preprocessor/app/pipeline_builder.py | 4 +- preprocessor/app/pipeline_factory.py | 51 +++-- preprocessor/cli/cli_main.py | 10 +- preprocessor/cli/helpers.py | 6 +- preprocessor/cli/skip_list_builder.py | 2 +- preprocessor/config/config.py | 2 +- preprocessor/config/step_configs.py | 11 +- preprocessor/core/artifacts.py | 5 + preprocessor/core/base_step.py | 14 ++ preprocessor/core/context.py | 4 +- preprocessor/core/state_manager.py | 2 +- .../lib/characters/image_search/__init__.py | 5 - preprocessor/lib/io/__init__.py | 5 - preprocessor/lib/media/__init__.py | 9 - preprocessor/lib/search/__init__.py | 4 - preprocessor/lib/search/clients/__init__.py | 6 - preprocessor/lib/transcription/__init__.py | 25 --- .../lib/transcription/processors/__init__.py | 5 - preprocessor/lib/video/__init__.py | 3 - preprocessor/lib/video/strategies/__init__.py | 4 - preprocessor/modules/characters/__init__.py | 3 - preprocessor/modules/packaging/__init__.py | 3 - preprocessor/modules/scraping/__init__.py | 6 - preprocessor/modules/search/indexing.py | 114 ---------- preprocessor/modules/text/__init__.py | 6 - preprocessor/modules/text/analysis.py | 50 ----- .../modules/transcription/__init__.py | 1 - preprocessor/modules/validation/__init__.py | 7 - preprocessor/modules/video/__init__.py | 0 preprocessor/modules/vision/__init__.py | 8 - .../modules/vision/character_detection.py | 118 ---------- preprocessor/{lib => services}/__init__.py | 0 preprocessor/{lib => services}/ai/__init__.py | 4 +- preprocessor/{lib => services}/ai/clients.py | 2 +- preprocessor/{lib => services}/ai/models.py | 0 preprocessor/{lib => services}/ai/provider.py | 6 +- .../{lib => services}/audio/__init__.py | 0 .../{lib => services}/audio/extraction.py | 0 .../{lib => services}/characters/__init__.py | 4 +- .../characters/face_detection.py | 2 +- .../characters/image_search/__init__.py | 5 + .../image_search/duckduckgo_image_search.py | 
2 +- .../image_search/google_image_search.py | 2 +- .../characters/image_search/image_search.py | 0 .../{lib => services}/characters/models.py | 0 .../characters/reference_downloader.py | 8 +- .../{lib => services}/core/__init__.py | 4 +- .../core}/base_processor.py | 8 +- .../{lib => services}/core/logging.py | 2 +- preprocessor/{lib => services}/core/time.py | 0 .../{lib => services}/episodes/__init__.py | 2 +- .../episodes/episode_manager.py | 4 +- preprocessor/services/io/__init__.py | 5 + .../{lib => services}/io/detection_io.py | 2 +- preprocessor/{lib => services}/io/files.py | 0 preprocessor/{lib => services}/io/hashing.py | 8 +- preprocessor/{lib => services}/io/metadata.py | 0 .../{lib => services}/io/path_manager.py | 4 +- .../{lib => services}/io/path_resolver.py | 2 +- .../{lib => services}/io/path_service.py | 2 +- preprocessor/services/media/__init__.py | 9 + .../{lib => services}/media/ffmpeg.py | 0 .../{lib => services}/media/resolution.py | 0 .../media/scene_detection.py | 0 preprocessor/services/scraping/__init__.py | 6 + .../scraping/base_scraper.py | 10 +- .../scraping/base_scraper_step.py | 0 .../scraping/character_scraper.py | 4 +- .../{lib => services}/scraping/clipboard.py | 2 +- .../{lib => services}/scraping/crawl4ai.py | 2 +- .../scraping/episode_scraper.py | 4 +- .../scraping/reference_processor.py | 8 +- preprocessor/services/search/__init__.py | 4 + .../services/search/clients/__init__.py | 6 + .../search/clients/elasticsearch_queries.py | 2 +- .../search/clients/embedding_service.py | 0 .../search/clients/hash_service.py | 2 +- .../search/clients/result_formatters.py | 0 .../{lib => services}/search/elasticsearch.py | 0 .../search/embedding_model.py | 2 +- .../{lib => services}/text/__init__.py | 4 +- .../{lib => services}/text/import_step.py | 4 +- .../{lib => services}/text/language_config.py | 0 .../{lib => services}/text/text_statistics.py | 2 +- .../services/transcription/__init__.py | 25 +++ 
.../transcription/engines}/__init__.py | 0 .../transcription/engines/base_engine.py | 0 .../engines/elevenlabs_engine.py | 6 +- .../transcription/engines/whisper_engine.py | 6 +- .../transcription/generators}/__init__.py | 0 .../generators/base_generator.py | 2 +- .../generators/json_generator.py | 4 +- .../generators/multi_format_generator.py | 10 +- .../transcription/generators/srt_generator.py | 2 +- .../transcription/generators/txt_generator.py | 2 +- .../transcription/processors/__init__.py | 5 + .../processors/audio_normalizer.py | 4 +- .../processors/episode_info_processor.py | 4 +- .../processors/normalized_audio_processor.py | 4 +- .../transcription/sound_classification.py | 0 .../{lib => services}/transcription/utils.py | 0 .../transcription/whisper.py | 4 +- preprocessor/{lib => services}/ui/__init__.py | 4 +- preprocessor/{lib => services}/ui/console.py | 2 +- preprocessor/{lib => services}/ui/progress.py | 4 +- .../validation/base_result.py | 0 .../validation/episode_stats.py | 8 +- .../validation/file_validators.py | 0 .../validation/global_validator.py | 4 +- .../validation/report_generator.py | 4 +- .../validation/season_comparator.py | 2 +- .../validation/validator.py | 12 +- preprocessor/services/video/__init__.py | 3 + .../{lib => services}/video/discovery.py | 0 .../{lib => services}/video/emotion_utils.py | 2 +- .../{lib => services}/video/frame_utils.py | 0 .../{lib => services}/video/image_hasher.py | 0 .../services/video/strategies/__init__.py | 4 + .../video/strategies/base_strategy.py | 0 .../strategies/scene_changes_strategy.py | 4 +- .../video/strategies/strategy_factory.py | 4 +- .../generators => steps}/__init__.py | 0 preprocessor/steps/audio/__init__.py | 3 + .../{lib => steps}/audio/separation.py | 162 ++++++++++---- preprocessor/steps/packaging/__init__.py | 3 + .../{modules => steps}/packaging/archives.py | 39 +++- preprocessor/steps/scraping/__init__.py | 6 + .../scraping/character_scraper_step.py | 4 +- 
.../scraping/episode_scraper_step.py | 4 +- .../scraping/reference_processor_step.py | 41 +++- .../{modules => steps/search}/__init__.py | 0 .../search/document_generation.py | 70 ++++-- preprocessor/steps/search/indexing.py | 154 +++++++++++++ preprocessor/steps/text/__init__.py | 6 + preprocessor/steps/text/analysis.py | 104 +++++++++ .../{modules => steps}/text/embeddings.py | 129 ++++++++--- .../{lib => steps}/text/transcription.py | 80 +++++-- preprocessor/steps/validation/__init__.py | 3 + .../steps/validation/validator_step.py | 47 ++++ .../search => steps/video}/__init__.py | 0 .../{modules => steps}/video/frame_export.py | 136 +++++++++--- .../video/scene_detection.py | 91 ++++++-- .../{modules => steps}/video/transcoding.py | 208 ++++++++++++------ preprocessor/steps/vision/__init__.py | 8 + .../steps/vision/character_detection.py | 173 +++++++++++++++ .../{modules => steps}/vision/embeddings.py | 116 +++++++--- .../vision/emotion_detection.py | 0 .../vision/face_clustering.py | 0 .../vision/image_hashing.py | 139 ++++++++---- .../vision/object_detection.py | 0 152 files changed, 1657 insertions(+), 850 deletions(-) delete mode 100644 preprocessor/lib/characters/image_search/__init__.py delete mode 100644 preprocessor/lib/io/__init__.py delete mode 100644 preprocessor/lib/media/__init__.py delete mode 100644 preprocessor/lib/search/__init__.py delete mode 100644 preprocessor/lib/search/clients/__init__.py delete mode 100644 preprocessor/lib/transcription/__init__.py delete mode 100644 preprocessor/lib/transcription/processors/__init__.py delete mode 100644 preprocessor/lib/video/__init__.py delete mode 100644 preprocessor/lib/video/strategies/__init__.py delete mode 100644 preprocessor/modules/characters/__init__.py delete mode 100644 preprocessor/modules/packaging/__init__.py delete mode 100644 preprocessor/modules/scraping/__init__.py delete mode 100644 preprocessor/modules/search/indexing.py delete mode 100644 preprocessor/modules/text/__init__.py 
delete mode 100644 preprocessor/modules/text/analysis.py delete mode 100644 preprocessor/modules/transcription/__init__.py delete mode 100644 preprocessor/modules/validation/__init__.py delete mode 100644 preprocessor/modules/video/__init__.py delete mode 100644 preprocessor/modules/vision/__init__.py delete mode 100644 preprocessor/modules/vision/character_detection.py rename preprocessor/{lib => services}/__init__.py (100%) rename preprocessor/{lib => services}/ai/__init__.py (56%) rename preprocessor/{lib => services}/ai/clients.py (98%) rename preprocessor/{lib => services}/ai/models.py (100%) rename preprocessor/{lib => services}/ai/provider.py (97%) rename preprocessor/{lib => services}/audio/__init__.py (100%) rename preprocessor/{lib => services}/audio/extraction.py (100%) rename preprocessor/{lib => services}/characters/__init__.py (55%) rename preprocessor/{lib => services}/characters/face_detection.py (99%) create mode 100644 preprocessor/services/characters/image_search/__init__.py rename preprocessor/{lib => services}/characters/image_search/duckduckgo_image_search.py (81%) rename preprocessor/{lib => services}/characters/image_search/google_image_search.py (91%) rename preprocessor/{lib => services}/characters/image_search/image_search.py (100%) rename preprocessor/{lib => services}/characters/models.py (100%) rename preprocessor/{modules => services}/characters/reference_downloader.py (97%) rename preprocessor/{lib => services}/core/__init__.py (57%) rename preprocessor/{modules => services/core}/base_processor.py (96%) rename preprocessor/{lib => services}/core/logging.py (97%) rename preprocessor/{lib => services}/core/time.py (100%) rename preprocessor/{lib => services}/episodes/__init__.py (57%) rename preprocessor/{lib => services}/episodes/episode_manager.py (98%) create mode 100644 preprocessor/services/io/__init__.py rename preprocessor/{lib => services}/io/detection_io.py (90%) rename preprocessor/{lib => services}/io/files.py (100%) rename 
preprocessor/{lib => services}/io/hashing.py (86%) rename preprocessor/{lib => services}/io/metadata.py (100%) rename preprocessor/{lib => services}/io/path_manager.py (80%) rename preprocessor/{lib => services}/io/path_resolver.py (84%) rename preprocessor/{lib => services}/io/path_service.py (94%) create mode 100644 preprocessor/services/media/__init__.py rename preprocessor/{lib => services}/media/ffmpeg.py (100%) rename preprocessor/{lib => services}/media/resolution.py (100%) rename preprocessor/{lib => services}/media/scene_detection.py (100%) create mode 100644 preprocessor/services/scraping/__init__.py rename preprocessor/{modules => services}/scraping/base_scraper.py (92%) rename preprocessor/{modules => services}/scraping/base_scraper_step.py (100%) rename preprocessor/{modules => services}/scraping/character_scraper.py (87%) rename preprocessor/{lib => services}/scraping/clipboard.py (93%) rename preprocessor/{lib => services}/scraping/crawl4ai.py (97%) rename preprocessor/{modules => services}/scraping/episode_scraper.py (97%) rename preprocessor/{modules => services}/scraping/reference_processor.py (99%) create mode 100644 preprocessor/services/search/__init__.py create mode 100644 preprocessor/services/search/clients/__init__.py rename preprocessor/{lib => services}/search/clients/elasticsearch_queries.py (99%) rename preprocessor/{lib => services}/search/clients/embedding_service.py (100%) rename preprocessor/{lib => services}/search/clients/hash_service.py (94%) rename preprocessor/{lib => services}/search/clients/result_formatters.py (100%) rename preprocessor/{lib => services}/search/elasticsearch.py (100%) rename preprocessor/{lib => services}/search/embedding_model.py (89%) rename preprocessor/{lib => services}/text/__init__.py (53%) rename preprocessor/{lib => services}/text/import_step.py (98%) rename preprocessor/{lib => services}/text/language_config.py (100%) rename preprocessor/{lib => services}/text/text_statistics.py (99%) create mode 
100644 preprocessor/services/transcription/__init__.py rename preprocessor/{lib/scraping => services/transcription/engines}/__init__.py (100%) rename preprocessor/{lib => services}/transcription/engines/base_engine.py (100%) rename preprocessor/{lib => services}/transcription/engines/elevenlabs_engine.py (96%) rename preprocessor/{lib => services}/transcription/engines/whisper_engine.py (90%) rename preprocessor/{lib/transcription/engines => services/transcription/generators}/__init__.py (100%) rename preprocessor/{lib => services}/transcription/generators/base_generator.py (93%) rename preprocessor/{lib => services}/transcription/generators/json_generator.py (94%) rename preprocessor/{lib => services}/transcription/generators/multi_format_generator.py (95%) rename preprocessor/{lib => services}/transcription/generators/srt_generator.py (93%) rename preprocessor/{lib => services}/transcription/generators/txt_generator.py (87%) create mode 100644 preprocessor/services/transcription/processors/__init__.py rename preprocessor/{lib => services}/transcription/processors/audio_normalizer.py (95%) rename preprocessor/{lib => services}/transcription/processors/episode_info_processor.py (96%) rename preprocessor/{lib => services}/transcription/processors/normalized_audio_processor.py (96%) rename preprocessor/{lib => services}/transcription/sound_classification.py (100%) rename preprocessor/{lib => services}/transcription/utils.py (100%) rename preprocessor/{lib => services}/transcription/whisper.py (95%) rename preprocessor/{lib => services}/ui/__init__.py (62%) rename preprocessor/{lib => services}/ui/console.py (97%) rename preprocessor/{lib => services}/ui/progress.py (95%) rename preprocessor/{modules => services}/validation/base_result.py (100%) rename preprocessor/{modules => services}/validation/episode_stats.py (98%) rename preprocessor/{modules => services}/validation/file_validators.py (100%) rename preprocessor/{modules => 
services}/validation/global_validator.py (96%) rename preprocessor/{modules => services}/validation/report_generator.py (88%) rename preprocessor/{modules => services}/validation/season_comparator.py (98%) rename preprocessor/{modules => services}/validation/validator.py (93%) create mode 100644 preprocessor/services/video/__init__.py rename preprocessor/{lib => services}/video/discovery.py (100%) rename preprocessor/{lib => services}/video/emotion_utils.py (98%) rename preprocessor/{lib => services}/video/frame_utils.py (100%) rename preprocessor/{lib => services}/video/image_hasher.py (100%) create mode 100644 preprocessor/services/video/strategies/__init__.py rename preprocessor/{lib => services}/video/strategies/base_strategy.py (100%) rename preprocessor/{lib => services}/video/strategies/scene_changes_strategy.py (93%) rename preprocessor/{lib => services}/video/strategies/strategy_factory.py (69%) rename preprocessor/{lib/transcription/generators => steps}/__init__.py (100%) create mode 100644 preprocessor/steps/audio/__init__.py rename preprocessor/{lib => steps}/audio/separation.py (64%) create mode 100644 preprocessor/steps/packaging/__init__.py rename preprocessor/{modules => steps}/packaging/archives.py (52%) create mode 100644 preprocessor/steps/scraping/__init__.py rename preprocessor/{modules => steps}/scraping/character_scraper_step.py (69%) rename preprocessor/{modules => steps}/scraping/episode_scraper_step.py (82%) rename preprocessor/{modules => steps}/scraping/reference_processor_step.py (71%) rename preprocessor/{modules => steps/search}/__init__.py (100%) rename preprocessor/{modules => steps}/search/document_generation.py (74%) create mode 100644 preprocessor/steps/search/indexing.py create mode 100644 preprocessor/steps/text/__init__.py create mode 100644 preprocessor/steps/text/analysis.py rename preprocessor/{modules => steps}/text/embeddings.py (68%) rename preprocessor/{lib => steps}/text/transcription.py (60%) create mode 100644 
preprocessor/steps/validation/__init__.py create mode 100644 preprocessor/steps/validation/validator_step.py rename preprocessor/{modules/search => steps/video}/__init__.py (100%) rename preprocessor/{modules => steps}/video/frame_export.py (74%) rename preprocessor/{modules => steps}/video/scene_detection.py (53%) rename preprocessor/{modules => steps}/video/transcoding.py (51%) create mode 100644 preprocessor/steps/vision/__init__.py create mode 100644 preprocessor/steps/vision/character_detection.py rename preprocessor/{modules => steps}/vision/embeddings.py (65%) rename preprocessor/{modules => steps}/vision/emotion_detection.py (100%) rename preprocessor/{modules => steps}/vision/face_clustering.py (100%) rename preprocessor/{modules => steps}/vision/image_hashing.py (54%) rename preprocessor/{modules => steps}/vision/object_detection.py (100%) diff --git a/preprocessor/__main__.py b/preprocessor/__main__.py index 99b90fa10..a3072a518 100644 --- a/preprocessor/__main__.py +++ b/preprocessor/__main__.py @@ -2,7 +2,7 @@ import sys from preprocessor.cli import cli -from preprocessor.lib.ui.console import console +from preprocessor.services.ui.console import console logging.getLogger('matplotlib').setLevel(logging.ERROR) logging.getLogger('matplotlib.font_manager').setLevel(logging.ERROR) diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index e903b66c1..e4a9c2b6f 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -11,7 +11,7 @@ from preprocessor.app.step_builder import StepBuilder if TYPE_CHECKING: - from preprocessor.lib.core.logging import ErrorHandlingLogger + from preprocessor.services.core.logging import ErrorHandlingLogger class PipelineDefinition: diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 5c396d9f5..499b4f5eb 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -8,8 +8,8 @@ from preprocessor.core.artifacts 
import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.episodes.episode_manager import EpisodeManager -from preprocessor.lib.video.discovery import VideoDiscovery +from preprocessor.services.episodes.episode_manager import EpisodeManager +from preprocessor.services.video.discovery import VideoDiscovery if TYPE_CHECKING: from preprocessor.app.pipeline import PipelineDefinition diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 8de631d83..24b8c2381 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -25,6 +25,7 @@ TextAnalysisConfig, TextEmbeddingConfig, TranscodeConfig, + ValidationConfig, VideoEmbeddingConfig, WhisperTranscriptionConfig, ) @@ -32,6 +33,7 @@ SCRAPING = Phase("SCRAPING", color="blue") PROCESSING = Phase("PROCESSING", color="green") INDEXING = Phase("INDEXING", color="yellow") +VALIDATION = Phase("VALIDATION", color="magenta") def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals @@ -40,7 +42,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t episodes_metadata = StepBuilder( id="scrape_episodes", phase=SCRAPING, - module="preprocessor.modules.scraping.episode_scraper_step:EpisodeScraperStep", + module="preprocessor.steps.scraping.episode_scraper_step:EpisodeScraperStep", description="Scrapes episode metadata from wiki", produces=["episodes.json"], needs=[], @@ -57,7 +59,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t characters_metadata = StepBuilder( id="scrape_characters", phase=SCRAPING, - module="preprocessor.modules.scraping.character_scraper_step:CharacterScraperStep", + module="preprocessor.steps.scraping.character_scraper_step:CharacterScraperStep", description="Scrapes character data from wiki", produces=["characters.json"], needs=[], @@ -73,7 +75,7 @@ 
def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t character_references = StepBuilder( id="process_references", phase=SCRAPING, - module="preprocessor.modules.scraping.reference_processor_step:CharacterReferenceStep", + module="preprocessor.steps.scraping.reference_processor_step:CharacterReferenceStep", description="Downloads and processes character reference images", produces=["character_faces/{character}/*.jpg"], needs=[characters_metadata], @@ -88,7 +90,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t transcoded_videos = StepBuilder( id="transcode", phase=PROCESSING, - module="preprocessor.modules.video.transcoding:VideoTranscoderStep", + module="preprocessor.steps.video.transcoding:VideoTranscoderStep", description=f"Conversion to {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} with adaptive bitrate", produces=["transcoded_videos/{season}/{episode}.mp4"], needs=[], @@ -105,7 +107,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t scene_data = StepBuilder( id="detect_scenes", phase=PROCESSING, - module="preprocessor.modules.video.scene_detection:SceneDetectorStep", + module="preprocessor.steps.video.scene_detection:SceneDetectorStep", description="Detects scene changes using TransNetV2", produces=["scene_detections/{season}/{episode}.json"], needs=[transcoded_videos], @@ -118,7 +120,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t exported_frames = StepBuilder( id="export_frames", phase=PROCESSING, - module="preprocessor.modules.video.frame_export:FrameExporterStep", + module="preprocessor.steps.video.frame_export:FrameExporterStep", description="Exports frames (PNG) at scene boundaries", produces=["frames/{season}/{episode}/*.png"], needs=[scene_data], @@ -128,7 +130,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t transcription_data = StepBuilder( 
id="transcribe", phase=PROCESSING, - module="preprocessor.modules.text.transcription:TranscriptionStep", + module="preprocessor.steps.text.transcription:TranscriptionStep", description=f"Audio transcription using {series_config.processing.transcription.mode}", produces=["transcriptions/{season}/{episode}.json"], needs=[transcoded_videos], @@ -144,7 +146,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t separated_audio = StepBuilder( id="separate_sounds", phase=PROCESSING, - module="preprocessor.modules.audio.separation:SoundSeparationStep", + module="preprocessor.steps.audio.separation:SoundSeparationStep", description="Separates dialogue from sound effects", produces=["separated_audio/{season}/{episode}/"], needs=[transcription_data], @@ -154,7 +156,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t text_stats = StepBuilder( id="analyze_text", phase=PROCESSING, - module="preprocessor.modules.text.analysis:TextAnalysisStep", + module="preprocessor.steps.text.analysis:TextAnalysisStep", description="Analyzes text statistics (word frequency, sentiment)", produces=["text_analysis/{season}/{episode}.json"], needs=[transcription_data], @@ -164,7 +166,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t text_embeddings = StepBuilder( id="text_embeddings", phase=PROCESSING, - module="preprocessor.modules.text.embeddings:TextEmbeddingStep", + module="preprocessor.steps.text.embeddings:TextEmbeddingStep", description="Generates text embeddings using Qwen3-VL-Embedding", produces=["embeddings/text/{season}/{episode}.npy"], needs=[text_stats], @@ -180,7 +182,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t image_hashes = StepBuilder( id="image_hashing", phase=PROCESSING, - module="preprocessor.modules.vision.image_hashing:ImageHashStep", + module="preprocessor.steps.vision.image_hashing:ImageHashStep", description="Perceptual frame hashing 
(phash, dhash, wavelet)", produces=["hashes/{season}/{episode}.json"], needs=[exported_frames], @@ -190,7 +192,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t video_embeddings = StepBuilder( id="video_embeddings", phase=PROCESSING, - module="preprocessor.modules.vision.embeddings:VideoEmbeddingStep", + module="preprocessor.steps.vision.embeddings:VideoEmbeddingStep", description="Visual embeddings using Qwen3-VL-Embedding", produces=["embeddings/vision/{season}/{episode}.npy"], needs=[exported_frames, image_hashes], @@ -204,7 +206,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t character_detections = StepBuilder( id="detect_characters", phase=PROCESSING, - module="preprocessor.modules.vision.character_detection:CharacterDetectorStep", + module="preprocessor.steps.vision.character_detection:CharacterDetectorStep", description="Recognizes characters in frames using InsightFace", produces=["detections/characters/{season}/{episode}.json"], needs=[exported_frames], @@ -214,7 +216,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t emotion_data = StepBuilder( id="detect_emotions", phase=PROCESSING, - module="preprocessor.modules.vision.emotion_detection:EmotionDetectionStep", + module="preprocessor.steps.vision.emotion_detection:EmotionDetectionStep", description="Detects emotions on faces using EmoNet", produces=["detections/emotions/{season}/{episode}.json"], needs=[exported_frames], @@ -224,7 +226,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t face_clusters = StepBuilder( id="cluster_faces", phase=PROCESSING, - module="preprocessor.modules.vision.face_clustering:FaceClusteringStep", + module="preprocessor.steps.vision.face_clustering:FaceClusteringStep", description="Face clustering using HDBSCAN", produces=["clusters/faces/{season}/{episode}.json"], needs=[exported_frames], @@ -234,7 +236,7 @@ def build_pipeline(series_name: 
str) -> PipelineDefinition: # pylint: disable=t object_detections = StepBuilder( id="detect_objects", phase=PROCESSING, - module="preprocessor.modules.vision.object_detection:ObjectDetectionStep", + module="preprocessor.steps.vision.object_detection:ObjectDetectionStep", description="General object detection using D-FINE", produces=["detections/objects/{season}/{episode}.json"], needs=[exported_frames], @@ -244,7 +246,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t elastic_documents = StepBuilder( id="generate_elastic_docs", phase=INDEXING, - module="preprocessor.modules.search.document_generation:DocumentGeneratorStep", + module="preprocessor.steps.search.document_generation:DocumentGeneratorStep", description="Combines all data into Elasticsearch documents", produces=["elastic_documents/{season}/{episode}.ndjson"], needs=[ @@ -261,7 +263,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t episode_archives = StepBuilder( id="generate_archives", phase=INDEXING, - module="preprocessor.modules.packaging.archives:ArchiveGenerationStep", + module="preprocessor.steps.packaging.archives:ArchiveGenerationStep", description="Creates ZIP archives per episode (all artifacts)", produces=["archives/{season}/{episode}.zip"], needs=[elastic_documents], @@ -271,7 +273,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t indexed_data = StepBuilder( id="index_to_elasticsearch", phase=INDEXING, - module="preprocessor.modules.search.indexing:ElasticsearchIndexerStep", + module="preprocessor.steps.search.indexing:ElasticsearchIndexerStep", description="Indexes documents into Elasticsearch", produces=[""], needs=[elastic_documents], @@ -283,6 +285,16 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) + validation = StepBuilder( + id="validate", + phase=VALIDATION, + module="preprocessor.steps.validation.validator_step:ValidationStep", + 
description="Validates all processed data and generates reports", + produces=["validation_reports/{season}/"], + needs=[indexed_data, episode_archives], + config=ValidationConfig(), + ) + pipeline = PipelineDefinition(name=f"{series_name}_processing") pipeline.register(episodes_metadata) @@ -309,6 +321,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(elastic_documents) pipeline.register(episode_archives) pipeline.register(indexed_data) + pipeline.register(validation) pipeline.validate() diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 1291f49ca..e92009507 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -14,7 +14,7 @@ from preprocessor.cli.helpers import setup_pipeline_context from preprocessor.cli.skip_list_builder import SkipListBuilder from preprocessor.config.series_config import SeriesConfig -from preprocessor.lib.io.path_resolver import PathResolver +from preprocessor.services.io.path_resolver import PathResolver @click.group() @@ -156,10 +156,10 @@ def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-state from elasticsearch import AsyncElasticsearch # pylint: disable=import-outside-toplevel - from preprocessor.lib.search.clients.elasticsearch_queries import ElasticsearchQueries # pylint: disable=import-outside-toplevel - from preprocessor.lib.search.clients.embedding_service import EmbeddingService # pylint: disable=import-outside-toplevel - from preprocessor.lib.search.clients.hash_service import HashService # pylint: disable=import-outside-toplevel - from preprocessor.lib.search.clients.result_formatters import ResultFormatter # pylint: disable=import-outside-toplevel + from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries # pylint: disable=import-outside-toplevel + from preprocessor.services.search.clients.embedding_service import EmbeddingService # pylint: 
disable=import-outside-toplevel + from preprocessor.services.search.clients.hash_service import HashService # pylint: disable=import-outside-toplevel + from preprocessor.services.search.clients.result_formatters import ResultFormatter # pylint: disable=import-outside-toplevel if not any([ text, text_semantic, text_to_video, image, phash, character, emotion, diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index f18dd8fa9..05ea1b449 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -5,9 +5,9 @@ from preprocessor.core.context import ExecutionContext from preprocessor.core.state_manager import StateManager -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.lib.episodes.episode_manager import EpisodeManager -from preprocessor.lib.io.path_resolver import PathResolver +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.episodes.episode_manager import EpisodeManager +from preprocessor.services.io.path_resolver import PathResolver @dataclass diff --git a/preprocessor/cli/skip_list_builder.py b/preprocessor/cli/skip_list_builder.py index 960975215..5285861a7 100644 --- a/preprocessor/cli/skip_list_builder.py +++ b/preprocessor/cli/skip_list_builder.py @@ -4,7 +4,7 @@ ) from preprocessor.config.series_config import SeriesConfig -from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger class SkipListBuilder: diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 28dd192ea..223fc7c98 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -14,7 +14,7 @@ from pydantic import SecretStr -from preprocessor.lib.media.resolution import Resolution +from preprocessor.services.media.resolution import Resolution is_docker = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' BASE_OUTPUT_DIR = Path('/app/output_data') if is_docker else 
Path('preprocessor/output_data') diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index cea26da42..1e8b407ac 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -1,4 +1,8 @@ -from typing import List +from pathlib import Path +from typing import ( + List, + Optional, +) from pydantic import ( BaseModel, @@ -8,7 +12,7 @@ from typing_extensions import Self from preprocessor.config.enums import KeyframeStrategy -from preprocessor.lib.media.resolution import Resolution +from preprocessor.services.media.resolution import Resolution class TranscodeConfig(BaseModel): @@ -109,7 +113,8 @@ class ArchiveConfig(BaseModel): pass class ValidationConfig(BaseModel): - pass + anomaly_threshold: float = 20.0 + episodes_info_json: Optional[Path] = None class EpisodeScraperConfig(BaseModel): diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index db08c08d4..02dff410c 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -107,4 +107,9 @@ class ObjectDetectionData(EpisodeArtifact): class ArchiveArtifact(EpisodeArtifact): path: Path +@dataclass(frozen=True) +class ValidationResult(Artifact): + season: str + validation_report_dir: Path + ProcessedEpisode = ElasticDocuments diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index ecf5d1809..9aece3655 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -2,6 +2,7 @@ ABC, abstractmethod, ) +from pathlib import Path from typing import ( TYPE_CHECKING, Generic, @@ -37,3 +38,16 @@ def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: @abstractmethod def name(self) -> str: pass + + def _check_cache_validity( + self, + output_path: Path, + context: "ExecutionContext", + episode_id: str, + cache_description: str, + ) -> bool: + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, 
episode_id): + context.logger.info(f'Skipping {episode_id} ({cache_description})') + return True + return False diff --git a/preprocessor/core/context.py b/preprocessor/core/context.py index 198eedf24..195ce21b3 100644 --- a/preprocessor/core/context.py +++ b/preprocessor/core/context.py @@ -5,11 +5,11 @@ Optional, ) -from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger if TYPE_CHECKING: from preprocessor.core.state_manager import StateManager - from preprocessor.lib.episodes.episode_manager import EpisodeInfo + from preprocessor.services.episodes.episode_manager import EpisodeInfo class ExecutionContext: diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index c3b4ed44f..2a34ed042 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -13,7 +13,7 @@ Optional, ) -from preprocessor.lib.ui.console import console +from preprocessor.services.ui.console import console @dataclass diff --git a/preprocessor/lib/characters/image_search/__init__.py b/preprocessor/lib/characters/image_search/__init__.py deleted file mode 100644 index f1bf79335..000000000 --- a/preprocessor/lib/characters/image_search/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from preprocessor.lib.characters.image_search.duckduckgo_image_search import DuckDuckGoImageSearch -from preprocessor.lib.characters.image_search.google_image_search import GoogleImageSearch -from preprocessor.lib.characters.image_search.image_search import BaseImageSearch - -__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch'] diff --git a/preprocessor/lib/io/__init__.py b/preprocessor/lib/io/__init__.py deleted file mode 100644 index c595480b1..000000000 --- a/preprocessor/lib/io/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from preprocessor.lib.io.path_manager import PathManager -from preprocessor.lib.io.path_resolver import PathResolver -from 
preprocessor.lib.io.path_service import PathService - -__all__ = ['PathManager', 'PathResolver', 'PathService'] diff --git a/preprocessor/lib/media/__init__.py b/preprocessor/lib/media/__init__.py deleted file mode 100644 index 4a43dec8f..000000000 --- a/preprocessor/lib/media/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -from preprocessor.lib.media.ffmpeg import FFmpegWrapper -from preprocessor.lib.media.resolution import Resolution - -__all__ = ['FFmpegWrapper', 'Resolution'] -try: - from preprocessor.lib.media.scene_detection import TransNetWrapper - __all__.append('TransNetWrapper') -except ImportError: - pass diff --git a/preprocessor/lib/search/__init__.py b/preprocessor/lib/search/__init__.py deleted file mode 100644 index a69f19443..000000000 --- a/preprocessor/lib/search/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from preprocessor.lib.search.elasticsearch import ElasticsearchWrapper -from preprocessor.lib.search.embedding_model import EmbeddingModelWrapper - -__all__ = ['ElasticsearchWrapper', 'EmbeddingModelWrapper'] diff --git a/preprocessor/lib/search/clients/__init__.py b/preprocessor/lib/search/clients/__init__.py deleted file mode 100644 index fc66f8dfe..000000000 --- a/preprocessor/lib/search/clients/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from preprocessor.lib.search.clients.elasticsearch_queries import ElasticsearchQueries -from preprocessor.lib.search.clients.embedding_service import EmbeddingService -from preprocessor.lib.search.clients.hash_service import HashService -from preprocessor.lib.search.clients.result_formatters import ResultFormatter - -__all__ = ['ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] diff --git a/preprocessor/lib/transcription/__init__.py b/preprocessor/lib/transcription/__init__.py deleted file mode 100644 index f82f7cdd7..000000000 --- a/preprocessor/lib/transcription/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -from preprocessor.lib.transcription.generators.json_generator import 
JsonGenerator -from preprocessor.lib.transcription.processors.audio_normalizer import AudioNormalizer -from preprocessor.lib.transcription.processors.episode_info_processor import EpisodeInfoProcessor -from preprocessor.lib.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor -from preprocessor.lib.transcription.sound_classification import ( - classify_segment, - is_sound_event, -) -from preprocessor.lib.transcription.utils import ( - TranscriptionUtils, - WhisperUtils, -) -from preprocessor.lib.transcription.whisper import Whisper - -__all__ = [ - 'JsonGenerator', - 'AudioNormalizer', - 'EpisodeInfoProcessor', - 'NormalizedAudioProcessor', - 'classify_segment', - 'is_sound_event', - 'TranscriptionUtils', - 'WhisperUtils', - 'Whisper', -] diff --git a/preprocessor/lib/transcription/processors/__init__.py b/preprocessor/lib/transcription/processors/__init__.py deleted file mode 100644 index c29f35718..000000000 --- a/preprocessor/lib/transcription/processors/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from preprocessor.lib.transcription.processors.audio_normalizer import AudioNormalizer -from preprocessor.lib.transcription.processors.episode_info_processor import EpisodeInfoProcessor -from preprocessor.lib.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor - -__all__ = ['AudioNormalizer', 'EpisodeInfoProcessor', 'NormalizedAudioProcessor'] diff --git a/preprocessor/lib/video/__init__.py b/preprocessor/lib/video/__init__.py deleted file mode 100644 index 0a854ca1a..000000000 --- a/preprocessor/lib/video/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from preprocessor.lib.video.discovery import VideoDiscovery - -__all__ = ['VideoDiscovery'] diff --git a/preprocessor/lib/video/strategies/__init__.py b/preprocessor/lib/video/strategies/__init__.py deleted file mode 100644 index e5b6b0f54..000000000 --- a/preprocessor/lib/video/strategies/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from 
preprocessor.lib.video.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.lib.video.strategies.scene_changes_strategy import SceneChangesStrategy - -__all__ = ['BaseKeyframeStrategy', 'SceneChangesStrategy'] diff --git a/preprocessor/modules/characters/__init__.py b/preprocessor/modules/characters/__init__.py deleted file mode 100644 index c2c472d6a..000000000 --- a/preprocessor/modules/characters/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from preprocessor.modules.characters.reference_downloader import CharacterReferenceDownloader - -__all__ = ['CharacterReferenceDownloader'] diff --git a/preprocessor/modules/packaging/__init__.py b/preprocessor/modules/packaging/__init__.py deleted file mode 100644 index 79212c302..000000000 --- a/preprocessor/modules/packaging/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from preprocessor.modules.packaging.archives import ArchiveGenerationStep - -__all__ = ['ArchiveGenerationStep'] diff --git a/preprocessor/modules/scraping/__init__.py b/preprocessor/modules/scraping/__init__.py deleted file mode 100644 index 2c13627f9..000000000 --- a/preprocessor/modules/scraping/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from preprocessor.modules.scraping.base_scraper import BaseScraper -from preprocessor.modules.scraping.character_scraper import CharacterScraper -from preprocessor.modules.scraping.episode_scraper import EpisodeScraper -from preprocessor.modules.scraping.reference_processor import CharacterReferenceProcessor - -__all__ = ['BaseScraper', 'CharacterReferenceProcessor', 'CharacterScraper', 'EpisodeScraper'] diff --git a/preprocessor/modules/search/indexing.py b/preprocessor/modules/search/indexing.py deleted file mode 100644 index 43083d9f1..000000000 --- a/preprocessor/modules/search/indexing.py +++ /dev/null @@ -1,114 +0,0 @@ -import asyncio -import json -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from preprocessor.config.step_configs import ElasticsearchConfig 
-from preprocessor.core.artifacts import ( - ElasticDocuments, - IndexingResult, -) -from preprocessor.core.base_step import PipelineStep -from preprocessor.core.context import ExecutionContext -from preprocessor.lib.search.elasticsearch import ElasticsearchWrapper - - -class ElasticsearchIndexerStep(PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig]): - - def __init__(self, config: ElasticsearchConfig) -> None: - super().__init__(config) - self._es: Optional[ElasticsearchWrapper] = None - - def cleanup(self) -> None: - if self._es: - asyncio.run(self._es.close()) - self._es = None - - def execute(self, input_data: List[ElasticDocuments], context: ExecutionContext) -> IndexingResult: - return asyncio.run(self._execute_async(input_data, context)) - - @property - def name(self) -> str: - return 'elasticsearch_indexing' - - async def _execute_async( - self, - input_data: List[ElasticDocuments], - context: ExecutionContext, - ) -> IndexingResult: - if not input_data: - context.logger.warning('No documents to index.') - return IndexingResult( - index_name=self.config.index_name, - document_count=0, - success=True, - ) - - docs_by_type: Dict[str, List[Path]] = {} - for doc_artifact in input_data: - doc_type: str = doc_artifact.path.parent.name - if doc_type not in docs_by_type: - docs_by_type[doc_type] = [] - docs_by_type[doc_type].append(doc_artifact.path) - - total_indexed: int = 0 - for doc_type, paths in docs_by_type.items(): - index_name: str = f'{self.config.index_name}_{doc_type}' - context.logger.info(f'Indexing {len(paths)} files into {index_name}') - - if self._es is None or self._es.index_name != index_name: - if self._es is not None: - await self._es.close() - self._es = ElasticsearchWrapper( - index_name=index_name, - host=self.config.host, - dry_run=self.config.dry_run, - ) - - try: - if not self.config.append: - await self._es.delete_index() - - mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type(doc_type) - if mapping: - 
await self._es.create_index(mapping) - - documents: List[Dict[str, Any]] = [] - for path in paths: - with open(path, 'r', encoding='utf-8') as f: - for line in f: - if line.strip(): - documents.append(json.loads(line)) - - if documents: - if not self.config.dry_run: - await self._es.bulk_index(documents) - total_indexed += len(documents) - else: - context.logger.info( - f'Dry-run: would index {len(documents)} docs to {index_name}', - ) - except Exception as e: - context.logger.error(f'Elasticsearch indexing failed for {index_name}: {e}') - return IndexingResult( - index_name=self.config.index_name, - document_count=total_indexed, - success=False, - ) - - return IndexingResult( - index_name=self.config.index_name, - document_count=total_indexed, - success=True, - ) - - @staticmethod - def __get_mapping_for_type( - doc_type: str, # pylint: disable=unused-argument - ) -> Optional[Dict[str, Any]]: - return None diff --git a/preprocessor/modules/text/__init__.py b/preprocessor/modules/text/__init__.py deleted file mode 100644 index 9c8daf43f..000000000 --- a/preprocessor/modules/text/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from preprocessor.lib.text.import_step import TranscriptionImportStep -from preprocessor.lib.text.transcription import TranscriptionStep -from preprocessor.modules.text.analysis import TextAnalysisStep -from preprocessor.modules.text.embeddings import TextEmbeddingStep - -__all__ = ['TextAnalysisStep', 'TextEmbeddingStep', 'TranscriptionImportStep', 'TranscriptionStep'] diff --git a/preprocessor/modules/text/analysis.py b/preprocessor/modules/text/analysis.py deleted file mode 100644 index 923ef4758..000000000 --- a/preprocessor/modules/text/analysis.py +++ /dev/null @@ -1,50 +0,0 @@ -from datetime import datetime - -from preprocessor.config.step_configs import TextAnalysisConfig -from preprocessor.core.artifacts import ( - TextAnalysisResults, - TranscriptionData, -) -from preprocessor.core.base_step import PipelineStep -from 
preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import ( - atomic_write_json, - load_json, -) -from preprocessor.lib.text.text_statistics import TextStatistics - - -class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): - - def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> TextAnalysisResults: - output_filename = input_data.path.stem + '_text_stats.json' - output_path = input_data.path.parent / output_filename - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - stats_data = load_json(output_path) - return TextAnalysisResults(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path, statistics=stats_data) - context.logger.info(f'Analyzing text for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - txt_path = input_data.path - if input_data.format != 'txt': - txt_path = input_data.path.with_suffix('.txt') - if not txt_path.exists(): - raise FileNotFoundError(f'Transcription text file not found: {txt_path}') - stats = TextStatistics.from_file(txt_path, language=self.config.language) - result_data = { - 'metadata': { - 'episode_id': input_data.episode_id, - 'language': self.config.language, - 'source_file': txt_path.name, - 'analyzed_at': datetime.now().isoformat(), - }, - **stats.to_dict(), - } - atomic_write_json(output_path, result_data) - context.mark_step_completed(self.name, input_data.episode_id) - return TextAnalysisResults(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path, statistics=result_data) - - @property - def name(self) -> str: - return 'text_analysis' diff --git a/preprocessor/modules/transcription/__init__.py b/preprocessor/modules/transcription/__init__.py deleted file mode 100644 index 
de1657d26..000000000 --- a/preprocessor/modules/transcription/__init__.py +++ /dev/null @@ -1 +0,0 @@ -__all__: list = [] diff --git a/preprocessor/modules/validation/__init__.py b/preprocessor/modules/validation/__init__.py deleted file mode 100644 index ec91195c0..000000000 --- a/preprocessor/modules/validation/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from preprocessor.modules.validation.episode_stats import EpisodeStats -from preprocessor.modules.validation.global_validator import GlobalValidator -from preprocessor.modules.validation.report_generator import ReportGenerator -from preprocessor.modules.validation.season_comparator import SeasonComparison -from preprocessor.modules.validation.validator import Validator - -__all__ = ['EpisodeStats', 'GlobalValidator', 'ReportGenerator', 'SeasonComparison', 'Validator'] diff --git a/preprocessor/modules/video/__init__.py b/preprocessor/modules/video/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/preprocessor/modules/vision/__init__.py b/preprocessor/modules/vision/__init__.py deleted file mode 100644 index 423227cba..000000000 --- a/preprocessor/modules/vision/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -from preprocessor.modules.vision.character_detection import CharacterDetectorStep -from preprocessor.modules.vision.embeddings import VideoEmbeddingStep -from preprocessor.modules.vision.emotion_detection import EmotionDetectionStep -from preprocessor.modules.vision.face_clustering import FaceClusteringStep -from preprocessor.modules.vision.image_hashing import ImageHashStep -from preprocessor.modules.vision.object_detection import ObjectDetectionStep - -__all__ = ['CharacterDetectorStep', 'EmotionDetectionStep', 'FaceClusteringStep', 'ImageHashStep', 'ObjectDetectionStep', 'VideoEmbeddingStep'] diff --git a/preprocessor/modules/vision/character_detection.py b/preprocessor/modules/vision/character_detection.py deleted file mode 100644 index 3caa24e7f..000000000 --- 
a/preprocessor/modules/vision/character_detection.py +++ /dev/null @@ -1,118 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -import numpy as np - -from preprocessor.config.step_configs import CharacterDetectionConfig -from preprocessor.core.artifacts import ( - DetectionResults, - FrameCollection, -) -from preprocessor.core.base_step import PipelineStep -from preprocessor.core.context import ExecutionContext -from preprocessor.lib.characters import FaceDetector -from preprocessor.lib.io.detection_io import process_frames_for_detection -from preprocessor.lib.io.files import ( - atomic_write_json, - load_json, -) - - -class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): - - def __init__(self, config: CharacterDetectionConfig) -> None: - super().__init__(config) - self._face_app = None - self._character_vectors: Dict[str, np.ndarray] = {} - - def cleanup(self) -> None: - self._face_app = None - self._character_vectors = {} - - def execute( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> DetectionResults: - filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename}_character_detections.json' - output_path: Path = context.get_output_path( - input_data.episode_info, 'character_detections', output_filename, - ) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached character detections)') - det_data: Dict[str, Any] = load_json(output_path) - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - detection_type='character', - detection_count=len(det_data.get('detections', [])), - ) - if self._face_app is None: - context.logger.info('Initializing face detection model...') - self._face_app = FaceDetector.init() - 
characters_dir: Path = Path('preprocessor/output_data') / context.series_name / 'characters' - if not characters_dir.exists(): - characters_dir = Path('preprocessor/input_data') / context.series_name / 'characters' - if characters_dir.exists(): - context.logger.info(f'Loading character references from {characters_dir}') - self._character_vectors = FaceDetector.load_character_references( - characters_dir, self._face_app, - ) - else: - context.logger.warning(f'Characters directory not found: {characters_dir}') - context.logger.info(f'Detecting characters in {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - frame_files: List[Path] = sorted([ - f for f in input_data.directory.glob('*.jpg') - if f.is_file() and 'frame_' in f.name - ]) - if not frame_files: - context.logger.warning(f'No frame files found in {input_data.directory}') - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - detection_type='character', - detection_count=0, - ) - results: List[Dict[str, Any]] = process_frames_for_detection( - frame_files, self._face_app, self._character_vectors, self.config.threshold, - ) - output_data: Dict[str, Any] = { - 'episode_id': input_data.episode_id, - 'series_name': context.series_name, - 'detection_settings': self.config.dict(), - 'statistics': { - 'total_frames_processed': len(frame_files), - 'frames_with_detections': len(results), - 'character_counts': self.__count_characters(results), - }, - 'detections': results, - } - atomic_write_json(output_path, output_data) - context.mark_step_completed(self.name, input_data.episode_id) - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - detection_type='character', - detection_count=len(results), - ) - - @property - def name(self) -> str: - return 'character_detection' - - @staticmethod - def __count_characters(results: List[Dict[str, Any]]) -> 
Dict[str, int]: - counts: Dict[str, int] = {} - for res in results: - for face in res.get('faces', []): - name: str = face.get('character_name', 'unknown') - counts[name] = counts.get(name, 0) + 1 - return counts diff --git a/preprocessor/lib/__init__.py b/preprocessor/services/__init__.py similarity index 100% rename from preprocessor/lib/__init__.py rename to preprocessor/services/__init__.py diff --git a/preprocessor/lib/ai/__init__.py b/preprocessor/services/ai/__init__.py similarity index 56% rename from preprocessor/lib/ai/__init__.py rename to preprocessor/services/ai/__init__.py index a07ded715..6792c58b0 100644 --- a/preprocessor/lib/ai/__init__.py +++ b/preprocessor/services/ai/__init__.py @@ -1,8 +1,8 @@ -from preprocessor.lib.ai.models import ( +from preprocessor.services.ai.models import ( CharacterInfo, EpisodeInfo, SeasonMetadata, ) -from preprocessor.lib.ai.provider import LLMProvider +from preprocessor.services.ai.provider import LLMProvider __all__ = ['LLMProvider', 'EpisodeInfo', 'SeasonMetadata', 'CharacterInfo'] diff --git a/preprocessor/lib/ai/clients.py b/preprocessor/services/ai/clients.py similarity index 98% rename from preprocessor/lib/ai/clients.py rename to preprocessor/services/ai/clients.py index 24c255bef..f3d0cbb06 100644 --- a/preprocessor/lib/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -15,7 +15,7 @@ ) from preprocessor.config.config import settings -from preprocessor.lib.ui.console import console +from preprocessor.services.ui.console import console class BaseLLMClient(ABC): diff --git a/preprocessor/lib/ai/models.py b/preprocessor/services/ai/models.py similarity index 100% rename from preprocessor/lib/ai/models.py rename to preprocessor/services/ai/models.py diff --git a/preprocessor/lib/ai/provider.py b/preprocessor/services/ai/provider.py similarity index 97% rename from preprocessor/lib/ai/provider.py rename to preprocessor/services/ai/provider.py index 7312ae621..e23b68ff0 100644 --- 
a/preprocessor/lib/ai/provider.py +++ b/preprocessor/services/ai/provider.py @@ -22,19 +22,19 @@ merge_episode_data_system, merge_episode_data_user, ) -from preprocessor.lib.ai.clients import ( +from preprocessor.services.ai.clients import ( BaseLLMClient, GeminiClient, VLLMClient, ) -from preprocessor.lib.ai.models import ( +from preprocessor.services.ai.models import ( AllSeasonsMetadata, CharacterInfo, CharactersList, EpisodeMetadata, SeasonMetadata, ) -from preprocessor.lib.ui.console import console +from preprocessor.services.ui.console import console class LLMProvider: diff --git a/preprocessor/lib/audio/__init__.py b/preprocessor/services/audio/__init__.py similarity index 100% rename from preprocessor/lib/audio/__init__.py rename to preprocessor/services/audio/__init__.py diff --git a/preprocessor/lib/audio/extraction.py b/preprocessor/services/audio/extraction.py similarity index 100% rename from preprocessor/lib/audio/extraction.py rename to preprocessor/services/audio/extraction.py diff --git a/preprocessor/lib/characters/__init__.py b/preprocessor/services/characters/__init__.py similarity index 55% rename from preprocessor/lib/characters/__init__.py rename to preprocessor/services/characters/__init__.py index 165cc8a68..483361209 100644 --- a/preprocessor/lib/characters/__init__.py +++ b/preprocessor/services/characters/__init__.py @@ -1,5 +1,5 @@ -from preprocessor.lib.characters.face_detection import FaceDetector -from preprocessor.lib.characters.image_search import ( +from preprocessor.services.characters.face_detection import FaceDetector +from preprocessor.services.characters.image_search import ( BaseImageSearch, DuckDuckGoImageSearch, GoogleImageSearch, diff --git a/preprocessor/lib/characters/face_detection.py b/preprocessor/services/characters/face_detection.py similarity index 99% rename from preprocessor/lib/characters/face_detection.py rename to preprocessor/services/characters/face_detection.py index 57bf15d25..33ca131ca 100644 --- 
a/preprocessor/lib/characters/face_detection.py +++ b/preprocessor/services/characters/face_detection.py @@ -15,7 +15,7 @@ import onnxruntime as ort from preprocessor.config.config import settings -from preprocessor.lib.ui.console import console +from preprocessor.services.ui.console import console warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') diff --git a/preprocessor/services/characters/image_search/__init__.py b/preprocessor/services/characters/image_search/__init__.py new file mode 100644 index 000000000..9ca06eb9f --- /dev/null +++ b/preprocessor/services/characters/image_search/__init__.py @@ -0,0 +1,5 @@ +from preprocessor.services.characters.image_search.duckduckgo_image_search import DuckDuckGoImageSearch +from preprocessor.services.characters.image_search.google_image_search import GoogleImageSearch +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + +__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch'] diff --git a/preprocessor/lib/characters/image_search/duckduckgo_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_image_search.py similarity index 81% rename from preprocessor/lib/characters/image_search/duckduckgo_image_search.py rename to preprocessor/services/characters/image_search/duckduckgo_image_search.py index 90a4cad89..5e9998457 100644 --- a/preprocessor/lib/characters/image_search/duckduckgo_image_search.py +++ b/preprocessor/services/characters/image_search/duckduckgo_image_search.py @@ -5,7 +5,7 @@ from ddgs import DDGS -from preprocessor.lib.characters.image_search.image_search import BaseImageSearch +from preprocessor.services.characters.image_search.image_search import BaseImageSearch class DuckDuckGoImageSearch(BaseImageSearch): diff --git a/preprocessor/lib/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py 
similarity index 91% rename from preprocessor/lib/characters/image_search/google_image_search.py rename to preprocessor/services/characters/image_search/google_image_search.py index 5b3021ece..fdbd8ab44 100644 --- a/preprocessor/lib/characters/image_search/google_image_search.py +++ b/preprocessor/services/characters/image_search/google_image_search.py @@ -5,7 +5,7 @@ from serpapi import GoogleSearch -from preprocessor.lib.characters.image_search.image_search import BaseImageSearch +from preprocessor.services.characters.image_search.image_search import BaseImageSearch class GoogleImageSearch(BaseImageSearch): diff --git a/preprocessor/lib/characters/image_search/image_search.py b/preprocessor/services/characters/image_search/image_search.py similarity index 100% rename from preprocessor/lib/characters/image_search/image_search.py rename to preprocessor/services/characters/image_search/image_search.py diff --git a/preprocessor/lib/characters/models.py b/preprocessor/services/characters/models.py similarity index 100% rename from preprocessor/lib/characters/models.py rename to preprocessor/services/characters/models.py diff --git a/preprocessor/modules/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py similarity index 97% rename from preprocessor/modules/characters/reference_downloader.py rename to preprocessor/services/characters/reference_downloader.py index 6bc7f950b..c42f23613 100644 --- a/preprocessor/modules/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -23,18 +23,18 @@ ) from preprocessor.config.config import settings -from preprocessor.lib.characters.face_detection import FaceDetector -from preprocessor.lib.characters.image_search import ( +from preprocessor.services.characters.face_detection import FaceDetector +from preprocessor.services.characters.image_search import ( BaseImageSearch, DuckDuckGoImageSearch, GoogleImageSearch, ) -from 
preprocessor.lib.ui.console import console -from preprocessor.modules.base_processor import ( +from preprocessor.services.core.base_processor import ( BaseProcessor, OutputSpec, ProcessingItem, ) +from preprocessor.services.ui.console import console class CharacterReferenceDownloader(BaseProcessor): diff --git a/preprocessor/lib/core/__init__.py b/preprocessor/services/core/__init__.py similarity index 57% rename from preprocessor/lib/core/__init__.py rename to preprocessor/services/core/__init__.py index a9a53c65e..7370a784f 100644 --- a/preprocessor/lib/core/__init__.py +++ b/preprocessor/services/core/__init__.py @@ -1,7 +1,7 @@ -from preprocessor.lib.core.logging import ( +from preprocessor.services.core.logging import ( ErrorHandlingLogger, LoggerNotFinalizedException, ) -from preprocessor.lib.core.time import TimeFormatter +from preprocessor.services.core.time import TimeFormatter __all__ = ['ErrorHandlingLogger', 'LoggerNotFinalizedException', 'TimeFormatter'] diff --git a/preprocessor/modules/base_processor.py b/preprocessor/services/core/base_processor.py similarity index 96% rename from preprocessor/modules/base_processor.py rename to preprocessor/services/core/base_processor.py index 2fe0e44da..7b28fafff 100644 --- a/preprocessor/modules/base_processor.py +++ b/preprocessor/services/core/base_processor.py @@ -15,13 +15,13 @@ from preprocessor.config.constants import SUPPORTED_VIDEO_EXTENSIONS from preprocessor.core.state_manager import StateManager -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.lib.io.path_manager import PathManager -from preprocessor.lib.ui.console import ( +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.ui.console import ( SimpleProgress, console, ) -from preprocessor.lib.ui.progress import ProgressTracker +from preprocessor.services.ui.progress import ProgressTracker @dataclass diff --git 
a/preprocessor/lib/core/logging.py b/preprocessor/services/core/logging.py similarity index 97% rename from preprocessor/lib/core/logging.py rename to preprocessor/services/core/logging.py index 6e5581316..5f8635001 100644 --- a/preprocessor/lib/core/logging.py +++ b/preprocessor/services/core/logging.py @@ -4,7 +4,7 @@ from rich.logging import RichHandler from rich.panel import Panel -from preprocessor.lib.ui.console import console +from preprocessor.services.ui.console import console class LoggerNotFinalizedException(Exception): diff --git a/preprocessor/lib/core/time.py b/preprocessor/services/core/time.py similarity index 100% rename from preprocessor/lib/core/time.py rename to preprocessor/services/core/time.py diff --git a/preprocessor/lib/episodes/__init__.py b/preprocessor/services/episodes/__init__.py similarity index 57% rename from preprocessor/lib/episodes/__init__.py rename to preprocessor/services/episodes/__init__.py index 7f38bf32a..f4491ac3a 100644 --- a/preprocessor/lib/episodes/__init__.py +++ b/preprocessor/services/episodes/__init__.py @@ -1,4 +1,4 @@ -from preprocessor.lib.episodes.episode_manager import ( +from preprocessor.services.episodes.episode_manager import ( EpisodeInfo, EpisodeManager, ) diff --git a/preprocessor/lib/episodes/episode_manager.py b/preprocessor/services/episodes/episode_manager.py similarity index 98% rename from preprocessor/lib/episodes/episode_manager.py rename to preprocessor/services/episodes/episode_manager.py index 5d6c306eb..0504926cd 100644 --- a/preprocessor/lib/episodes/episode_manager.py +++ b/preprocessor/services/episodes/episode_manager.py @@ -14,8 +14,8 @@ EpisodeMetadataKeys, EpisodesDataKeys, ) -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.lib.io.path_manager import PathManager +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.io.path_manager import PathManager @dataclass diff --git a/preprocessor/services/io/__init__.py 
b/preprocessor/services/io/__init__.py new file mode 100644 index 000000000..6eb13cfaf --- /dev/null +++ b/preprocessor/services/io/__init__.py @@ -0,0 +1,5 @@ +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.io.path_resolver import PathResolver +from preprocessor.services.io.path_service import PathService + +__all__ = ['PathManager', 'PathResolver', 'PathService'] diff --git a/preprocessor/lib/io/detection_io.py b/preprocessor/services/io/detection_io.py similarity index 90% rename from preprocessor/lib/io/detection_io.py rename to preprocessor/services/io/detection_io.py index e57226a1a..d7b660c91 100644 --- a/preprocessor/lib/io/detection_io.py +++ b/preprocessor/services/io/detection_io.py @@ -8,7 +8,7 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.lib.characters.face_detection import FaceDetector +from preprocessor.services.characters.face_detection import FaceDetector def process_frames_for_detection( diff --git a/preprocessor/lib/io/files.py b/preprocessor/services/io/files.py similarity index 100% rename from preprocessor/lib/io/files.py rename to preprocessor/services/io/files.py diff --git a/preprocessor/lib/io/hashing.py b/preprocessor/services/io/hashing.py similarity index 86% rename from preprocessor/lib/io/hashing.py rename to preprocessor/services/io/hashing.py index 3efca6928..f0756a9a6 100644 --- a/preprocessor/lib/io/hashing.py +++ b/preprocessor/services/io/hashing.py @@ -6,10 +6,10 @@ ) from preprocessor.config.config import settings -from preprocessor.lib.episodes import EpisodeInfo -from preprocessor.lib.io.files import FileOperations -from preprocessor.lib.io.metadata import MetadataBuilder -from preprocessor.lib.io.path_manager import PathManager +from preprocessor.services.episodes import EpisodeInfo +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.io.path_manager 
import PathManager class HashStorage: diff --git a/preprocessor/lib/io/metadata.py b/preprocessor/services/io/metadata.py similarity index 100% rename from preprocessor/lib/io/metadata.py rename to preprocessor/services/io/metadata.py diff --git a/preprocessor/lib/io/path_manager.py b/preprocessor/services/io/path_manager.py similarity index 80% rename from preprocessor/lib/io/path_manager.py rename to preprocessor/services/io/path_manager.py index 6f3a73356..72c66a4fd 100644 --- a/preprocessor/lib/io/path_manager.py +++ b/preprocessor/services/io/path_manager.py @@ -1,10 +1,10 @@ from pathlib import Path from typing import TYPE_CHECKING -from preprocessor.lib.io.path_service import PathService +from preprocessor.services.io.path_service import PathService if TYPE_CHECKING: - from preprocessor.lib.episodes.episode_manager import EpisodeInfo + from preprocessor.services.episodes.episode_manager import EpisodeInfo class PathManager: diff --git a/preprocessor/lib/io/path_resolver.py b/preprocessor/services/io/path_resolver.py similarity index 84% rename from preprocessor/lib/io/path_resolver.py rename to preprocessor/services/io/path_resolver.py index 64910371e..efacbfb63 100644 --- a/preprocessor/lib/io/path_resolver.py +++ b/preprocessor/services/io/path_resolver.py @@ -1,6 +1,6 @@ from pathlib import Path -from preprocessor.lib.io.path_service import PathService +from preprocessor.services.io.path_service import PathService class PathResolver: diff --git a/preprocessor/lib/io/path_service.py b/preprocessor/services/io/path_service.py similarity index 94% rename from preprocessor/lib/io/path_service.py rename to preprocessor/services/io/path_service.py index fe34588a7..1de748476 100644 --- a/preprocessor/lib/io/path_service.py +++ b/preprocessor/services/io/path_service.py @@ -5,7 +5,7 @@ from preprocessor.config.config import get_base_output_dir if TYPE_CHECKING: - from preprocessor.lib.episodes.episode_manager import EpisodeInfo + from 
preprocessor.services.episodes.episode_manager import EpisodeInfo class PathService: diff --git a/preprocessor/services/media/__init__.py b/preprocessor/services/media/__init__.py new file mode 100644 index 000000000..9876d4003 --- /dev/null +++ b/preprocessor/services/media/__init__.py @@ -0,0 +1,9 @@ +from preprocessor.services.media.ffmpeg import FFmpegWrapper +from preprocessor.services.media.resolution import Resolution + +__all__ = ['FFmpegWrapper', 'Resolution'] +try: + from preprocessor.services.media.scene_detection import TransNetWrapper + __all__.append('TransNetWrapper') +except ImportError: + pass diff --git a/preprocessor/lib/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py similarity index 100% rename from preprocessor/lib/media/ffmpeg.py rename to preprocessor/services/media/ffmpeg.py diff --git a/preprocessor/lib/media/resolution.py b/preprocessor/services/media/resolution.py similarity index 100% rename from preprocessor/lib/media/resolution.py rename to preprocessor/services/media/resolution.py diff --git a/preprocessor/lib/media/scene_detection.py b/preprocessor/services/media/scene_detection.py similarity index 100% rename from preprocessor/lib/media/scene_detection.py rename to preprocessor/services/media/scene_detection.py diff --git a/preprocessor/services/scraping/__init__.py b/preprocessor/services/scraping/__init__.py new file mode 100644 index 000000000..2abe08a3a --- /dev/null +++ b/preprocessor/services/scraping/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.services.scraping.base_scraper import BaseScraper +from preprocessor.services.scraping.character_scraper import CharacterScraper +from preprocessor.services.scraping.episode_scraper import EpisodeScraper +from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor + +__all__ = ['BaseScraper', 'CharacterReferenceProcessor', 'CharacterScraper', 'EpisodeScraper'] diff --git a/preprocessor/modules/scraping/base_scraper.py 
b/preprocessor/services/scraping/base_scraper.py similarity index 92% rename from preprocessor/modules/scraping/base_scraper.py rename to preprocessor/services/scraping/base_scraper.py index 53f07efd1..8f4f7ce1d 100644 --- a/preprocessor/modules/scraping/base_scraper.py +++ b/preprocessor/services/scraping/base_scraper.py @@ -14,11 +14,11 @@ ParserMode, ScraperMethod, ) -from preprocessor.lib.ai import LLMProvider -from preprocessor.lib.scraping.clipboard import ScraperClipboard -from preprocessor.lib.scraping.crawl4ai import ScraperCrawl4AI -from preprocessor.lib.ui.console import console -from preprocessor.modules.base_processor import BaseProcessor +from preprocessor.services.ai import LLMProvider +from preprocessor.services.core.base_processor import BaseProcessor +from preprocessor.services.scraping.clipboard import ScraperClipboard +from preprocessor.services.scraping.crawl4ai import ScraperCrawl4AI +from preprocessor.services.ui.console import console class BaseScraper(BaseProcessor): diff --git a/preprocessor/modules/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py similarity index 100% rename from preprocessor/modules/scraping/base_scraper_step.py rename to preprocessor/services/scraping/base_scraper_step.py diff --git a/preprocessor/modules/scraping/character_scraper.py b/preprocessor/services/scraping/character_scraper.py similarity index 87% rename from preprocessor/modules/scraping/character_scraper.py rename to preprocessor/services/scraping/character_scraper.py index 9d2a2bbc0..c478649fd 100644 --- a/preprocessor/modules/scraping/character_scraper.py +++ b/preprocessor/services/scraping/character_scraper.py @@ -4,8 +4,8 @@ List, ) -from preprocessor.lib.ui.console import console -from preprocessor.modules.scraping.base_scraper import BaseScraper +from preprocessor.services.scraping.base_scraper import BaseScraper +from preprocessor.services.ui.console import console class CharacterScraper(BaseScraper): diff --git 
a/preprocessor/lib/scraping/clipboard.py b/preprocessor/services/scraping/clipboard.py similarity index 93% rename from preprocessor/lib/scraping/clipboard.py rename to preprocessor/services/scraping/clipboard.py index 2e7e6cc9a..2762be672 100644 --- a/preprocessor/lib/scraping/clipboard.py +++ b/preprocessor/services/scraping/clipboard.py @@ -5,7 +5,7 @@ from patchright.sync_api import sync_playwright -from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger class ScraperClipboard: diff --git a/preprocessor/lib/scraping/crawl4ai.py b/preprocessor/services/scraping/crawl4ai.py similarity index 97% rename from preprocessor/lib/scraping/crawl4ai.py rename to preprocessor/services/scraping/crawl4ai.py index 1d38097c7..99b04f97e 100644 --- a/preprocessor/lib/scraping/crawl4ai.py +++ b/preprocessor/services/scraping/crawl4ai.py @@ -10,7 +10,7 @@ from pathvalidate import sanitize_filename import ua_generator -from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger class ScraperCrawl4AI: diff --git a/preprocessor/modules/scraping/episode_scraper.py b/preprocessor/services/scraping/episode_scraper.py similarity index 97% rename from preprocessor/modules/scraping/episode_scraper.py rename to preprocessor/services/scraping/episode_scraper.py index cb95d1987..703ee6083 100644 --- a/preprocessor/modules/scraping/episode_scraper.py +++ b/preprocessor/services/scraping/episode_scraper.py @@ -7,8 +7,8 @@ Tuple, ) -from preprocessor.lib.ui.console import console -from preprocessor.modules.scraping.base_scraper import BaseScraper +from preprocessor.services.scraping.base_scraper import BaseScraper +from preprocessor.services.ui.console import console class EpisodeScraper(BaseScraper): diff --git a/preprocessor/modules/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py similarity index 99% rename from 
preprocessor/modules/scraping/reference_processor.py rename to preprocessor/services/scraping/reference_processor.py index 58bf1a62b..93d4584cc 100644 --- a/preprocessor/modules/scraping/reference_processor.py +++ b/preprocessor/services/scraping/reference_processor.py @@ -17,17 +17,17 @@ import numpy as np from preprocessor.config.config import settings -from preprocessor.lib.characters.face_detection import FaceDetector -from preprocessor.lib.characters.models import ( +from preprocessor.services.characters.face_detection import FaceDetector +from preprocessor.services.characters.models import ( CandidateFace, FaceData, ) -from preprocessor.lib.ui.console import console -from preprocessor.modules.base_processor import ( +from preprocessor.services.core.base_processor import ( BaseProcessor, OutputSpec, ProcessingItem, ) +from preprocessor.services.ui.console import console warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') diff --git a/preprocessor/services/search/__init__.py b/preprocessor/services/search/__init__.py new file mode 100644 index 000000000..44c378a7e --- /dev/null +++ b/preprocessor/services/search/__init__.py @@ -0,0 +1,4 @@ +from preprocessor.services.search.elasticsearch import ElasticsearchWrapper +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + +__all__ = ['ElasticsearchWrapper', 'EmbeddingModelWrapper'] diff --git a/preprocessor/services/search/clients/__init__.py b/preprocessor/services/search/clients/__init__.py new file mode 100644 index 000000000..8bf7d3a13 --- /dev/null +++ b/preprocessor/services/search/clients/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.services.search.clients.embedding_service import EmbeddingService +from preprocessor.services.search.clients.hash_service import HashService +from 
preprocessor.services.search.clients.result_formatters import ResultFormatter + +__all__ = ['ElasticsearchQueries', 'EmbeddingService', 'HashService', 'ResultFormatter'] diff --git a/preprocessor/lib/search/clients/elasticsearch_queries.py b/preprocessor/services/search/clients/elasticsearch_queries.py similarity index 99% rename from preprocessor/lib/search/clients/elasticsearch_queries.py rename to preprocessor/services/search/clients/elasticsearch_queries.py index 64d4b1f39..0a5288abb 100644 --- a/preprocessor/lib/search/clients/elasticsearch_queries.py +++ b/preprocessor/services/search/clients/elasticsearch_queries.py @@ -8,7 +8,7 @@ from elasticsearch import AsyncElasticsearch -from preprocessor.lib.search.clients.embedding_service import EmbeddingService +from preprocessor.services.search.clients.embedding_service import EmbeddingService class ElasticsearchQueries: diff --git a/preprocessor/lib/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py similarity index 100% rename from preprocessor/lib/search/clients/embedding_service.py rename to preprocessor/services/search/clients/embedding_service.py diff --git a/preprocessor/lib/search/clients/hash_service.py b/preprocessor/services/search/clients/hash_service.py similarity index 94% rename from preprocessor/lib/search/clients/hash_service.py rename to preprocessor/services/search/clients/hash_service.py index 3ccc06ecb..a37ce1db4 100644 --- a/preprocessor/lib/search/clients/hash_service.py +++ b/preprocessor/services/search/clients/hash_service.py @@ -8,7 +8,7 @@ import click import torch -from preprocessor.lib.video.image_hasher import PerceptualHasher +from preprocessor.services.video.image_hasher import PerceptualHasher class HashService: diff --git a/preprocessor/lib/search/clients/result_formatters.py b/preprocessor/services/search/clients/result_formatters.py similarity index 100% rename from preprocessor/lib/search/clients/result_formatters.py rename to 
preprocessor/services/search/clients/result_formatters.py diff --git a/preprocessor/lib/search/elasticsearch.py b/preprocessor/services/search/elasticsearch.py similarity index 100% rename from preprocessor/lib/search/elasticsearch.py rename to preprocessor/services/search/elasticsearch.py diff --git a/preprocessor/lib/search/embedding_model.py b/preprocessor/services/search/embedding_model.py similarity index 89% rename from preprocessor/lib/search/embedding_model.py rename to preprocessor/services/search/embedding_model.py index fc4caf637..64359cef0 100644 --- a/preprocessor/lib/search/embedding_model.py +++ b/preprocessor/services/search/embedding_model.py @@ -3,7 +3,7 @@ Union, ) -from preprocessor.lib.search.clients.embedding_service import EmbeddingService +from preprocessor.services.search.clients.embedding_service import EmbeddingService class EmbeddingModelWrapper: diff --git a/preprocessor/lib/text/__init__.py b/preprocessor/services/text/__init__.py similarity index 53% rename from preprocessor/lib/text/__init__.py rename to preprocessor/services/text/__init__.py index ba18b5a06..7a78d5188 100644 --- a/preprocessor/lib/text/__init__.py +++ b/preprocessor/services/text/__init__.py @@ -1,8 +1,8 @@ -from preprocessor.lib.text.language_config import ( +from preprocessor.services.text.language_config import ( ENGLISH_CONFIG, POLISH_CONFIG, LanguageConfig, ) -from preprocessor.lib.text.text_statistics import TextStatistics +from preprocessor.services.text.text_statistics import TextStatistics __all__ = ['TextStatistics', 'LanguageConfig', 'POLISH_CONFIG', 'ENGLISH_CONFIG'] diff --git a/preprocessor/lib/text/import_step.py b/preprocessor/services/text/import_step.py similarity index 98% rename from preprocessor/lib/text/import_step.py rename to preprocessor/services/text/import_step.py index 2904a888b..d9e599aec 100644 --- a/preprocessor/lib/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -14,10 +14,10 @@ from preprocessor.core.artifacts 
import TranscriptionData from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.episodes.episode_manager import EpisodeManager +from preprocessor.services.episodes.episode_manager import EpisodeManager if TYPE_CHECKING: - from preprocessor.lib.episodes.episode_manager import EpisodeInfo + from preprocessor.services.episodes.episode_manager import EpisodeInfo class TranscriptionImportStep(PipelineStep[None, List[TranscriptionData], TranscriptionImportConfig]): diff --git a/preprocessor/lib/text/language_config.py b/preprocessor/services/text/language_config.py similarity index 100% rename from preprocessor/lib/text/language_config.py rename to preprocessor/services/text/language_config.py diff --git a/preprocessor/lib/text/text_statistics.py b/preprocessor/services/text/text_statistics.py similarity index 99% rename from preprocessor/lib/text/text_statistics.py rename to preprocessor/services/text/text_statistics.py index bf23b0ca7..0731f3e0d 100644 --- a/preprocessor/lib/text/text_statistics.py +++ b/preprocessor/services/text/text_statistics.py @@ -11,7 +11,7 @@ List, ) -from preprocessor.lib.text.language_config import ( +from preprocessor.services.text.language_config import ( ENGLISH_CONFIG, POLISH_CONFIG, LanguageConfig, diff --git a/preprocessor/services/transcription/__init__.py b/preprocessor/services/transcription/__init__.py new file mode 100644 index 000000000..6af73ca06 --- /dev/null +++ b/preprocessor/services/transcription/__init__.py @@ -0,0 +1,25 @@ +from preprocessor.services.transcription.generators.json_generator import JsonGenerator +from preprocessor.services.transcription.processors.audio_normalizer import AudioNormalizer +from preprocessor.services.transcription.processors.episode_info_processor import EpisodeInfoProcessor +from preprocessor.services.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor +from 
preprocessor.services.transcription.sound_classification import ( + classify_segment, + is_sound_event, +) +from preprocessor.services.transcription.utils import ( + TranscriptionUtils, + WhisperUtils, +) +from preprocessor.services.transcription.whisper import Whisper + +__all__ = [ + 'JsonGenerator', + 'AudioNormalizer', + 'EpisodeInfoProcessor', + 'NormalizedAudioProcessor', + 'classify_segment', + 'is_sound_event', + 'TranscriptionUtils', + 'WhisperUtils', + 'Whisper', +] diff --git a/preprocessor/lib/scraping/__init__.py b/preprocessor/services/transcription/engines/__init__.py similarity index 100% rename from preprocessor/lib/scraping/__init__.py rename to preprocessor/services/transcription/engines/__init__.py diff --git a/preprocessor/lib/transcription/engines/base_engine.py b/preprocessor/services/transcription/engines/base_engine.py similarity index 100% rename from preprocessor/lib/transcription/engines/base_engine.py rename to preprocessor/services/transcription/engines/base_engine.py diff --git a/preprocessor/lib/transcription/engines/elevenlabs_engine.py b/preprocessor/services/transcription/engines/elevenlabs_engine.py similarity index 96% rename from preprocessor/lib/transcription/engines/elevenlabs_engine.py rename to preprocessor/services/transcription/engines/elevenlabs_engine.py index 3309b8fa6..26255910a 100644 --- a/preprocessor/lib/transcription/engines/elevenlabs_engine.py +++ b/preprocessor/services/transcription/engines/elevenlabs_engine.py @@ -11,9 +11,9 @@ from elevenlabs.core import ApiError from preprocessor.config.config import settings -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.lib.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.lib.ui.console import console +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine +from preprocessor.services.ui.console import 
console class ElevenLabsEngine(TranscriptionEngine): diff --git a/preprocessor/lib/transcription/engines/whisper_engine.py b/preprocessor/services/transcription/engines/whisper_engine.py similarity index 90% rename from preprocessor/lib/transcription/engines/whisper_engine.py rename to preprocessor/services/transcription/engines/whisper_engine.py index 2badcf230..45ba55702 100644 --- a/preprocessor/lib/transcription/engines/whisper_engine.py +++ b/preprocessor/services/transcription/engines/whisper_engine.py @@ -8,9 +8,9 @@ from faster_whisper import WhisperModel import torch -from preprocessor.lib.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.lib.transcription.whisper import WhisperUtils -from preprocessor.lib.ui.console import console +from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine +from preprocessor.services.transcription.whisper import WhisperUtils +from preprocessor.services.ui.console import console class WhisperEngine(TranscriptionEngine): diff --git a/preprocessor/lib/transcription/engines/__init__.py b/preprocessor/services/transcription/generators/__init__.py similarity index 100% rename from preprocessor/lib/transcription/engines/__init__.py rename to preprocessor/services/transcription/generators/__init__.py diff --git a/preprocessor/lib/transcription/generators/base_generator.py b/preprocessor/services/transcription/generators/base_generator.py similarity index 93% rename from preprocessor/lib/transcription/generators/base_generator.py rename to preprocessor/services/transcription/generators/base_generator.py index 48ae5e48a..5b04797d6 100644 --- a/preprocessor/lib/transcription/generators/base_generator.py +++ b/preprocessor/services/transcription/generators/base_generator.py @@ -9,7 +9,7 @@ Dict, ) -from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger class BaseTranscriptionGenerator(ABC): diff --git 
a/preprocessor/lib/transcription/generators/json_generator.py b/preprocessor/services/transcription/generators/json_generator.py similarity index 94% rename from preprocessor/lib/transcription/generators/json_generator.py rename to preprocessor/services/transcription/generators/json_generator.py index 160c0f9d4..a9a7a332a 100644 --- a/preprocessor/lib/transcription/generators/json_generator.py +++ b/preprocessor/services/transcription/generators/json_generator.py @@ -9,8 +9,8 @@ FILE_EXTENSIONS, FILE_SUFFIXES, ) -from preprocessor.lib.transcription.generators.base_generator import BaseTranscriptionGenerator -from preprocessor.lib.transcription.utils import TranscriptionUtils +from preprocessor.services.transcription.generators.base_generator import BaseTranscriptionGenerator +from preprocessor.services.transcription.utils import TranscriptionUtils class JsonGenerator(BaseTranscriptionGenerator): diff --git a/preprocessor/lib/transcription/generators/multi_format_generator.py b/preprocessor/services/transcription/generators/multi_format_generator.py similarity index 95% rename from preprocessor/lib/transcription/generators/multi_format_generator.py rename to preprocessor/services/transcription/generators/multi_format_generator.py index 42e38928d..eaf58b1b4 100644 --- a/preprocessor/lib/transcription/generators/multi_format_generator.py +++ b/preprocessor/services/transcription/generators/multi_format_generator.py @@ -10,11 +10,11 @@ get_base_output_dir, settings, ) -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.lib.episodes import EpisodeManager -from preprocessor.lib.transcription.generators.json_generator import JsonGenerator -from preprocessor.lib.transcription.generators.srt_generator import SrtGenerator -from preprocessor.lib.transcription.generators.txt_generator import TxtGenerator +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.episodes import EpisodeManager +from 
preprocessor.services.transcription.generators.json_generator import JsonGenerator +from preprocessor.services.transcription.generators.srt_generator import SrtGenerator +from preprocessor.services.transcription.generators.txt_generator import TxtGenerator class MultiFormatGenerator: diff --git a/preprocessor/lib/transcription/generators/srt_generator.py b/preprocessor/services/transcription/generators/srt_generator.py similarity index 93% rename from preprocessor/lib/transcription/generators/srt_generator.py rename to preprocessor/services/transcription/generators/srt_generator.py index 060e1ec60..8a1fc9e6e 100644 --- a/preprocessor/lib/transcription/generators/srt_generator.py +++ b/preprocessor/services/transcription/generators/srt_generator.py @@ -5,7 +5,7 @@ ) from preprocessor.config.constants import FILE_EXTENSIONS -from preprocessor.lib.transcription.generators.base_generator import BaseTranscriptionGenerator +from preprocessor.services.transcription.generators.base_generator import BaseTranscriptionGenerator class SrtGenerator(BaseTranscriptionGenerator): diff --git a/preprocessor/lib/transcription/generators/txt_generator.py b/preprocessor/services/transcription/generators/txt_generator.py similarity index 87% rename from preprocessor/lib/transcription/generators/txt_generator.py rename to preprocessor/services/transcription/generators/txt_generator.py index ae2a8a918..a899851ad 100644 --- a/preprocessor/lib/transcription/generators/txt_generator.py +++ b/preprocessor/services/transcription/generators/txt_generator.py @@ -5,7 +5,7 @@ ) from preprocessor.config.constants import FILE_EXTENSIONS -from preprocessor.lib.transcription.generators.base_generator import BaseTranscriptionGenerator +from preprocessor.services.transcription.generators.base_generator import BaseTranscriptionGenerator class TxtGenerator(BaseTranscriptionGenerator): diff --git a/preprocessor/services/transcription/processors/__init__.py 
b/preprocessor/services/transcription/processors/__init__.py new file mode 100644 index 000000000..5534236cd --- /dev/null +++ b/preprocessor/services/transcription/processors/__init__.py @@ -0,0 +1,5 @@ +from preprocessor.services.transcription.processors.audio_normalizer import AudioNormalizer +from preprocessor.services.transcription.processors.episode_info_processor import EpisodeInfoProcessor +from preprocessor.services.transcription.processors.normalized_audio_processor import NormalizedAudioProcessor + +__all__ = ['AudioNormalizer', 'EpisodeInfoProcessor', 'NormalizedAudioProcessor'] diff --git a/preprocessor/lib/transcription/processors/audio_normalizer.py b/preprocessor/services/transcription/processors/audio_normalizer.py similarity index 95% rename from preprocessor/lib/transcription/processors/audio_normalizer.py rename to preprocessor/services/transcription/processors/audio_normalizer.py index 65dcf4180..2e0c586e0 100644 --- a/preprocessor/lib/transcription/processors/audio_normalizer.py +++ b/preprocessor/services/transcription/processors/audio_normalizer.py @@ -6,8 +6,8 @@ Optional, ) -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.modules.base_processor import BaseProcessor +from preprocessor.services.core.base_processor import BaseProcessor +from preprocessor.services.core.logging import ErrorHandlingLogger class AudioNormalizer: diff --git a/preprocessor/lib/transcription/processors/episode_info_processor.py b/preprocessor/services/transcription/processors/episode_info_processor.py similarity index 96% rename from preprocessor/lib/transcription/processors/episode_info_processor.py rename to preprocessor/services/transcription/processors/episode_info_processor.py index 9be97a263..b4304d38a 100644 --- a/preprocessor/lib/transcription/processors/episode_info_processor.py +++ b/preprocessor/services/transcription/processors/episode_info_processor.py @@ -6,8 +6,8 @@ Tuple, ) -from preprocessor.lib.core.logging import 
ErrorHandlingLogger -from preprocessor.lib.episodes import EpisodeManager +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.episodes import EpisodeManager class EpisodeInfoProcessor: diff --git a/preprocessor/lib/transcription/processors/normalized_audio_processor.py b/preprocessor/services/transcription/processors/normalized_audio_processor.py similarity index 96% rename from preprocessor/lib/transcription/processors/normalized_audio_processor.py rename to preprocessor/services/transcription/processors/normalized_audio_processor.py index e46544421..c563ec8c4 100644 --- a/preprocessor/lib/transcription/processors/normalized_audio_processor.py +++ b/preprocessor/services/transcription/processors/normalized_audio_processor.py @@ -10,8 +10,8 @@ from faster_whisper import WhisperModel import torch -from preprocessor.lib.core.logging import ErrorHandlingLogger -from preprocessor.lib.transcription.whisper import WhisperUtils +from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.transcription.whisper import WhisperUtils class NormalizedAudioProcessor: diff --git a/preprocessor/lib/transcription/sound_classification.py b/preprocessor/services/transcription/sound_classification.py similarity index 100% rename from preprocessor/lib/transcription/sound_classification.py rename to preprocessor/services/transcription/sound_classification.py diff --git a/preprocessor/lib/transcription/utils.py b/preprocessor/services/transcription/utils.py similarity index 100% rename from preprocessor/lib/transcription/utils.py rename to preprocessor/services/transcription/utils.py diff --git a/preprocessor/lib/transcription/whisper.py b/preprocessor/services/transcription/whisper.py similarity index 95% rename from preprocessor/lib/transcription/whisper.py rename to preprocessor/services/transcription/whisper.py index 9c0bc6c0d..f7d65c9d8 100644 --- a/preprocessor/lib/transcription/whisper.py +++ 
b/preprocessor/services/transcription/whisper.py @@ -8,8 +8,8 @@ from faster_whisper import WhisperModel import torch -from preprocessor.lib.transcription.utils import WhisperUtils -from preprocessor.lib.ui.console import console +from preprocessor.services.transcription.utils import WhisperUtils +from preprocessor.services.ui.console import console class Whisper: diff --git a/preprocessor/lib/ui/__init__.py b/preprocessor/services/ui/__init__.py similarity index 62% rename from preprocessor/lib/ui/__init__.py rename to preprocessor/services/ui/__init__.py index cf8c5a9c5..3a67e434f 100644 --- a/preprocessor/lib/ui/__init__.py +++ b/preprocessor/services/ui/__init__.py @@ -1,8 +1,8 @@ -from preprocessor.lib.ui.console import ( +from preprocessor.services.ui.console import ( SimpleProgress, console, ) -from preprocessor.lib.ui.progress import ( +from preprocessor.services.ui.progress import ( OperationTracker, ProgressTracker, ) diff --git a/preprocessor/lib/ui/console.py b/preprocessor/services/ui/console.py similarity index 97% rename from preprocessor/lib/ui/console.py rename to preprocessor/services/ui/console.py index 1880c3a92..3e5a7c58d 100644 --- a/preprocessor/lib/ui/console.py +++ b/preprocessor/services/ui/console.py @@ -4,7 +4,7 @@ from rich.console import Console -from preprocessor.lib.core.time import TimeFormatter +from preprocessor.services.core.time import TimeFormatter _console_instance = None diff --git a/preprocessor/lib/ui/progress.py b/preprocessor/services/ui/progress.py similarity index 95% rename from preprocessor/lib/ui/progress.py rename to preprocessor/services/ui/progress.py index 58be57e8c..380ee82b7 100644 --- a/preprocessor/lib/ui/progress.py +++ b/preprocessor/services/ui/progress.py @@ -2,8 +2,8 @@ import time from typing import Optional -from preprocessor.lib.core.time import TimeFormatter -from preprocessor.lib.ui.console import console +from preprocessor.services.core.time import TimeFormatter +from 
preprocessor.services.ui.console import console class ProgressTracker: diff --git a/preprocessor/modules/validation/base_result.py b/preprocessor/services/validation/base_result.py similarity index 100% rename from preprocessor/modules/validation/base_result.py rename to preprocessor/services/validation/base_result.py diff --git a/preprocessor/modules/validation/episode_stats.py b/preprocessor/services/validation/episode_stats.py similarity index 98% rename from preprocessor/modules/validation/episode_stats.py rename to preprocessor/services/validation/episode_stats.py index ed1c64ed5..1f0329c0b 100644 --- a/preprocessor/modules/validation/episode_stats.py +++ b/preprocessor/services/validation/episode_stats.py @@ -21,10 +21,10 @@ OUTPUT_FILE_NAMES, OUTPUT_FILE_PATTERNS, ) -from preprocessor.lib.episodes import EpisodeInfo -from preprocessor.lib.io.path_manager import PathManager -from preprocessor.modules.validation.base_result import ValidationStatusMixin -from preprocessor.modules.validation.file_validators import FileValidator +from preprocessor.services.episodes import EpisodeInfo +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.base_result import ValidationStatusMixin +from preprocessor.services.validation.file_validators import FileValidator ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs diff --git a/preprocessor/modules/validation/file_validators.py b/preprocessor/services/validation/file_validators.py similarity index 100% rename from preprocessor/modules/validation/file_validators.py rename to preprocessor/services/validation/file_validators.py diff --git a/preprocessor/modules/validation/global_validator.py b/preprocessor/services/validation/global_validator.py similarity index 96% rename from preprocessor/modules/validation/global_validator.py rename to preprocessor/services/validation/global_validator.py index f8e3d0b21..4850e390e 100644 --- 
a/preprocessor/modules/validation/global_validator.py +++ b/preprocessor/services/validation/global_validator.py @@ -1,8 +1,8 @@ from pathlib import Path from typing import List -from preprocessor.modules.validation.base_result import BaseValidationResult -from preprocessor.modules.validation.file_validators import FileValidator +from preprocessor.services.validation.base_result import BaseValidationResult +from preprocessor.services.validation.file_validators import FileValidator class GlobalValidationResult(BaseValidationResult): diff --git a/preprocessor/modules/validation/report_generator.py b/preprocessor/services/validation/report_generator.py similarity index 88% rename from preprocessor/modules/validation/report_generator.py rename to preprocessor/services/validation/report_generator.py index 353f8a5b7..e0b58bae6 100644 --- a/preprocessor/modules/validation/report_generator.py +++ b/preprocessor/services/validation/report_generator.py @@ -6,8 +6,8 @@ Dict, ) -from preprocessor.modules.validation.episode_stats import EpisodeStats -from preprocessor.modules.validation.season_comparator import SeasonComparison +from preprocessor.services.validation.episode_stats import EpisodeStats +from preprocessor.services.validation.season_comparator import SeasonComparison class ReportGenerator: diff --git a/preprocessor/modules/validation/season_comparator.py b/preprocessor/services/validation/season_comparator.py similarity index 98% rename from preprocessor/modules/validation/season_comparator.py rename to preprocessor/services/validation/season_comparator.py index 985a48fe5..98a60c511 100644 --- a/preprocessor/modules/validation/season_comparator.py +++ b/preprocessor/services/validation/season_comparator.py @@ -9,7 +9,7 @@ Optional, ) -from preprocessor.modules.validation.episode_stats import EpisodeStats +from preprocessor.services.validation.episode_stats import EpisodeStats @dataclass diff --git a/preprocessor/modules/validation/validator.py 
b/preprocessor/services/validation/validator.py similarity index 93% rename from preprocessor/modules/validation/validator.py rename to preprocessor/services/validation/validator.py index ae84b0a4d..f270cc90f 100644 --- a/preprocessor/modules/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -9,12 +9,12 @@ from rich.progress import track from preprocessor.config.config import settings -from preprocessor.lib.episodes import EpisodeManager -from preprocessor.lib.io.files import FileOperations -from preprocessor.lib.io.path_manager import PathManager -from preprocessor.modules.validation.episode_stats import EpisodeStats -from preprocessor.modules.validation.report_generator import ReportGenerator -from preprocessor.modules.validation.season_comparator import SeasonComparison +from preprocessor.services.episodes import EpisodeManager +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.episode_stats import EpisodeStats +from preprocessor.services.validation.report_generator import ReportGenerator +from preprocessor.services.validation.season_comparator import SeasonComparison console = Console() diff --git a/preprocessor/services/video/__init__.py b/preprocessor/services/video/__init__.py new file mode 100644 index 000000000..6cfcddeef --- /dev/null +++ b/preprocessor/services/video/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.services.video.discovery import VideoDiscovery + +__all__ = ['VideoDiscovery'] diff --git a/preprocessor/lib/video/discovery.py b/preprocessor/services/video/discovery.py similarity index 100% rename from preprocessor/lib/video/discovery.py rename to preprocessor/services/video/discovery.py diff --git a/preprocessor/lib/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py similarity index 98% rename from preprocessor/lib/video/emotion_utils.py rename to 
preprocessor/services/video/emotion_utils.py index 547423902..1f3a8e5bb 100644 --- a/preprocessor/lib/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -9,7 +9,7 @@ import numpy as np from preprocessor.config.config import settings -from preprocessor.lib.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger EMOTION_LABELS = ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise'] diff --git a/preprocessor/lib/video/frame_utils.py b/preprocessor/services/video/frame_utils.py similarity index 100% rename from preprocessor/lib/video/frame_utils.py rename to preprocessor/services/video/frame_utils.py diff --git a/preprocessor/lib/video/image_hasher.py b/preprocessor/services/video/image_hasher.py similarity index 100% rename from preprocessor/lib/video/image_hasher.py rename to preprocessor/services/video/image_hasher.py diff --git a/preprocessor/services/video/strategies/__init__.py b/preprocessor/services/video/strategies/__init__.py new file mode 100644 index 000000000..91c924807 --- /dev/null +++ b/preprocessor/services/video/strategies/__init__.py @@ -0,0 +1,4 @@ +from preprocessor.services.video.strategies.base_strategy import BaseKeyframeStrategy +from preprocessor.services.video.strategies.scene_changes_strategy import SceneChangesStrategy + +__all__ = ['BaseKeyframeStrategy', 'SceneChangesStrategy'] diff --git a/preprocessor/lib/video/strategies/base_strategy.py b/preprocessor/services/video/strategies/base_strategy.py similarity index 100% rename from preprocessor/lib/video/strategies/base_strategy.py rename to preprocessor/services/video/strategies/base_strategy.py diff --git a/preprocessor/lib/video/strategies/scene_changes_strategy.py b/preprocessor/services/video/strategies/scene_changes_strategy.py similarity index 93% rename from preprocessor/lib/video/strategies/scene_changes_strategy.py rename to 
preprocessor/services/video/strategies/scene_changes_strategy.py index 26e5950c4..e2be552e1 100644 --- a/preprocessor/lib/video/strategies/scene_changes_strategy.py +++ b/preprocessor/services/video/strategies/scene_changes_strategy.py @@ -6,8 +6,8 @@ ) from preprocessor.config.enums import FrameType -from preprocessor.lib.ui.console import console -from preprocessor.lib.video.strategies.base_strategy import BaseKeyframeStrategy +from preprocessor.services.ui.console import console +from preprocessor.services.video.strategies.base_strategy import BaseKeyframeStrategy class SceneChangesStrategy(BaseKeyframeStrategy): diff --git a/preprocessor/lib/video/strategies/strategy_factory.py b/preprocessor/services/video/strategies/strategy_factory.py similarity index 69% rename from preprocessor/lib/video/strategies/strategy_factory.py rename to preprocessor/services/video/strategies/strategy_factory.py index c6e41ce15..cd37e4ae5 100644 --- a/preprocessor/lib/video/strategies/strategy_factory.py +++ b/preprocessor/services/video/strategies/strategy_factory.py @@ -1,6 +1,6 @@ from preprocessor.config.enums import KeyframeStrategy -from preprocessor.lib.video.strategies.base_strategy import BaseKeyframeStrategy -from preprocessor.lib.video.strategies.scene_changes_strategy import SceneChangesStrategy +from preprocessor.services.video.strategies.base_strategy import BaseKeyframeStrategy +from preprocessor.services.video.strategies.scene_changes_strategy import SceneChangesStrategy class KeyframeStrategyFactory: diff --git a/preprocessor/lib/transcription/generators/__init__.py b/preprocessor/steps/__init__.py similarity index 100% rename from preprocessor/lib/transcription/generators/__init__.py rename to preprocessor/steps/__init__.py diff --git a/preprocessor/steps/audio/__init__.py b/preprocessor/steps/audio/__init__.py new file mode 100644 index 000000000..7df9c8006 --- /dev/null +++ b/preprocessor/steps/audio/__init__.py @@ -0,0 +1,3 @@ +from 
preprocessor.steps.audio.separation import SoundSeparationStep + +__all__ = ['SoundSeparationStep'] diff --git a/preprocessor/lib/audio/separation.py b/preprocessor/steps/audio/separation.py similarity index 64% rename from preprocessor/lib/audio/separation.py rename to preprocessor/steps/audio/separation.py index 76ed3dd47..6bcc3b730 100644 --- a/preprocessor/lib/audio/separation.py +++ b/preprocessor/steps/audio/separation.py @@ -20,8 +20,8 @@ from preprocessor.core.artifacts import TranscriptionData from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import atomic_write_json -from preprocessor.lib.transcription.sound_classification import ( +from preprocessor.services.io.files import atomic_write_json +from preprocessor.services.transcription.sound_classification import ( classify_segment, is_sound_event, ) @@ -29,43 +29,89 @@ class SoundSeparationStep(PipelineStep[TranscriptionData, TranscriptionData, SoundSeparationConfig]): - def execute( # pylint: disable=too-many-locals + def execute( self, input_data: TranscriptionData, context: ExecutionContext, ) -> TranscriptionData: + output_paths = self._prepare_output_paths(input_data) + + if self._should_skip_processing(output_paths, context, input_data): + return self._create_cached_result(output_paths, input_data) + + context.mark_step_started(self.name, input_data.episode_id) + transcription_data = self._load_transcription_data(input_data) + dialogue_segments, sound_segments = self._separate_dialogue_from_sounds( + transcription_data['segments'], + ) + self._save_separated_data( + output_paths, + transcription_data['episode_info'], + dialogue_segments, + sound_segments, + ) + self._generate_additional_formats( + output_paths, + dialogue_segments, + sound_segments, + ) + context.mark_step_completed(self.name, input_data.episode_id) + + return self._create_result_artifact(output_paths, input_data) + + @property + def name(self) 
-> str: + return 'sound_separation' + + def _prepare_output_paths(self, input_data: TranscriptionData) -> Dict[str, Path]: base_name = input_data.path.stem.replace(FILE_SUFFIXES['segmented'], '') episode_dir = input_data.path.parent.parent clean_dir = episode_dir / 'clean' sound_dir = episode_dir / 'sound_events' clean_dir.mkdir(parents=True, exist_ok=True) sound_dir.mkdir(parents=True, exist_ok=True) - clean_json = ( - clean_dir / - f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}" - ) - sound_json = ( - sound_dir / - f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}" - ) + + return { + 'clean_json': clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}", + 'sound_json': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}", + 'clean_segmented': clean_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}", + 'sound_segmented': sound_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}", + 'clean_txt': clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}", + 'sound_txt': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}", + 'clean_srt': clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}", + 'sound_srt': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}", + } + + def _should_skip_processing( + self, + output_paths: Dict[str, Path], + context: ExecutionContext, + input_data: TranscriptionData, + ) -> bool: + clean_json = output_paths['clean_json'] + sound_json = output_paths['sound_json'] if clean_json.exists() and sound_json.exists() and (not context.force_rerun): if context.is_step_completed(self.name, input_data.episode_id): context.logger.info(f'Skipping {input_data.episode_id} (cached)') - return TranscriptionData( - path=clean_json, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - 
language=input_data.language, - model=input_data.model, - format='json', - ) - context.mark_step_started(self.name, input_data.episode_id) + return True + return False + + @staticmethod + def _load_transcription_data(input_data: TranscriptionData) -> Dict[str, Any]: with open(input_data.path, 'r', encoding='utf-8') as f: data = json.load(f) - episode_info_dict = data.get('episode_info', {}) - segments = data.get('segments', []) + return { + 'episode_info': data.get('episode_info', {}), + 'segments': data.get('segments', []), + } + + def _separate_dialogue_from_sounds( + self, + segments: List[Dict[str, Any]], + ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: dialogue_segments = [] sound_segments = [] + for segment in segments: classification = classify_segment(segment) if classification == 'dialogue': @@ -79,33 +125,45 @@ def execute( # pylint: disable=too-many-locals dialogue_parts, sound_parts = self.__split_mixed_segment(segment) dialogue_segments.extend(dialogue_parts) sound_segments.extend(sound_parts) + dialogue_segments = self.__renumber_segments(dialogue_segments) sound_segments = self.__renumber_segments(sound_segments) + + return dialogue_segments, sound_segments + + @staticmethod + def _save_separated_data( + output_paths: Dict[str, Path], + episode_info_dict: Dict[str, Any], + dialogue_segments: List[Dict[str, Any]], + sound_segments: List[Dict[str, Any]], + ) -> None: clean_data = {'episode_info': episode_info_dict, 'segments': dialogue_segments} sound_data = {'episode_info': episode_info_dict, 'segments': sound_segments} - atomic_write_json(clean_json, clean_data) - atomic_write_json(sound_json, sound_data) - clean_segmented = ( - clean_dir / - f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}" - ) - sound_segmented = ( - sound_dir / - f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}" - ) - atomic_write_json(clean_segmented, clean_data) - atomic_write_json(sound_segmented, sound_data) - 
clean_txt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}" - sound_txt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}" - clean_srt = clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}" - sound_srt = sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}" - self.__generate_txt_file(clean_json, clean_txt) - self.__generate_txt_file(sound_json, sound_txt) - self.__generate_srt_file(dialogue_segments, clean_srt) - self.__generate_srt_file(sound_segments, sound_srt) - context.mark_step_completed(self.name, input_data.episode_id) + + atomic_write_json(output_paths['clean_json'], clean_data) + atomic_write_json(output_paths['sound_json'], sound_data) + atomic_write_json(output_paths['clean_segmented'], clean_data) + atomic_write_json(output_paths['sound_segmented'], sound_data) + + def _generate_additional_formats( + self, + output_paths: Dict[str, Path], + dialogue_segments: List[Dict[str, Any]], + sound_segments: List[Dict[str, Any]], + ) -> None: + self.__generate_txt_file(output_paths['clean_json'], output_paths['clean_txt']) + self.__generate_txt_file(output_paths['sound_json'], output_paths['sound_txt']) + self.__generate_srt_file(dialogue_segments, output_paths['clean_srt']) + self.__generate_srt_file(sound_segments, output_paths['sound_srt']) + + @staticmethod + def _create_cached_result( + output_paths: Dict[str, Path], + input_data: TranscriptionData, + ) -> TranscriptionData: return TranscriptionData( - path=clean_json, + path=output_paths['clean_json'], episode_id=input_data.episode_id, episode_info=input_data.episode_info, language=input_data.language, @@ -113,9 +171,19 @@ def execute( # pylint: disable=too-many-locals format='json', ) - @property - def name(self) -> str: - return 'sound_separation' + @staticmethod + def _create_result_artifact( + output_paths: Dict[str, Path], + input_data: TranscriptionData, + ) -> TranscriptionData: + return 
TranscriptionData( + path=output_paths['clean_json'], + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + language=input_data.language, + model=input_data.model, + format='json', + ) @staticmethod def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: diff --git a/preprocessor/steps/packaging/__init__.py b/preprocessor/steps/packaging/__init__.py new file mode 100644 index 000000000..677d2f58d --- /dev/null +++ b/preprocessor/steps/packaging/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.steps.packaging.archives import ArchiveGenerationStep + +__all__ = ['ArchiveGenerationStep'] diff --git a/preprocessor/modules/packaging/archives.py b/preprocessor/steps/packaging/archives.py similarity index 52% rename from preprocessor/modules/packaging/archives.py rename to preprocessor/steps/packaging/archives.py index c77a8f642..126904dc1 100644 --- a/preprocessor/modules/packaging/archives.py +++ b/preprocessor/steps/packaging/archives.py @@ -12,17 +12,42 @@ class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig]): def execute(self, input_data: ProcessedEpisode, context: ExecutionContext) -> ArchiveArtifact: - output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' - output_path: Path = context.get_output_path(input_data.episode_info, 'archives', output_filename) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached archive)') - return ArchiveArtifact(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._create_archive_artifact(input_data, output_path) + context.logger.info(f'Generating archive for {input_data.episode_id}') 
context.mark_step_started(self.name, input_data.episode_id) context.mark_step_completed(self.name, input_data.episode_id) - return ArchiveArtifact(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + + return self._create_archive_artifact(input_data, output_path) @property def name(self) -> str: return 'archive_generation' + + @staticmethod + def _get_output_path(input_data: ProcessedEpisode, context: ExecutionContext) -> Path: + output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' + return context.get_output_path(input_data.episode_info, 'archives', output_filename) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: ProcessedEpisode, + ) -> bool: + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached archive)') + return True + return False + + @staticmethod + def _create_archive_artifact(input_data: ProcessedEpisode, output_path: Path) -> ArchiveArtifact: + return ArchiveArtifact( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) diff --git a/preprocessor/steps/scraping/__init__.py b/preprocessor/steps/scraping/__init__.py new file mode 100644 index 000000000..77a521f2d --- /dev/null +++ b/preprocessor/steps/scraping/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.services.scraping.base_scraper_step import BaseScraperStep +from preprocessor.steps.scraping.character_scraper_step import CharacterScraperStep +from preprocessor.steps.scraping.episode_scraper_step import EpisodeScraperStep +from preprocessor.steps.scraping.reference_processor_step import CharacterReferenceStep + +__all__ = ['BaseScraperStep', 'CharacterReferenceStep', 'CharacterScraperStep', 'EpisodeScraperStep'] diff --git a/preprocessor/modules/scraping/character_scraper_step.py 
b/preprocessor/steps/scraping/character_scraper_step.py similarity index 69% rename from preprocessor/modules/scraping/character_scraper_step.py rename to preprocessor/steps/scraping/character_scraper_step.py index 08a43e462..c014c8d7c 100644 --- a/preprocessor/modules/scraping/character_scraper_step.py +++ b/preprocessor/steps/scraping/character_scraper_step.py @@ -1,6 +1,6 @@ from preprocessor.config.step_configs import CharacterScraperConfig -from preprocessor.modules.scraping.base_scraper_step import BaseScraperStep -from preprocessor.modules.scraping.character_scraper import CharacterScraper +from preprocessor.services.scraping.base_scraper_step import BaseScraperStep +from preprocessor.services.scraping.character_scraper import CharacterScraper class CharacterScraperStep(BaseScraperStep[CharacterScraperConfig]): diff --git a/preprocessor/modules/scraping/episode_scraper_step.py b/preprocessor/steps/scraping/episode_scraper_step.py similarity index 82% rename from preprocessor/modules/scraping/episode_scraper_step.py rename to preprocessor/steps/scraping/episode_scraper_step.py index 27188b091..f7392538a 100644 --- a/preprocessor/modules/scraping/episode_scraper_step.py +++ b/preprocessor/steps/scraping/episode_scraper_step.py @@ -6,8 +6,8 @@ from preprocessor.config.step_configs import EpisodeScraperConfig from preprocessor.core.context import ExecutionContext -from preprocessor.modules.scraping.base_scraper_step import BaseScraperStep -from preprocessor.modules.scraping.episode_scraper import EpisodeScraper +from preprocessor.services.scraping.base_scraper_step import BaseScraperStep +from preprocessor.services.scraping.episode_scraper import EpisodeScraper class EpisodeScraperStep(BaseScraperStep[EpisodeScraperConfig]): diff --git a/preprocessor/modules/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py similarity index 71% rename from preprocessor/modules/scraping/reference_processor_step.py rename to 
preprocessor/steps/scraping/reference_processor_step.py index 1dbff41f0..929f370ad 100644 --- a/preprocessor/modules/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -5,7 +5,7 @@ from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.modules.scraping.reference_processor import CharacterReferenceProcessor +from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor class CharacterReferenceStep( @@ -21,20 +21,48 @@ def execute( if self._executed: return input_data + characters_path, output_dir = self._get_paths() + self._validate_characters_file(characters_path) + + if self._should_skip_processing(output_dir, context): + self._executed = True + return input_data + + self._process_character_references(characters_path, output_dir, context) + self._executed = True + + return input_data + + @property + def name(self) -> str: + return "process_character_references" + + def _get_paths(self) -> tuple[Path, Path]: characters_path = Path(self.config.characters_file) output_dir = Path(self.config.output_dir) + return characters_path, output_dir + @staticmethod + def _validate_characters_file(characters_path: Path) -> None: if not characters_path.exists(): raise FileNotFoundError( f"Characters file not found: {characters_path}. 
" f"Run scrape_characters first.", ) + @staticmethod + def _should_skip_processing(output_dir: Path, context: ExecutionContext) -> bool: if output_dir.exists() and any(output_dir.iterdir()) and not context.force_rerun: context.logger.info(f"Character references already exist in: {output_dir}") - self._executed = True - return input_data + return True + return False + def _process_character_references( + self, + characters_path: Path, + output_dir: Path, + context: ExecutionContext, + ) -> None: context.logger.info(f"Processing character references from {characters_path}") processor = CharacterReferenceProcessor( @@ -54,10 +82,3 @@ def execute( ) context.logger.info(f"Character references saved to: {output_dir}") - - self._executed = True - return input_data - - @property - def name(self) -> str: - return "process_character_references" diff --git a/preprocessor/modules/__init__.py b/preprocessor/steps/search/__init__.py similarity index 100% rename from preprocessor/modules/__init__.py rename to preprocessor/steps/search/__init__.py diff --git a/preprocessor/modules/search/document_generation.py b/preprocessor/steps/search/document_generation.py similarity index 74% rename from preprocessor/modules/search/document_generation.py rename to preprocessor/steps/search/document_generation.py index a750c6e49..119fc2062 100644 --- a/preprocessor/modules/search/document_generation.py +++ b/preprocessor/steps/search/document_generation.py @@ -12,36 +12,76 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import load_json +from preprocessor.services.io.files import load_json class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDocuments: - if not hasattr(input_data, 'episode_info'): - raise ValueError('Input artifact must have episode_info') - episode_info = 
getattr(input_data, 'episode_info') - episode_id = getattr(input_data, 'episode_id') + episode_info, episode_id = self._extract_episode_info(input_data) output_dir = context.get_output_path(episode_info, 'elastic_documents', '') - if output_dir.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, episode_id): - return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=0) + + if self._should_skip_processing(output_dir, context, episode_id): + return self._create_empty_result(episode_id, episode_info, output_dir) + context.logger.info(f'Generating Elasticsearch documents for {episode_id}') context.mark_step_started(self.name, episode_id) + data = self.__gather_input_data(episode_info, context) - generated_files = [] - total_docs = 0 - if self.config.generate_segments and 'transcription' in data: - path, count = self.__generate_segments_jsonl(data, episode_info, context) - generated_files.append(path) - total_docs += count + total_docs = self._generate_documents(data, episode_info, context) + context.mark_step_completed(self.name, episode_id) - return ElasticDocuments(episode_id=episode_id, episode_info=episode_info, path=output_dir, document_count=total_docs) + return ElasticDocuments( + episode_id=episode_id, + episode_info=episode_info, + path=output_dir, + document_count=total_docs, + ) @property def name(self) -> str: return 'document_generation' + @staticmethod + def _extract_episode_info(input_data: Artifact) -> tuple[Any, str]: + if not hasattr(input_data, 'episode_info'): + raise ValueError('Input artifact must have episode_info') + episode_info = getattr(input_data, 'episode_info') + episode_id = getattr(input_data, 'episode_id') + return episode_info, episode_id + + def _should_skip_processing( + self, + output_dir: Path, + context: ExecutionContext, + episode_id: str, + ) -> bool: + if output_dir.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, 
episode_id): + return True + return False + + @staticmethod + def _create_empty_result(episode_id: str, episode_info: Any, output_dir: Path) -> ElasticDocuments: + return ElasticDocuments( + episode_id=episode_id, + episode_info=episode_info, + path=output_dir, + document_count=0, + ) + + def _generate_documents( + self, + data: Dict[str, Any], + episode_info: Any, + context: ExecutionContext, + ) -> int: + total_docs = 0 + if self.config.generate_segments and 'transcription' in data: + _, count = self.__generate_segments_jsonl(data, episode_info, context) + total_docs += count + return total_docs + @staticmethod def __build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: return {'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name} diff --git a/preprocessor/steps/search/indexing.py b/preprocessor/steps/search/indexing.py new file mode 100644 index 000000000..3cd6707fb --- /dev/null +++ b/preprocessor/steps/search/indexing.py @@ -0,0 +1,154 @@ +import asyncio +import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.step_configs import ElasticsearchConfig +from preprocessor.core.artifacts import ( + ElasticDocuments, + IndexingResult, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.services.search.elasticsearch import ElasticsearchWrapper + + +class ElasticsearchIndexerStep(PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig]): + + def __init__(self, config: ElasticsearchConfig) -> None: + super().__init__(config) + self._es: Optional[ElasticsearchWrapper] = None + + def cleanup(self) -> None: + if self._es: + asyncio.run(self._es.close()) + self._es = None + + def execute(self, input_data: List[ElasticDocuments], context: ExecutionContext) -> IndexingResult: + return 
asyncio.run(self._execute_async(input_data, context)) + + @property + def name(self) -> str: + return 'elasticsearch_indexing' + + async def _execute_async( + self, + input_data: List[ElasticDocuments], + context: ExecutionContext, + ) -> IndexingResult: + if not input_data: + return await self._create_empty_result(context) + + docs_by_type = self._group_documents_by_type(input_data) + total_indexed = await self._index_all_document_types(docs_by_type, context) + + return IndexingResult( + index_name=self.config.index_name, + document_count=total_indexed, + success=True, + ) + + async def _create_empty_result(self, context: ExecutionContext) -> IndexingResult: + context.logger.warning('No documents to index.') + return IndexingResult( + index_name=self.config.index_name, + document_count=0, + success=True, + ) + + @staticmethod + def _group_documents_by_type(input_data: List[ElasticDocuments]) -> Dict[str, List[Path]]: + docs_by_type: Dict[str, List[Path]] = {} + for doc_artifact in input_data: + doc_type: str = doc_artifact.path.parent.name + if doc_type not in docs_by_type: + docs_by_type[doc_type] = [] + docs_by_type[doc_type].append(doc_artifact.path) + return docs_by_type + + async def _index_all_document_types( + self, + docs_by_type: Dict[str, List[Path]], + context: ExecutionContext, + ) -> int: + total_indexed: int = 0 + for doc_type, paths in docs_by_type.items(): + try: + indexed_count = await self._index_document_type(doc_type, paths, context) + total_indexed += indexed_count + except Exception as e: + context.logger.error(f'Elasticsearch indexing failed for {doc_type}: {e}') + raise + return total_indexed + + async def _index_document_type( + self, + doc_type: str, + paths: List[Path], + context: ExecutionContext, + ) -> int: + index_name: str = f'{self.config.index_name}_{doc_type}' + context.logger.info(f'Indexing {len(paths)} files into {index_name}') + + await self._ensure_elasticsearch_wrapper(index_name) + await self._prepare_index(doc_type) + + 
documents = self._load_documents_from_paths(paths) + return await self._bulk_index_documents(documents, index_name, context) + + async def _ensure_elasticsearch_wrapper(self, index_name: str) -> None: + if self._es is None or self._es.index_name != index_name: + if self._es is not None: + await self._es.close() + self._es = ElasticsearchWrapper( + index_name=index_name, + host=self.config.host, + dry_run=self.config.dry_run, + ) + + async def _prepare_index(self, doc_type: str) -> None: + if not self.config.append: + await self._es.delete_index() + + mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type(doc_type) + if mapping: + await self._es.create_index(mapping) + + @staticmethod + def _load_documents_from_paths(paths: List[Path]) -> List[Dict[str, Any]]: + documents: List[Dict[str, Any]] = [] + for path in paths: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + documents.append(json.loads(line)) + return documents + + async def _bulk_index_documents( + self, + documents: List[Dict[str, Any]], + index_name: str, + context: ExecutionContext, + ) -> int: + if not documents: + return 0 + + if not self.config.dry_run: + await self._es.bulk_index(documents) + return len(documents) + + context.logger.info( + f'Dry-run: would index {len(documents)} docs to {index_name}', + ) + return 0 + + @staticmethod + def __get_mapping_for_type( + doc_type: str, # pylint: disable=unused-argument + ) -> Optional[Dict[str, Any]]: + return None diff --git a/preprocessor/steps/text/__init__.py b/preprocessor/steps/text/__init__.py new file mode 100644 index 000000000..a6986c9fd --- /dev/null +++ b/preprocessor/steps/text/__init__.py @@ -0,0 +1,6 @@ +from preprocessor.services.text.import_step import TranscriptionImportStep +from preprocessor.steps.text.analysis import TextAnalysisStep +from preprocessor.steps.text.embeddings import TextEmbeddingStep +from preprocessor.steps.text.transcription import TranscriptionStep + +__all__ = 
['TextAnalysisStep', 'TextEmbeddingStep', 'TranscriptionImportStep', 'TranscriptionStep'] diff --git a/preprocessor/steps/text/analysis.py b/preprocessor/steps/text/analysis.py new file mode 100644 index 000000000..8a7c42f01 --- /dev/null +++ b/preprocessor/steps/text/analysis.py @@ -0,0 +1,104 @@ +from datetime import datetime +from pathlib import Path +from typing import ( + Any, + Dict, +) + +from preprocessor.config.step_configs import TextAnalysisConfig +from preprocessor.core.artifacts import ( + TextAnalysisResults, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.services.io.files import ( + atomic_write_json, + load_json, +) +from preprocessor.services.text.text_statistics import TextStatistics + + +class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): + + def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> TextAnalysisResults: + output_path = self._get_output_path(input_data) + + if self._should_skip_processing(output_path, context, input_data): + return self._load_cached_result(output_path, input_data) + + context.logger.info(f'Analyzing text for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + txt_path = self._get_text_file_path(input_data) + stats = self._analyze_text_statistics(txt_path) + result_data = self._build_result_data(stats, txt_path, input_data) + + atomic_write_json(output_path, result_data) + context.mark_step_completed(self.name, input_data.episode_id) + + return TextAnalysisResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + statistics=result_data, + ) + + @property + def name(self) -> str: + return 'text_analysis' + + @staticmethod + def _get_output_path(input_data: TranscriptionData) -> Path: + output_filename = input_data.path.stem + '_text_stats.json' + return 
input_data.path.parent / output_filename + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: TranscriptionData, + ) -> bool: + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached)') + return True + return False + + @staticmethod + def _load_cached_result(output_path: Path, input_data: TranscriptionData) -> TextAnalysisResults: + stats_data = load_json(output_path) + return TextAnalysisResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + statistics=stats_data, + ) + + @staticmethod + def _get_text_file_path(input_data: TranscriptionData) -> Path: + txt_path = input_data.path + if input_data.format != 'txt': + txt_path = input_data.path.with_suffix('.txt') + if not txt_path.exists(): + raise FileNotFoundError(f'Transcription text file not found: {txt_path}') + return txt_path + + def _analyze_text_statistics(self, txt_path: Path) -> TextStatistics: + return TextStatistics.from_file(txt_path, language=self.config.language) + + def _build_result_data( + self, + stats: TextStatistics, + txt_path: Path, + input_data: TranscriptionData, + ) -> Dict[str, Any]: + return { + 'metadata': { + 'episode_id': input_data.episode_id, + 'language': self.config.language, + 'source_file': txt_path.name, + 'analyzed_at': datetime.now().isoformat(), + }, + **stats.to_dict(), + } diff --git a/preprocessor/modules/text/embeddings.py b/preprocessor/steps/text/embeddings.py similarity index 68% rename from preprocessor/modules/text/embeddings.py rename to preprocessor/steps/text/embeddings.py index 12ca67d5a..d226a7c73 100644 --- a/preprocessor/modules/text/embeddings.py +++ b/preprocessor/steps/text/embeddings.py @@ -14,12 +14,12 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from 
preprocessor.lib.io.files import ( +from preprocessor.services.io.files import ( atomic_write_json, load_json, ) -from preprocessor.lib.io.metadata import MetadataBuilder -from preprocessor.lib.search.embedding_model import EmbeddingModelWrapper +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper class TextEmbeddingStep(PipelineStep[TranscriptionData, EmbeddingCollection, TextEmbeddingConfig]): @@ -32,67 +32,144 @@ def cleanup(self) -> None: if self._model: self._model = None - def execute( # pylint: disable=too-many-locals + def execute( self, input_data: TranscriptionData, context: ExecutionContext, ) -> EmbeddingCollection: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._load_cached_result(output_path, input_data) + + segments = self._load_and_validate_segments(input_data, context) + if not segments: + return self._create_embedding_collection(input_data, output_path, 0) + + self._ensure_model_loaded() + context.logger.info(f'Generating text embeddings for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + results = self._generate_embeddings(segments) + self._save_results(results, output_path, input_data) + + context.mark_step_completed(self.name, input_data.episode_id) + return self._create_embedding_collection(input_data, output_path, len(results)) + + @property + def name(self) -> str: + return 'text_embedding' + + @staticmethod + def _get_output_path(input_data: TranscriptionData, context: ExecutionContext) -> Path: episode_code = input_data.episode_info.episode_code() output_filename: str = f'{context.series_name}_{episode_code}_embeddings_text.json' - output_path: Path = context.get_output_path( - input_data.episode_info, - 'embeddings', - output_filename, + return context.get_output_path(input_data.episode_info, 'embeddings', 
output_filename) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: TranscriptionData, + ) -> bool: + return self._check_cache_validity( + output_path, + context, + input_data.episode_id, + 'cached text embeddings', ) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info( - f'Skipping {input_data.episode_id} (cached text embeddings)', - ) - emb_data: Dict[str, Any] = load_json(output_path) - return self._create_embedding_collection( - input_data, - output_path, - len(emb_data.get('results', [])), - ) + + def _load_cached_result( # pylint: disable=duplicate-code + self, + output_path: Path, + input_data: TranscriptionData, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = load_json(output_path) + return self._create_embedding_collection( + input_data, + output_path, + len(emb_data.get('results', [])), + ) + + def _load_and_validate_segments( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> List[Dict[str, Any]]: transcription: Dict[str, Any] = self.__load_clean_transcription(input_data, context) segments: List[Dict[str, Any]] = transcription.get('segments', []) if not segments: context.logger.warning(f'No text segments for embedding in {input_data.episode_id}') - return self._create_embedding_collection(input_data, output_path, 0) + return segments + + def _ensure_model_loaded(self) -> None: if self._model is None: self._model = EmbeddingModelWrapper( self.config.model_name, self.config.device, self.config.batch_size, ) - context.logger.info(f'Generating text embeddings for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) + + def _generate_embeddings(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: full_text: str = ' '.join([seg.get('text', '') for seg in segments]) sentences: List[str] = self.__split_into_sentences(full_text) + 
text_chunks, chunk_metadata = self._create_text_chunks(sentences, segments) + return self._batch_encode_chunks(text_chunks, chunk_metadata) + + def _create_text_chunks( + self, + sentences: List[str], + segments: List[Dict[str, Any]], + ) -> tuple[List[str], List[Dict[str, Any]]]: text_chunks: List[str] = [] chunk_metadata: List[Dict[str, Any]] = [] step: int = self.config.text_sentences_per_chunk - self.config.text_chunk_overlap + for i in range(0, len(sentences), step): chunk_sentences: List[str] = sentences[i:i + self.config.text_sentences_per_chunk] if not chunk_sentences: continue + chunk_text: str = ' '.join(chunk_sentences).strip() if not chunk_text: continue + char_start: int = sum((len(s) + 1 for s in sentences[:i])) char_end: int = char_start + len(chunk_text) start_seg_id: int = self.__find_segment_at_position(segments, char_start) end_seg_id: int = self.__find_segment_at_position(segments, char_end) + text_chunks.append(chunk_text) - chunk_metadata.append({'segment_range': [start_seg_id, end_seg_id], 'text': chunk_text}) + chunk_metadata.append({ + 'segment_range': [start_seg_id, end_seg_id], + 'text': chunk_text, + }) + + return text_chunks, chunk_metadata + + def _batch_encode_chunks( + self, + text_chunks: List[str], + chunk_metadata: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] + for i in range(0, len(text_chunks), self.config.batch_size): batch_texts: List[str] = text_chunks[i:i + self.config.batch_size] batch_meta: List[Dict[str, Any]] = chunk_metadata[i:i + self.config.batch_size] batch_embeddings: List[List[float]] = self._model.encode_text(batch_texts) + for meta, emb in zip(batch_meta, batch_embeddings): results.append({**meta, 'embedding': emb}) + + return results + + def _save_results( + self, + results: List[Dict[str, Any]], + output_path: Path, + input_data: TranscriptionData, + ) -> None: output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( episode_info=input_data.episode_info, 
processing_params=self.config.dict(), @@ -104,12 +181,6 @@ def execute( # pylint: disable=too-many-locals results_data=results, ) atomic_write_json(output_path, output_data) - context.mark_step_completed(self.name, input_data.episode_id) - return self._create_embedding_collection(input_data, output_path, len(results)) - - @property - def name(self) -> str: - return 'text_embedding' def _create_embedding_collection( # pylint: disable=duplicate-code self, diff --git a/preprocessor/lib/text/transcription.py b/preprocessor/steps/text/transcription.py similarity index 60% rename from preprocessor/lib/text/transcription.py rename to preprocessor/steps/text/transcription.py index fc75e0510..908846d63 100644 --- a/preprocessor/lib/text/transcription.py +++ b/preprocessor/steps/text/transcription.py @@ -12,9 +12,9 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.episodes.episode_manager import EpisodeManager -from preprocessor.lib.io.files import atomic_write_json -from preprocessor.lib.transcription.whisper import Whisper +from preprocessor.services.episodes.episode_manager import EpisodeManager +from preprocessor.services.io.files import atomic_write_json +from preprocessor.services.transcription.whisper import Whisper class TranscriptionStep(PipelineStep[AudioArtifact, TranscriptionData, WhisperTranscriptionConfig]): @@ -29,27 +29,60 @@ def cleanup(self) -> None: self._whisper = None def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._create_cached_result(output_path, input_data) + + self._ensure_whisper_loaded() + context.logger.info( + f'Transcribing {input_data.episode_id} using Whisper {self.config.model}', + ) + context.mark_step_started(self.name, input_data.episode_id) + + result = 
self._transcribe_audio(input_data, output_path, context) + context.mark_step_completed(self.name, input_data.episode_id) + + return self._create_result_artifact(output_path, input_data, result) + + @property + def name(self) -> str: + return 'transcription' + + @staticmethod + def _get_output_path(input_data: AudioArtifact, context: ExecutionContext) -> Path: output_filename: str = ( f'{context.series_name}_{input_data.episode_info.episode_code()}.json' ) - output_path: Path = context.get_output_path( + return context.get_output_path( input_data.episode_info, 'transcriptions', f'raw/{output_filename}', ) + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: AudioArtifact, + ) -> bool: if output_path.exists() and (not context.force_rerun): if context.is_step_completed(self.name, input_data.episode_id): context.logger.info(f'Skipping {input_data.episode_id} (cached transcription)') - return TranscriptionData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - language=self.config.language, - model=self.config.model, - format='json', - ) + return True + return False + + def _create_cached_result(self, output_path: Path, input_data: AudioArtifact) -> TranscriptionData: + return TranscriptionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + language=self.config.language, + model=self.config.model, + format='json', + ) + def _ensure_whisper_loaded(self) -> None: if self._whisper is None: self._whisper = Whisper( model=self.config.model, @@ -58,15 +91,17 @@ def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> Trans beam_size=self.config.beam_size, ) - context.logger.info( - f'Transcribing {input_data.episode_id} using Whisper {self.config.model}', - ) - context.mark_step_started(self.name, input_data.episode_id) - + def _transcribe_audio( + self, + input_data: AudioArtifact, + output_path: Path, + 
context: ExecutionContext, + ) -> Dict[str, Any]: try: result: Dict[str, Any] = self._whisper.transcribe(input_data.path) result['episode_info'] = EpisodeManager.get_metadata(input_data.episode_info) atomic_write_json(output_path, result) + return result except Exception as e: context.logger.error( f'Whisper transcription failed for {input_data.episode_id}: {e}', @@ -75,7 +110,12 @@ def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> Trans output_path.unlink() raise - context.mark_step_completed(self.name, input_data.episode_id) + def _create_result_artifact( + self, + output_path: Path, + input_data: AudioArtifact, + result: Dict[str, Any], + ) -> TranscriptionData: return TranscriptionData( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -84,7 +124,3 @@ def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> Trans model=self.config.model, format='json', ) - - @property - def name(self) -> str: - return 'transcription' diff --git a/preprocessor/steps/validation/__init__.py b/preprocessor/steps/validation/__init__.py new file mode 100644 index 000000000..0c9efa03a --- /dev/null +++ b/preprocessor/steps/validation/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.steps.validation.validator_step import ValidationStep + +__all__ = ['ValidationStep'] diff --git a/preprocessor/steps/validation/validator_step.py b/preprocessor/steps/validation/validator_step.py new file mode 100644 index 000000000..cfdbc6eca --- /dev/null +++ b/preprocessor/steps/validation/validator_step.py @@ -0,0 +1,47 @@ +from preprocessor.config.step_configs import ValidationConfig +from preprocessor.core.artifacts import ( + ElasticDocuments, + ValidationResult, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.services.validation.validator import Validator + + +class ValidationStep(PipelineStep[ElasticDocuments, ValidationResult, ValidationConfig]): + 
+ def execute( + self, + input_data: ElasticDocuments, + context: ExecutionContext, + ) -> ValidationResult: + context.logger.info(f"Starting validation for season {context.season}") + + validator = self._create_validator(context) + self._run_validation(validator) + + context.logger.info("Validation completed successfully") + + return ValidationResult( + season=context.season, + validation_report_dir=validator.validation_reports_dir, + ) + + @property + def name(self) -> str: + return "validate" + + def _create_validator(self, context: ExecutionContext) -> Validator: + return Validator( + season=context.season, + series_name=context.series_name, + anomaly_threshold=self.config.anomaly_threshold, + base_output_dir=context.base_output_dir, + episodes_info_json=self.config.episodes_info_json, + ) + + @staticmethod + def _run_validation(validator: Validator) -> None: + exit_code = validator.validate() + if exit_code != 0: + raise RuntimeError(f"Validation failed with exit code {exit_code}") diff --git a/preprocessor/modules/search/__init__.py b/preprocessor/steps/video/__init__.py similarity index 100% rename from preprocessor/modules/search/__init__.py rename to preprocessor/steps/video/__init__.py diff --git a/preprocessor/modules/video/frame_export.py b/preprocessor/steps/video/frame_export.py similarity index 74% rename from preprocessor/modules/video/frame_export.py rename to preprocessor/steps/video/frame_export.py index 19c111533..4a266df9b 100644 --- a/preprocessor/modules/video/frame_export.py +++ b/preprocessor/steps/video/frame_export.py @@ -20,8 +20,8 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import atomic_write_json -from preprocessor.lib.video.strategies.strategy_factory import KeyframeStrategyFactory +from preprocessor.services.io.files import atomic_write_json +from preprocessor.services.video.strategies.strategy_factory import KeyframeStrategyFactory 
class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExportConfig]): @@ -32,60 +32,126 @@ def __init__(self, config: FrameExportConfig): self.strategy = KeyframeStrategyFactory.create(self.config.keyframe_strategy, self.config.frames_per_scene) def execute(self, input_data: SceneCollection, context: ExecutionContext) -> FrameCollection: + episode_dir, metadata_file = self._prepare_output_paths(input_data, context) + + if self._should_skip_processing(metadata_file, context, input_data): + return self._load_cached_result(metadata_file, episode_dir, input_data) + + self._prepare_episode_directory(episode_dir, context) + frame_requests = self._extract_frame_requests(input_data) + + if not frame_requests: + return self._create_empty_result(episode_dir, metadata_file, input_data, context) + + context.logger.info(f'Extracting {len(frame_requests)} keyframes from {input_data.video_path.name}') + context.mark_step_started(self.name, input_data.episode_id) + + self._process_frame_extraction( + input_data.video_path, + frame_requests, + episode_dir, + input_data, + metadata_file, + context, + ) + + context.mark_step_completed(self.name, input_data.episode_id) + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=len(frame_requests), + metadata_path=metadata_file, + ) + + @property + def name(self) -> str: + return 'frame_export' + + @staticmethod + def _prepare_output_paths( + input_data: SceneCollection, + context: ExecutionContext, + ) -> tuple[Path, Path]: episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' metadata_file = episode_dir / metadata_filename + return episode_dir, metadata_file + + def _should_skip_processing( + self, + metadata_file: Path, + context: ExecutionContext, + input_data: SceneCollection, + ) -> bool: if 
metadata_file.exists() and (not context.force_rerun): if context.is_step_completed(self.name, input_data.episode_id): context.logger.info(f'Skipping {input_data.episode_id} (cached)') - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata = json.load(f) - return FrameCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - directory=episode_dir, - frame_count=metadata['statistics']['total_frames'], - metadata_path=metadata_file, - ) + return True + return False + + @staticmethod + def _load_cached_result( + metadata_file: Path, + episode_dir: Path, + input_data: SceneCollection, + ) -> FrameCollection: + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=metadata['statistics']['total_frames'], + metadata_path=metadata_file, + ) + + @staticmethod + def _prepare_episode_directory(episode_dir: Path, context: ExecutionContext) -> None: if episode_dir.exists(): context.logger.info(f'Cleaning incomplete frames from previous run: {episode_dir}') shutil.rmtree(episode_dir, ignore_errors=True) episode_dir.mkdir(parents=True, exist_ok=True) + + def _extract_frame_requests(self, input_data: SceneCollection) -> List[FrameRequest]: video_path = input_data.video_path if not video_path.exists(): raise FileNotFoundError(f'Video file not found for frame export: {video_path}') data = {'scene_timestamps': {'scenes': input_data.scenes}} - frame_requests = self.strategy.extract_frame_requests(video_path, data) - if not frame_requests: - context.logger.warning(f'No frames to extract for {input_data.episode_id}') - return FrameCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - directory=episode_dir, - frame_count=0, - metadata_path=metadata_file, - ) - context.logger.info(f'Extracting {len(frame_requests)} keyframes from 
{video_path.name}') - context.mark_step_started(self.name, input_data.episode_id) - try: - self.__extract_frames(video_path, frame_requests, episode_dir, input_data.episode_info, context) - self.__write_metadata(frame_requests, input_data.episode_info, video_path, context, metadata_file) - except Exception as e: - context.logger.error(f'Failed to extract frames from {video_path}: {e}') - shutil.rmtree(episode_dir, ignore_errors=True) - raise - context.mark_step_completed(self.name, input_data.episode_id) + return self.strategy.extract_frame_requests(video_path, data) + + @staticmethod + def _create_empty_result( + episode_dir: Path, + metadata_file: Path, + input_data: SceneCollection, + context: ExecutionContext, + ) -> FrameCollection: + context.logger.warning(f'No frames to extract for {input_data.episode_id}') return FrameCollection( episode_id=input_data.episode_id, episode_info=input_data.episode_info, directory=episode_dir, - frame_count=len(frame_requests), + frame_count=0, metadata_path=metadata_file, ) - @property - def name(self) -> str: - return 'frame_export' + def _process_frame_extraction( + self, + video_path: Path, + frame_requests: List[FrameRequest], + episode_dir: Path, + input_data: SceneCollection, + metadata_file: Path, + context: ExecutionContext, + ) -> None: + try: + self.__extract_frames(video_path, frame_requests, episode_dir, input_data.episode_info, context) + self.__write_metadata(frame_requests, input_data.episode_info, video_path, context, metadata_file) + except Exception as e: + context.logger.error(f'Failed to extract frames from {video_path}: {e}') + shutil.rmtree(episode_dir, ignore_errors=True) + raise @staticmethod def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: diff --git a/preprocessor/modules/video/scene_detection.py b/preprocessor/steps/video/scene_detection.py similarity index 53% rename from preprocessor/modules/video/scene_detection.py rename to preprocessor/steps/video/scene_detection.py index 
c60570b48..868aa7c4d 100644 --- a/preprocessor/modules/video/scene_detection.py +++ b/preprocessor/steps/video/scene_detection.py @@ -1,3 +1,10 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + from preprocessor.config.step_configs import SceneDetectionConfig from preprocessor.core.artifacts import ( SceneCollection, @@ -5,11 +12,11 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import ( +from preprocessor.services.io.files import ( atomic_write_json, load_json, ) -from preprocessor.lib.media.scene_detection import TransNetWrapper +from preprocessor.services.media.scene_detection import TransNetWrapper class SceneDetectorStep(PipelineStep[TranscodedVideo, SceneCollection, SceneDetectionConfig]): @@ -25,33 +32,69 @@ def cleanup(self) -> None: self._model_loaded = False def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> SceneCollection: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._load_cached_result(output_path, input_data) + + self._ensure_model_loaded(context) + context.logger.info(f'Detecting scenes in {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + scenes = self._detect_scenes(input_data.path) + self._save_results(scenes, input_data.path, output_path) + + context.mark_step_completed(self.name, input_data.episode_id) + return self._create_scene_collection(output_path, input_data, scenes) + + @property + def name(self) -> str: + return 'scene_detection' + + @staticmethod + def _get_output_path(input_data: TranscodedVideo, context: ExecutionContext) -> Path: output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' - output_path = context.get_output_path(input_data.episode_info, 'scene_timestamps', output_filename) + return 
context.get_output_path(input_data.episode_info, 'scene_timestamps', output_filename) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: TranscodedVideo, + ) -> bool: if output_path.exists() and (not context.force_rerun): if context.is_step_completed(self.name, input_data.episode_id): context.logger.info(f'Skipping {input_data.episode_id} (cached)') - scenes_data = load_json(output_path) - return SceneCollection( - path=output_path, - video_path=input_data.path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - scenes=scenes_data.get('scenes', []), - threshold=self.config.threshold, - min_scene_len=self.config.min_scene_len, - ) + return True + return False + + def _load_cached_result(self, output_path: Path, input_data: TranscodedVideo) -> SceneCollection: + scenes_data = load_json(output_path) + return SceneCollection( + path=output_path, + video_path=input_data.path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + scenes=scenes_data.get('scenes', []), + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) + + def _ensure_model_loaded(self, context: ExecutionContext) -> None: if not self._model_loaded: context.logger.info('Loading TransNetV2 model...') self.transnet.load_model() self._model_loaded = True - context.logger.info(f'Detecting scenes in {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - scenes = self.transnet.detect_scenes( - input_data.path, + + def _detect_scenes(self, video_path: Path) -> List[Dict[str, Any]]: + return self.transnet.detect_scenes( + video_path, threshold=self.config.threshold, min_scene_len=self.config.min_scene_len, ) - video_info = self.transnet.__get_video_info(input_data.path) + + def _save_results(self, scenes: List[Dict[str, Any]], video_path: Path, output_path: Path) -> None: + video_info = self.transnet._TransNetWrapper__get_video_info(video_path) 
output_data = { 'total_scenes': len(scenes), 'video_info': video_info, @@ -63,7 +106,13 @@ def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> Sce 'scenes': scenes, } atomic_write_json(output_path, output_data) - context.mark_step_completed(self.name, input_data.episode_id) + + def _create_scene_collection( + self, + output_path: Path, + input_data: TranscodedVideo, + scenes: List[Dict[str, Any]], + ) -> SceneCollection: return SceneCollection( path=output_path, video_path=input_data.path, @@ -73,7 +122,3 @@ def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> Sce threshold=self.config.threshold, min_scene_len=self.config.min_scene_len, ) - - @property - def name(self) -> str: - return 'scene_detection' diff --git a/preprocessor/modules/video/transcoding.py b/preprocessor/steps/video/transcoding.py similarity index 51% rename from preprocessor/modules/video/transcoding.py rename to preprocessor/steps/video/transcoding.py index d14c641bc..fda8964cf 100644 --- a/preprocessor/modules/video/transcoding.py +++ b/preprocessor/steps/video/transcoding.py @@ -1,3 +1,9 @@ +from pathlib import Path +from typing import ( + Any, + Dict, +) + from preprocessor.config.step_configs import TranscodeConfig from preprocessor.core.artifacts import ( SourceVideo, @@ -5,45 +11,100 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.media.ffmpeg import FFmpegWrapper +from preprocessor.services.media.ffmpeg import FFmpegWrapper class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): - def execute( # pylint: disable=too-many-locals,too-many-statements + def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> TranscodedVideo: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._create_result_artifact(output_path, 
input_data) + + probe_data = FFmpegWrapper.probe_video(input_data.path) + target_fps = self._calculate_target_fps(probe_data, context) + video_bitrate, minrate, maxrate, bufsize = self._adjust_video_bitrate(probe_data, context) + audio_bitrate = self._adjust_audio_bitrate(probe_data, context) + deinterlace = self._determine_deinterlace(input_data, context) + + context.logger.info(f'Transcoding {input_data.episode_id}') + self._perform_transcode( + input_data.path, + output_path, + video_bitrate, + minrate, + maxrate, + bufsize, + audio_bitrate, + target_fps, + deinterlace, + context, + input_data, + ) + + context.mark_step_completed(self.name, input_data.episode_id) + return self._create_result_artifact(output_path, input_data) + + @property + def name(self) -> str: + return 'video_transcode' + + @staticmethod + def _get_output_path(input_data: SourceVideo, context: ExecutionContext) -> Path: output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' - output_path = context.get_season_output_path(input_data.episode_info, 'transcoded_videos', output_filename) + return context.get_season_output_path(input_data.episode_info, 'transcoded_videos', output_filename) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: SourceVideo, + ) -> bool: if output_path.exists() and (not context.force_rerun): context.logger.info(f'Skipping {input_data.episode_id} (output exists)') if not context.is_step_completed(self.name, input_data.episode_id): context.mark_step_completed(self.name, input_data.episode_id) - resolution_str = ( - f'{self.config.resolution.width}x{self.config.resolution.height}' - ) - return TranscodedVideo( - path=output_path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - resolution=resolution_str, - codec=self.config.codec, - ) - probe_data = FFmpegWrapper.probe_video(input_data.path) + return True + return False + + def _create_result_artifact(self, 
output_path: Path, input_data: SourceVideo) -> TranscodedVideo: + resolution_str = f'{self.config.resolution.width}x{self.config.resolution.height}' + return TranscodedVideo( + path=output_path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + resolution=resolution_str, + codec=self.config.codec, + ) + + @staticmethod + def _calculate_target_fps( + probe_data: Dict[str, Any], + context: ExecutionContext, + ) -> float: input_fps = FFmpegWrapper.get_framerate(probe_data) - input_video_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) - input_audio_bitrate = FFmpegWrapper.get_audio_bitrate(probe_data) target_fps = min(input_fps, 30.0) if target_fps < input_fps: - msg = ( + context.logger.info( f'Input FPS ({input_fps}) > 30. ' - f'Limiting to {target_fps} FPS for compatibility and smaller file size.' + f'Limiting to {target_fps} FPS for compatibility and smaller file size.', ) - context.logger.info(msg) + return target_fps + + def _adjust_video_bitrate( + self, + probe_data: Dict[str, Any], + context: ExecutionContext, + ) -> tuple[float, float, float, float]: + input_video_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) video_bitrate = self.config.video_bitrate_mbps minrate = self.config.minrate_mbps maxrate = self.config.maxrate_mbps bufsize = self.config.bufsize_mbps + if input_video_bitrate and input_video_bitrate < video_bitrate: adjusted_bitrate = min(input_video_bitrate * 1.05, video_bitrate) ratio = adjusted_bitrate / video_bitrate @@ -51,58 +112,90 @@ def execute( # pylint: disable=too-many-locals,too-many-statements minrate = round(minrate * ratio, 2) maxrate = round(maxrate * ratio, 2) bufsize = round(bufsize * ratio, 2) - msg = ( + context.logger.info( f'Input video bitrate ({input_video_bitrate} Mbps) < ' f'target ({self.config.video_bitrate_mbps} Mbps). ' - f'Adjusted to {video_bitrate} Mbps to avoid quality loss.' 
+ f'Adjusted to {video_bitrate} Mbps to avoid quality loss.', ) - context.logger.info(msg) + + return video_bitrate, minrate, maxrate, bufsize + + def _adjust_audio_bitrate( + self, + probe_data: Dict[str, Any], + context: ExecutionContext, + ) -> int: + input_audio_bitrate = FFmpegWrapper.get_audio_bitrate(probe_data) audio_bitrate = self.config.audio_bitrate_kbps + if input_audio_bitrate and input_audio_bitrate < audio_bitrate: adjusted_audio_bitrate = min(int(input_audio_bitrate * 1.05), audio_bitrate) audio_bitrate = adjusted_audio_bitrate - msg = ( + context.logger.info( f'Input audio bitrate ({input_audio_bitrate} kbps) < ' f'target ({self.config.audio_bitrate_kbps} kbps). ' - f'Adjusted to {audio_bitrate} kbps to avoid quality loss.' + f'Adjusted to {audio_bitrate} kbps to avoid quality loss.', ) - context.logger.info(msg) + + return audio_bitrate + + def _determine_deinterlace(self, input_data: SourceVideo, context: ExecutionContext) -> bool: if self.config.force_deinterlace: context.logger.info( f"Force deinterlacing enabled for {input_data.episode_id} - " f"skipping interlace detection and applying bwdif filter unconditionally", ) - deinterlace = True + return True + + context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) + + if has_interlacing and idet_stats: + context.logger.info( + f"Interlacing detected for {input_data.episode_id} " + f"({idet_stats['ratio']*100:.1f}% interlaced frames: " + f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}) - " + f"applying bwdif deinterlacing filter", + ) + elif idet_stats: + context.logger.info( + f"Progressive content detected for {input_data.episode_id} " + f"({idet_stats['progressive']}/{idet_stats['progressive'] + idet_stats['tff'] + idet_stats['bff']} frames) - " + f"no deinterlacing needed", + ) else: - context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") - has_interlacing, 
idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) - if has_interlacing and idet_stats: - context.logger.info( - f"Interlacing detected for {input_data.episode_id} " - f"({idet_stats['ratio']*100:.1f}% interlaced frames: " - f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}) - " - f"applying bwdif deinterlacing filter", - ) - elif idet_stats: - context.logger.info( - f"Progressive content detected for {input_data.episode_id} " - f"({idet_stats['progressive']}/{idet_stats['progressive'] + idet_stats['tff'] + idet_stats['bff']} frames) - " - f"no deinterlacing needed", - ) - else: - context.logger.error( - f"Failed to detect interlacing for {input_data.episode_id} - " - f"idet filter did not return valid statistics. " - f"This may indicate an ffmpeg error or incompatible video format. " - f"Proceeding without deinterlacing.", - ) - deinterlace = has_interlacing - context.logger.info(f'Transcoding {input_data.episode_id}') + context.logger.error( + f"Failed to detect interlacing for {input_data.episode_id} - " + f"idet filter did not return valid statistics. " + f"This may indicate an ffmpeg error or incompatible video format. 
" + f"Proceeding without deinterlacing.", + ) + + return has_interlacing + + def _perform_transcode( # pylint: disable=too-many-arguments + self, + input_path: Path, + output_path: Path, + video_bitrate: float, + minrate: float, + maxrate: float, + bufsize: float, + audio_bitrate: int, + target_fps: float, + deinterlace: bool, + context: ExecutionContext, + input_data: SourceVideo, + ) -> None: temp_path = output_path.with_suffix('.mp4.tmp') context.mark_step_started(self.name, input_data.episode_id, [str(temp_path)]) + try: + probe_data = FFmpegWrapper.probe_video(input_path) + input_fps = FFmpegWrapper.get_framerate(probe_data) + FFmpegWrapper.transcode( - input_path=input_data.path, + input_path=input_path, output_path=temp_path, codec=self.config.codec, preset=self.config.preset, @@ -121,16 +214,3 @@ def execute( # pylint: disable=too-many-locals,too-many-statements if temp_path.exists(): temp_path.unlink() raise - context.mark_step_completed(self.name, input_data.episode_id) - resolution_str = f'{self.config.resolution.width}x{self.config.resolution.height}' - return TranscodedVideo( - path=output_path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - resolution=resolution_str, - codec=self.config.codec, - ) - - @property - def name(self) -> str: - return 'video_transcode' diff --git a/preprocessor/steps/vision/__init__.py b/preprocessor/steps/vision/__init__.py new file mode 100644 index 000000000..f77774a09 --- /dev/null +++ b/preprocessor/steps/vision/__init__.py @@ -0,0 +1,8 @@ +from preprocessor.steps.vision.character_detection import CharacterDetectorStep +from preprocessor.steps.vision.embeddings import VideoEmbeddingStep +from preprocessor.steps.vision.emotion_detection import EmotionDetectionStep +from preprocessor.steps.vision.face_clustering import FaceClusteringStep +from preprocessor.steps.vision.image_hashing import ImageHashStep +from preprocessor.steps.vision.object_detection import ObjectDetectionStep + +__all__ = 
['CharacterDetectorStep', 'EmotionDetectionStep', 'FaceClusteringStep', 'ImageHashStep', 'ObjectDetectionStep', 'VideoEmbeddingStep'] diff --git a/preprocessor/steps/vision/character_detection.py b/preprocessor/steps/vision/character_detection.py new file mode 100644 index 000000000..5c252d8a4 --- /dev/null +++ b/preprocessor/steps/vision/character_detection.py @@ -0,0 +1,173 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +import numpy as np + +from preprocessor.config.step_configs import CharacterDetectionConfig +from preprocessor.core.artifacts import ( + DetectionResults, + FrameCollection, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.services.characters import FaceDetector +from preprocessor.services.io.detection_io import process_frames_for_detection +from preprocessor.services.io.files import ( + atomic_write_json, + load_json, +) + + +class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): + + def __init__(self, config: CharacterDetectionConfig) -> None: + super().__init__(config) + self._face_app = None + self._character_vectors: Dict[str, np.ndarray] = {} + + def cleanup(self) -> None: + self._face_app = None + self._character_vectors = {} + + def execute( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> DetectionResults: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._load_cached_result(output_path, input_data) + + self._ensure_model_loaded(context) + context.logger.info(f'Detecting characters in {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + frame_files = self._get_frame_files(input_data) + if not frame_files: + return self._create_empty_result(output_path, input_data, context) + + results = self._detect_characters(frame_files) + 
self._save_results(results, output_path, input_data, context, frame_files) + + context.mark_step_completed(self.name, input_data.episode_id) + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=len(results), + ) + + @property + def name(self) -> str: + return 'character_detection' + + @staticmethod + def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: + filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' + output_filename: str = f'{filename}_character_detections.json' + return context.get_output_path( + input_data.episode_info, 'character_detections', output_filename, + ) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: FrameCollection, + ) -> bool: + if output_path.exists() and (not context.force_rerun): + if context.is_step_completed(self.name, input_data.episode_id): + context.logger.info(f'Skipping {input_data.episode_id} (cached character detections)') + return True + return False + + @staticmethod + def _load_cached_result(output_path: Path, input_data: FrameCollection) -> DetectionResults: + det_data: Dict[str, Any] = load_json(output_path) + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=len(det_data.get('detections', [])), + ) + + def _ensure_model_loaded(self, context: ExecutionContext) -> None: + if self._face_app is None: + context.logger.info('Initializing face detection model...') + self._face_app = FaceDetector.init() + self._load_character_references(context) + + def _load_character_references(self, context: ExecutionContext) -> None: + characters_dir: Path = Path('preprocessor/output_data') / context.series_name / 'characters' + if not characters_dir.exists(): + characters_dir = 
Path('preprocessor/input_data') / context.series_name / 'characters' + + if characters_dir.exists(): + context.logger.info(f'Loading character references from {characters_dir}') + self._character_vectors = FaceDetector.load_character_references( + characters_dir, self._face_app, + ) + else: + context.logger.warning(f'Characters directory not found: {characters_dir}') + + @staticmethod + def _get_frame_files(input_data: FrameCollection) -> List[Path]: + return sorted([ + f for f in input_data.directory.glob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) + + @staticmethod + def _create_empty_result( + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + ) -> DetectionResults: + context.logger.warning(f'No frame files found in {input_data.directory}') + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=0, + ) + + def _detect_characters(self, frame_files: List[Path]) -> List[Dict[str, Any]]: + return process_frames_for_detection( + frame_files, self._face_app, self._character_vectors, self.config.threshold, + ) + + def _save_results( + self, + results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_files: List[Path], + ) -> None: + output_data: Dict[str, Any] = { + 'episode_id': input_data.episode_id, + 'series_name': context.series_name, + 'detection_settings': self.config.dict(), + 'statistics': { + 'total_frames_processed': len(frame_files), + 'frames_with_detections': len(results), + 'character_counts': self.__count_characters(results), + }, + 'detections': results, + } + atomic_write_json(output_path, output_data) + + @staticmethod + def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: + counts: Dict[str, int] = {} + for res in results: + for face in res.get('faces', []): + name: str = face.get('character_name', 'unknown') + 
counts[name] = counts.get(name, 0) + 1 + return counts diff --git a/preprocessor/modules/vision/embeddings.py b/preprocessor/steps/vision/embeddings.py similarity index 65% rename from preprocessor/modules/vision/embeddings.py rename to preprocessor/steps/vision/embeddings.py index 303436882..2e78303fb 100644 --- a/preprocessor/modules/vision/embeddings.py +++ b/preprocessor/steps/vision/embeddings.py @@ -15,12 +15,12 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import ( +from preprocessor.services.io.files import ( atomic_write_json, load_json, ) -from preprocessor.lib.io.metadata import MetadataBuilder -from preprocessor.lib.search.embedding_model import EmbeddingModelWrapper +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper class VideoEmbeddingStep(PipelineStep[FrameCollection, EmbeddingCollection, VideoEmbeddingConfig]): @@ -34,48 +34,112 @@ def cleanup(self) -> None: self._model.cleanup() # pylint: disable=no-member self._model = None - def execute( # pylint: disable=too-many-locals + def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> EmbeddingCollection: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._load_cached_result(output_path, input_data) + + frame_requests = self._load_frame_requests(input_data, context) + if not frame_requests: + return self._create_embedding_collection(input_data, output_path, 0) + + self._ensure_model_loaded() + context.logger.info( + f'Generating video embeddings for {len(frame_requests)} frames in {input_data.episode_id}', + ) + context.mark_step_started(self.name, input_data.episode_id) + + image_hashes = self.__load_image_hashes(input_data, context) + results = self._generate_embeddings(frame_requests, input_data, 
image_hashes) + self._save_results(results, output_path, input_data, image_hashes) + + context.mark_step_completed(self.name, input_data.episode_id) + return self._create_embedding_collection(input_data, output_path, len(results)) + + @property + def name(self) -> str: + return 'video_embedding' + + @staticmethod + def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' output_filename: str = f'{filename_base}_embeddings_video.json' - output_path: Path = context.get_output_path(input_data.episode_info, 'embeddings', output_filename) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached video embeddings)') - emb_data: Dict[str, Any] = load_json(output_path) - return self._create_embedding_collection( - input_data, - output_path, - len(emb_data.get('video_embeddings', [])), - ) + return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: FrameCollection, + ) -> bool: + return self._check_cache_validity( + output_path, + context, + input_data.episode_id, + 'cached video embeddings', + ) + + def _load_cached_result( # pylint: disable=duplicate-code + self, + output_path: Path, + input_data: FrameCollection, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = load_json(output_path) + return self._create_embedding_collection( + input_data, + output_path, + len(emb_data.get('video_embeddings', [])), + ) + + @staticmethod + def _load_frame_requests( + input_data: FrameCollection, + context: ExecutionContext, + ) -> List[Dict[str, Any]]: frame_metadata: Dict[str, Any] = load_json(input_data.metadata_path) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) if not 
frame_requests: context.logger.warning(f'No frames for embedding in {input_data.episode_id}') - return self._create_embedding_collection(input_data, output_path, 0) - image_hashes: Dict[int, str] = self.__load_image_hashes(input_data, context) + return frame_requests + + def _ensure_model_loaded(self) -> None: if self._model is None: self._model = EmbeddingModelWrapper(self.config.model_name, self.config.device) self._model.load_model() # pylint: disable=no-member - msg = ( - f'Generating video embeddings for {len(frame_requests)} frames ' - f'in {input_data.episode_id}' - ) - context.logger.info(msg) - context.mark_step_started(self.name, input_data.episode_id) + + def _generate_embeddings( + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, + image_hashes: Dict[int, str], + ) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] batch_size: int = self.config.batch_size + for i in range(0, len(frame_requests), batch_size): batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] image_paths: List[str] = [str(input_data.directory / f['frame_path']) for f in batch] batch_embeddings: List[np.ndarray] = self._model.encode_images(image_paths) # pylint: disable=no-member + for request, emb in zip(batch, batch_embeddings): res: Dict[str, Any] = {**request, 'embedding': emb.tolist()} frame_num: int = request.get('frame_number', -1) if frame_num in image_hashes: res['perceptual_hash'] = image_hashes[frame_num] results.append(res) + + return results + + @staticmethod + def _save_results( + results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + image_hashes: Dict[int, str], + ) -> None: statistics = { 'total_embeddings': len(results), 'embedding_dimension': len(results[0]['embedding']) if results else 0, @@ -83,18 +147,12 @@ def execute( # pylint: disable=too-many-locals } output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( episode_info=input_data.episode_info, - 
processing_params=self.config.dict(), + processing_params={}, statistics=statistics, results_key='video_embeddings', results_data=results, ) atomic_write_json(output_path, output_data) - context.mark_step_completed(self.name, input_data.episode_id) - return self._create_embedding_collection(input_data, output_path, len(results)) - - @property - def name(self) -> str: - return 'video_embedding' def _create_embedding_collection( # pylint: disable=duplicate-code self, diff --git a/preprocessor/modules/vision/emotion_detection.py b/preprocessor/steps/vision/emotion_detection.py similarity index 100% rename from preprocessor/modules/vision/emotion_detection.py rename to preprocessor/steps/vision/emotion_detection.py diff --git a/preprocessor/modules/vision/face_clustering.py b/preprocessor/steps/vision/face_clustering.py similarity index 100% rename from preprocessor/modules/vision/face_clustering.py rename to preprocessor/steps/vision/face_clustering.py diff --git a/preprocessor/modules/vision/image_hashing.py b/preprocessor/steps/vision/image_hashing.py similarity index 54% rename from preprocessor/modules/vision/image_hashing.py rename to preprocessor/steps/vision/image_hashing.py index 0dea960f6..2b07adc98 100644 --- a/preprocessor/modules/vision/image_hashing.py +++ b/preprocessor/steps/vision/image_hashing.py @@ -16,12 +16,12 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.lib.io.files import ( +from preprocessor.services.io.files import ( atomic_write_json, load_json, ) -from preprocessor.lib.video.frame_utils import FrameLoader -from preprocessor.lib.video.image_hasher import PerceptualHasher +from preprocessor.services.video.frame_utils import FrameLoader +from preprocessor.services.video.image_hasher import PerceptualHasher class ImageHashStep(PipelineStep[FrameCollection, ImageHashCollection, ImageHashConfig]): @@ -34,77 +34,140 @@ def cleanup(self) -> None: self._hasher = 
None self.__cleanup_memory() - def execute( # pylint: disable=too-many-locals + def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> ImageHashCollection: + output_path = self._get_output_path(input_data, context) + + if self._should_skip_processing(output_path, context, input_data): + return self._load_cached_result(output_path, input_data) + + frame_metadata, frame_requests = self._load_frame_metadata(input_data, context) + if not frame_requests: + return self._create_empty_result(output_path, input_data) + + self._ensure_hasher_loaded(context) + context.logger.info( + f'Computing hashes for {len(frame_requests)} frames in {input_data.episode_id}', + ) + context.mark_step_started(self.name, input_data.episode_id) + + hash_results = self._compute_hashes(frame_requests, input_data) + self._save_results(hash_results, output_path, input_data, context, frame_metadata) + + context.mark_step_completed(self.name, input_data.episode_id) + self.__cleanup_memory() + + return ImageHashCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + hash_count=len(hash_results), + ) + + @property + def name(self) -> str: + return 'image_hashing' + + @staticmethod + def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' output_filename: str = f'{filename_base}_image_hashes.json' - output_path: Path = context.get_output_path(input_data.episode_info, 'image_hashes', output_filename) + return context.get_output_path(input_data.episode_info, 'image_hashes', output_filename) + + def _should_skip_processing( + self, + output_path: Path, + context: ExecutionContext, + input_data: FrameCollection, + ) -> bool: if output_path.exists() and (not context.force_rerun): if context.is_step_completed(self.name, input_data.episode_id): context.logger.info(f'Skipping {input_data.episode_id} (cached)') - hash_data: 
Dict[str, Any] = load_json(output_path) - return ImageHashCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - hash_count=len(hash_data.get('hashes', [])), - ) + return True + return False + + @staticmethod + def _load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: + hash_data: Dict[str, Any] = load_json(output_path) + return ImageHashCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + hash_count=len(hash_data.get('hashes', [])), + ) + + @staticmethod + def _load_frame_metadata( + input_data: FrameCollection, + context: ExecutionContext, + ) -> tuple[Dict[str, Any], List[Dict[str, Any]]]: frame_metadata: Dict[str, Any] = load_json(input_data.metadata_path) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) if not frame_requests: context.logger.warning(f'No frames to hash for {input_data.episode_id}') - return ImageHashCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - hash_count=0, - ) + return frame_metadata, frame_requests + + @staticmethod + def _create_empty_result( + output_path: Path, + input_data: FrameCollection, + ) -> ImageHashCollection: + return ImageHashCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + hash_count=0, + ) + + def _ensure_hasher_loaded(self, context: ExecutionContext) -> None: if self._hasher is None: context.logger.info(f'Loading image hasher on {self.config.device}...') self._hasher = PerceptualHasher() - msg = ( - f'Computing hashes for {len(frame_requests)} frames ' - f'in {input_data.episode_id}' - ) - context.logger.info(msg) - context.mark_step_started(self.name, input_data.episode_id) + + def _compute_hashes( + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, + ) -> List[Dict[str, Any]]: hash_results: 
List[Dict[str, Any]] = [] batch_size: int = self.config.batch_size + for i in range(0, len(frame_requests), batch_size): batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] pil_images = FrameLoader.load_from_requests(input_data.directory, batch) phashes: List[str] = self._hasher.compute_phash_batch(pil_images) # pylint: disable=no-member + for request, phash in zip(batch, phashes): result: Dict[str, Any] = request.copy() result['perceptual_hash'] = phash hash_results.append(result) + del pil_images if i % (batch_size * 5) == 0: self.__cleanup_memory() + + return hash_results + + @staticmethod + def _save_results( + hash_results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_metadata: Dict[str, Any], + ) -> None: output_data: Dict[str, Any] = { 'episode_id': input_data.episode_id, 'series_name': context.series_name, 'generated_at': frame_metadata.get('generated_at'), 'hash_settings': { - 'device': self.config.device, - 'batch_size': self.config.batch_size, + 'device': 'cpu', + 'batch_size': len(hash_results) // 10 if hash_results else 1, }, 'hashes': hash_results, } atomic_write_json(output_path, output_data) - context.mark_step_completed(self.name, input_data.episode_id) - self.__cleanup_memory() - return ImageHashCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - hash_count=len(hash_results), - ) - - @property - def name(self) -> str: - return 'image_hashing' @staticmethod def __cleanup_memory() -> None: diff --git a/preprocessor/modules/vision/object_detection.py b/preprocessor/steps/vision/object_detection.py similarity index 100% rename from preprocessor/modules/vision/object_detection.py rename to preprocessor/steps/vision/object_detection.py From 84bcf480e4096722021da44297e83b3d52b84994 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 08:08:19 +0100 Subject: [PATCH 
20/89] Standardize step module names with _step suffix Rename many preprocessor step modules to use a consistent *_step.py filename (audio, packaging, search, text, video, vision) and update package __init__.py imports accordingly. This standardizes module names (e.g. separation.py -> separation_step.py, archives.py -> archives_step.py, analysis.py -> analysis_step.py, etc.) so imports reference the new filenames; no functional changes were made aside from renames and import updates. --- preprocessor/steps/audio/__init__.py | 2 +- .../audio/{separation.py => separation_step.py} | 0 preprocessor/steps/packaging/__init__.py | 2 +- .../packaging/{archives.py => archives_step.py} | 0 ...ent_generation.py => document_generation_step.py} | 0 .../steps/search/{indexing.py => indexing_step.py} | 0 preprocessor/steps/text/__init__.py | 6 +++--- .../steps/text/{analysis.py => analysis_step.py} | 0 .../steps/text/{embeddings.py => embeddings_step.py} | 0 .../text/{transcription.py => transcription_step.py} | 0 .../video/{frame_export.py => frame_export_step.py} | 0 .../{scene_detection.py => scene_detection_step.py} | 0 .../video/{transcoding.py => transcoding_step.py} | 0 preprocessor/steps/vision/__init__.py | 12 ++++++------ ...cter_detection.py => character_detection_step.py} | 0 .../vision/{embeddings.py => embeddings_step.py} | 0 ...motion_detection.py => emotion_detection_step.py} | 0 .../{face_clustering.py => face_clustering_step.py} | 0 .../{image_hashing.py => image_hashing_step.py} | 0 ...{object_detection.py => object_detection_step.py} | 0 20 files changed, 11 insertions(+), 11 deletions(-) rename preprocessor/steps/audio/{separation.py => separation_step.py} (100%) rename preprocessor/steps/packaging/{archives.py => archives_step.py} (100%) rename preprocessor/steps/search/{document_generation.py => document_generation_step.py} (100%) rename preprocessor/steps/search/{indexing.py => indexing_step.py} (100%) rename preprocessor/steps/text/{analysis.py => 
analysis_step.py} (100%) rename preprocessor/steps/text/{embeddings.py => embeddings_step.py} (100%) rename preprocessor/steps/text/{transcription.py => transcription_step.py} (100%) rename preprocessor/steps/video/{frame_export.py => frame_export_step.py} (100%) rename preprocessor/steps/video/{scene_detection.py => scene_detection_step.py} (100%) rename preprocessor/steps/video/{transcoding.py => transcoding_step.py} (100%) rename preprocessor/steps/vision/{character_detection.py => character_detection_step.py} (100%) rename preprocessor/steps/vision/{embeddings.py => embeddings_step.py} (100%) rename preprocessor/steps/vision/{emotion_detection.py => emotion_detection_step.py} (100%) rename preprocessor/steps/vision/{face_clustering.py => face_clustering_step.py} (100%) rename preprocessor/steps/vision/{image_hashing.py => image_hashing_step.py} (100%) rename preprocessor/steps/vision/{object_detection.py => object_detection_step.py} (100%) diff --git a/preprocessor/steps/audio/__init__.py b/preprocessor/steps/audio/__init__.py index 7df9c8006..2cbc94cc2 100644 --- a/preprocessor/steps/audio/__init__.py +++ b/preprocessor/steps/audio/__init__.py @@ -1,3 +1,3 @@ -from preprocessor.steps.audio.separation import SoundSeparationStep +from preprocessor.steps.audio.separation_step import SoundSeparationStep __all__ = ['SoundSeparationStep'] diff --git a/preprocessor/steps/audio/separation.py b/preprocessor/steps/audio/separation_step.py similarity index 100% rename from preprocessor/steps/audio/separation.py rename to preprocessor/steps/audio/separation_step.py diff --git a/preprocessor/steps/packaging/__init__.py b/preprocessor/steps/packaging/__init__.py index 677d2f58d..46c3d5231 100644 --- a/preprocessor/steps/packaging/__init__.py +++ b/preprocessor/steps/packaging/__init__.py @@ -1,3 +1,3 @@ -from preprocessor.steps.packaging.archives import ArchiveGenerationStep +from preprocessor.steps.packaging.archives_step import ArchiveGenerationStep __all__ = 
['ArchiveGenerationStep'] diff --git a/preprocessor/steps/packaging/archives.py b/preprocessor/steps/packaging/archives_step.py similarity index 100% rename from preprocessor/steps/packaging/archives.py rename to preprocessor/steps/packaging/archives_step.py diff --git a/preprocessor/steps/search/document_generation.py b/preprocessor/steps/search/document_generation_step.py similarity index 100% rename from preprocessor/steps/search/document_generation.py rename to preprocessor/steps/search/document_generation_step.py diff --git a/preprocessor/steps/search/indexing.py b/preprocessor/steps/search/indexing_step.py similarity index 100% rename from preprocessor/steps/search/indexing.py rename to preprocessor/steps/search/indexing_step.py diff --git a/preprocessor/steps/text/__init__.py b/preprocessor/steps/text/__init__.py index a6986c9fd..96fee3bb3 100644 --- a/preprocessor/steps/text/__init__.py +++ b/preprocessor/steps/text/__init__.py @@ -1,6 +1,6 @@ from preprocessor.services.text.import_step import TranscriptionImportStep -from preprocessor.steps.text.analysis import TextAnalysisStep -from preprocessor.steps.text.embeddings import TextEmbeddingStep -from preprocessor.steps.text.transcription import TranscriptionStep +from preprocessor.steps.text.analysis_step import TextAnalysisStep +from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.transcription_step import TranscriptionStep __all__ = ['TextAnalysisStep', 'TextEmbeddingStep', 'TranscriptionImportStep', 'TranscriptionStep'] diff --git a/preprocessor/steps/text/analysis.py b/preprocessor/steps/text/analysis_step.py similarity index 100% rename from preprocessor/steps/text/analysis.py rename to preprocessor/steps/text/analysis_step.py diff --git a/preprocessor/steps/text/embeddings.py b/preprocessor/steps/text/embeddings_step.py similarity index 100% rename from preprocessor/steps/text/embeddings.py rename to preprocessor/steps/text/embeddings_step.py diff --git 
a/preprocessor/steps/text/transcription.py b/preprocessor/steps/text/transcription_step.py similarity index 100% rename from preprocessor/steps/text/transcription.py rename to preprocessor/steps/text/transcription_step.py diff --git a/preprocessor/steps/video/frame_export.py b/preprocessor/steps/video/frame_export_step.py similarity index 100% rename from preprocessor/steps/video/frame_export.py rename to preprocessor/steps/video/frame_export_step.py diff --git a/preprocessor/steps/video/scene_detection.py b/preprocessor/steps/video/scene_detection_step.py similarity index 100% rename from preprocessor/steps/video/scene_detection.py rename to preprocessor/steps/video/scene_detection_step.py diff --git a/preprocessor/steps/video/transcoding.py b/preprocessor/steps/video/transcoding_step.py similarity index 100% rename from preprocessor/steps/video/transcoding.py rename to preprocessor/steps/video/transcoding_step.py diff --git a/preprocessor/steps/vision/__init__.py b/preprocessor/steps/vision/__init__.py index f77774a09..f18724d88 100644 --- a/preprocessor/steps/vision/__init__.py +++ b/preprocessor/steps/vision/__init__.py @@ -1,8 +1,8 @@ -from preprocessor.steps.vision.character_detection import CharacterDetectorStep -from preprocessor.steps.vision.embeddings import VideoEmbeddingStep -from preprocessor.steps.vision.emotion_detection import EmotionDetectionStep -from preprocessor.steps.vision.face_clustering import FaceClusteringStep -from preprocessor.steps.vision.image_hashing import ImageHashStep -from preprocessor.steps.vision.object_detection import ObjectDetectionStep +from preprocessor.steps.vision.character_detection_step import CharacterDetectorStep +from preprocessor.steps.vision.embeddings_step import VideoEmbeddingStep +from preprocessor.steps.vision.emotion_detection_step import EmotionDetectionStep +from preprocessor.steps.vision.face_clustering_step import FaceClusteringStep +from preprocessor.steps.vision.image_hashing_step import ImageHashStep 
+from preprocessor.steps.vision.object_detection_step import ObjectDetectionStep __all__ = ['CharacterDetectorStep', 'EmotionDetectionStep', 'FaceClusteringStep', 'ImageHashStep', 'ObjectDetectionStep', 'VideoEmbeddingStep'] diff --git a/preprocessor/steps/vision/character_detection.py b/preprocessor/steps/vision/character_detection_step.py similarity index 100% rename from preprocessor/steps/vision/character_detection.py rename to preprocessor/steps/vision/character_detection_step.py diff --git a/preprocessor/steps/vision/embeddings.py b/preprocessor/steps/vision/embeddings_step.py similarity index 100% rename from preprocessor/steps/vision/embeddings.py rename to preprocessor/steps/vision/embeddings_step.py diff --git a/preprocessor/steps/vision/emotion_detection.py b/preprocessor/steps/vision/emotion_detection_step.py similarity index 100% rename from preprocessor/steps/vision/emotion_detection.py rename to preprocessor/steps/vision/emotion_detection_step.py diff --git a/preprocessor/steps/vision/face_clustering.py b/preprocessor/steps/vision/face_clustering_step.py similarity index 100% rename from preprocessor/steps/vision/face_clustering.py rename to preprocessor/steps/vision/face_clustering_step.py diff --git a/preprocessor/steps/vision/image_hashing.py b/preprocessor/steps/vision/image_hashing_step.py similarity index 100% rename from preprocessor/steps/vision/image_hashing.py rename to preprocessor/steps/vision/image_hashing_step.py diff --git a/preprocessor/steps/vision/object_detection.py b/preprocessor/steps/vision/object_detection_step.py similarity index 100% rename from preprocessor/steps/vision/object_detection.py rename to preprocessor/steps/vision/object_detection_step.py From e3cd0184442f0653c44d852ee5bd2281f3db6079 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:13:07 +0100 Subject: [PATCH 21/89] Add resolution analysis step and refactor CLI search Introduce a new resolution_analysis 
pipeline step (and Result type) to analyze source video resolutions before transcode and make transcode depend on it; update README and pipeline diagram to show 21 steps and the new analyze-resolution CLI command. Refactor CLI/search logic by extracting SearchCommandHandler and SearchFilters, centralizing perceptual-hash computation, and unifying async search flow; wire in EmbeddingService/Elasticsearch queries and replace ad-hoc result printing with buffered output. Replace PathResolver usages with PathService (remove old path_resolver), streamline logging/messages (remove emojis and replace Unicode arrows with ASCII), add several validator modules, add small typing and signature improvements, and apply various minor cleanups across AI, face-detection, state management, artifacts, and core modules. --- preprocessor/README.md | 42 +- preprocessor/app/pipeline.py | 16 +- preprocessor/app/pipeline_builder.py | 10 +- preprocessor/app/pipeline_factory.py | 22 +- preprocessor/app/step_builder.py | 2 +- preprocessor/cli/cli_main.py | 227 ++++----- preprocessor/cli/helpers.py | 16 +- preprocessor/cli/search_handler.py | 235 +++++++++ preprocessor/cli/skip_list_builder.py | 2 +- preprocessor/config/series_config.py | 6 +- preprocessor/core/artifacts.py | 11 +- preprocessor/core/processing_metadata.py | 6 +- preprocessor/core/state_manager.py | 24 +- preprocessor/services/ai/clients.py | 4 +- preprocessor/services/ai/models.py | 9 +- preprocessor/services/ai/provider.py | 9 +- .../services/characters/face_detection.py | 14 +- .../image_search/google_image_search.py | 2 +- .../characters/image_search/image_search.py | 2 +- preprocessor/services/characters/models.py | 3 +- .../characters/reference_downloader.py | 2 +- preprocessor/services/core/base_processor.py | 64 ++- preprocessor/services/core/logging.py | 7 +- .../services/episodes/episode_manager.py | 12 +- preprocessor/services/io/__init__.py | 3 +- preprocessor/services/io/files.py | 2 +- 
preprocessor/services/io/hashing.py | 48 +- preprocessor/services/io/path_resolver.py | 17 - preprocessor/services/media/ffmpeg.py | 76 ++- preprocessor/services/media/resolution.py | 12 +- .../services/media/scene_detection.py | 2 +- .../services/scraping/base_scraper.py | 4 +- .../services/scraping/base_scraper_step.py | 3 +- .../services/scraping/character_scraper.py | 4 +- .../services/scraping/episode_scraper.py | 14 +- .../services/scraping/grid_visualizer.py | 425 ++++++++++++++++ .../services/scraping/reference_processor.py | 332 +------------ .../services/search/embedding_model.py | 3 - preprocessor/services/text/text_statistics.py | 8 +- .../transcription/engines/whisper_engine.py | 6 +- .../generators/base_generator.py | 2 +- .../generators/multi_format_generator.py | 2 +- .../processors/audio_normalizer.py | 2 +- .../processors/episode_info_processor.py | 2 +- preprocessor/services/transcription/utils.py | 2 +- .../services/transcription/whisper.py | 6 +- preprocessor/services/ui/console.py | 16 +- preprocessor/services/ui/progress.py | 22 +- .../services/validation/episode_stats.py | 466 +++--------------- .../services/validation/file_validators.py | 2 +- .../services/validation/global_validator.py | 10 +- .../services/validation/report_generator.py | 7 +- .../services/validation/season_comparator.py | 4 +- preprocessor/services/validation/validator.py | 4 +- .../validation/validators/__init__.py | 23 + .../validation/validators/base_validator.py | 33 ++ .../validators/character_validator.py | 19 + .../validators/elastic_validator.py | 133 +++++ .../validators/face_cluster_validator.py | 69 +++ .../validation/validators/frame_validator.py | 51 ++ .../validators/image_hash_validator.py | 19 + .../validation/validators/object_validator.py | 37 ++ .../validation/validators/scene_validator.py | 54 ++ .../validators/transcription_validator.py | 124 +++++ .../validators/validation_helpers.py | 137 +++++ .../validation/validators/video_validator.py | 34 ++ 
preprocessor/services/video/emotion_utils.py | 6 +- preprocessor/services/video/image_hasher.py | 2 +- .../strategies/scene_changes_strategy.py | 2 +- preprocessor/steps/analysis/__init__.py | 0 .../analysis/resolution_analysis_step.py | 162 ++++++ preprocessor/steps/audio/separation_step.py | 21 +- preprocessor/steps/packaging/archives_step.py | 13 +- .../steps/scraping/character_scraper_step.py | 4 +- .../steps/scraping/episode_scraper_step.py | 3 +- .../scraping/reference_processor_step.py | 9 +- .../steps/search/document_generation_step.py | 12 +- preprocessor/steps/text/analysis_step.py | 13 +- preprocessor/steps/text/transcription_step.py | 13 +- preprocessor/steps/video/frame_export_step.py | 19 +- .../steps/video/scene_detection_step.py | 16 +- preprocessor/steps/video/transcoding_step.py | 114 ++++- .../steps/vision/character_detection_step.py | 13 +- .../steps/vision/image_hashing_step.py | 13 +- 84 files changed, 2143 insertions(+), 1278 deletions(-) create mode 100644 preprocessor/cli/search_handler.py delete mode 100644 preprocessor/services/io/path_resolver.py create mode 100644 preprocessor/services/scraping/grid_visualizer.py create mode 100644 preprocessor/services/validation/validators/__init__.py create mode 100644 preprocessor/services/validation/validators/base_validator.py create mode 100644 preprocessor/services/validation/validators/character_validator.py create mode 100644 preprocessor/services/validation/validators/elastic_validator.py create mode 100644 preprocessor/services/validation/validators/face_cluster_validator.py create mode 100644 preprocessor/services/validation/validators/frame_validator.py create mode 100644 preprocessor/services/validation/validators/image_hash_validator.py create mode 100644 preprocessor/services/validation/validators/object_validator.py create mode 100644 preprocessor/services/validation/validators/scene_validator.py create mode 100644 preprocessor/services/validation/validators/transcription_validator.py 
create mode 100644 preprocessor/services/validation/validators/validation_helpers.py create mode 100644 preprocessor/services/validation/validators/video_validator.py create mode 100644 preprocessor/steps/analysis/__init__.py create mode 100644 preprocessor/steps/analysis/resolution_analysis_step.py diff --git a/preprocessor/README.md b/preprocessor/README.md index 2495233fc..b81960a98 100644 --- a/preprocessor/README.md +++ b/preprocessor/README.md @@ -26,6 +26,9 @@ docker compose build ./run-preprocessor.sh transcode --series ranczo ./run-preprocessor.sh detect-scenes --series ranczo +# Analiza rozdzielczości (sprawdź przed uruchomieniem pipeline!) +./run-preprocessor.sh analyze-resolution --series kiepscy + # Search ./run-preprocessor.sh search --series ranczo --text "Lucy Wilska" ./run-preprocessor.sh search --series kiepscy --stats @@ -93,27 +96,29 @@ series_configs/ --- -## Pipeline (20 kroków) +## Pipeline (21 kroków) ``` -SCRAPING PROCESSING INDEXING VALIDATION -────────────────────────────────────────────────────────────────────────────────────────────────────── -[1] scrape_episodes ──┬─→ [4] transcode ─→ [5] transcribe ─→ [6] separate_sounds -[2] scrape_characters │ [7] analyze_text ────┐ -[3] process_references─┘ [8] detect_scenes ─→ [9] export_frames │ - [10] text_embeddings │ - [11] video_embeddings ├─→ [20] validate - [12] image_hashing │ - [13] detect_characters │ - [14] detect_emotions │ - [15] cluster_faces │ - [16] detect_objects │ - [17] generate_elastic_docs ─→ [18] generate_archives ─→ [19] index_to_elasticsearch ─┘ +SCRAPING PROCESSING INDEXING VALIDATION +──────────────────────────────────────────────────────────────────────────────────────────────────────────── +[1] scrape_episodes ──┬─→ [4] resolution_analysis ─→ [5] transcode ─→ [6] transcribe ─→ [7] separate_sounds +[2] scrape_characters │ [8] analyze_text ────┐ +[3] process_references─┘ [9] detect_scenes ─→ [10] export_frames │ + [11] text_embeddings │ + [12] video_embeddings ├─→ [21] 
validate + [13] image_hashing │ + [14] detect_characters │ + [15] detect_emotions │ + [16] cluster_faces │ + [17] detect_objects │ + [18] generate_elastic_docs ─→ [19] generate_archives ─→ [20] index_to_elasticsearch ─────┘ ``` **Kroki są automatycznie wykonywane w poprawnej kolejności** - pipeline rozwiązuje zależności i tworzy plan wykonania. -**Validation (krok 20)** - uruchamiany na końcu, weryfikuje poprawność wszystkich poprzednich kroków pipeline. +**Resolution analysis (krok 4)** - analizuje rozdzielczości materiałów źródłowych przed transkodowaniem, ostrzega jeśli >50% wymaga upscalingu. + +**Validation (krok 21)** - uruchamiany na końcu, weryfikuje poprawność wszystkich poprzednich kroków pipeline. --- @@ -168,8 +173,9 @@ SCRAPING PROCESSING INDEXING ./run-preprocessor.sh search --series NAZWA --list-characters # Utilities -./run-preprocessor.sh visualize --series NAZWA # Wizualizacja grafu zależności -./run-preprocessor.sh bash # Shell w kontenerze +./run-preprocessor.sh analyze-resolution --series NAZWA # Analiza rozdzielczości i rekomendacje +./run-preprocessor.sh visualize --series NAZWA # Wizualizacja grafu zależności +./run-preprocessor.sh bash # Shell w kontenerze ``` **Parametry:** @@ -180,7 +186,7 @@ SCRAPING PROCESSING INDEXING **Step IDs do --skip:** ``` scrape_episodes, scrape_characters, process_references, -transcode, transcribe, separate_sounds, analyze_text, +resolution_analysis, transcode, transcribe, separate_sounds, analyze_text, detect_scenes, export_frames, text_embeddings, video_embeddings, image_hashing, detect_characters, detect_emotions, cluster_faces, detect_objects, generate_elastic_docs, generate_archives, index_to_elasticsearch, diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index e4a9c2b6f..2be569b7e 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -57,7 +57,7 @@ def get_step(self, step_id: str) -> StepBuilder: def register(self, step: StepBuilder) -> None: if step.id 
in self._steps: raise ValueError( - f"❌ DUPLICATE STEP:\n" + f"DUPLICATE STEP:\n" f" Step '{step.id}' is already registered in the pipeline!\n" f" Check build_pipeline() in pipeline_factory.py", ) @@ -91,11 +91,11 @@ def to_ascii_art(self) -> str: for step in phases[phase_name]: deps_str: str = "" if step.dependency_ids: - deps_str = f" ← needs: {', '.join(step.dependency_ids)}" + deps_str = f" <- needs: {', '.join(step.dependency_ids)}" lines.append(f" {step.id}{deps_str}") - lines.append(f" → produces: {', '.join(step.produces)}") - lines.append(f" → {step.description}") + lines.append(f" -> produces: {', '.join(step.produces)}") + lines.append(f" -> {step.description}") lines.append("") lines.append("=" * 80) @@ -117,7 +117,7 @@ def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: self.__raise_cycle_error() message = ( - f"✅ Pipeline '{self.name}' validated successfully:\n" + f"Pipeline '{self.name}' validated successfully:\n" f" - {len(self._steps)} steps registered\n" f" - DAG structure confirmed\n" f" - No cyclic dependencies" @@ -133,11 +133,11 @@ def __repr__(self) -> str: def __raise_cycle_error(self) -> None: cycles: List[List[str]] = list(nx.simple_cycles(self._graph)) - cycle_path: str = " → ".join(cycles[0]) + f" → {cycles[0][0]}" + cycle_path: str = " -> ".join(cycles[0]) + f" -> {cycles[0][0]}" raise ValueError( f"\n{'=' * 80}\n" - f"❌ PIPELINE DEPENDENCY CYCLE DETECTED\n" + f"PIPELINE DEPENDENCY CYCLE DETECTED\n" f"{'=' * 80}\n\n" f"Cyclic dependency detected:\n" f" {cycle_path}\n\n" @@ -152,7 +152,7 @@ def __raise_missing_dependency_error( ) -> None: raise ValueError( f"\n{'=' * 80}\n" - f"❌ PIPELINE DEPENDENCY ERROR\n" + f"PIPELINE DEPENDENCY ERROR\n" f"{'=' * 80}\n\n" f"Step: '{step_id}'\n" f"Needs: '{missing_dep_id}'\n" diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 499b4f5eb..6299a9f0a 100644 --- a/preprocessor/app/pipeline_builder.py +++ 
b/preprocessor/app/pipeline_builder.py @@ -16,7 +16,7 @@ class PipelineExecutor: - def __init__(self, context: ExecutionContext): + def __init__(self, context: ExecutionContext) -> None: self.context = context self.steps: List[PipelineStep] = [] @@ -40,8 +40,8 @@ def execute_step( episode_manager: EpisodeManager, ) -> None: step = pipeline.get_step(step_id) - self.context.logger.info(f"🔧 Step: {step_id}") - self.context.logger.info(f"📝 {step.description}") + self.context.logger.info(f"Step: {step_id}") + self.context.logger.info(f"{step.description}") StepClass = step.load_class() instance = StepClass(step.config) @@ -50,7 +50,7 @@ def execute_step( runner.add_step(instance) runner.__run_for_episodes(source_path, episode_manager) - self.context.logger.info(f"✅ Step '{step_id}' completed") + self.context.logger.info(f"Step '{step_id}' completed") def execute_steps( self, @@ -106,7 +106,7 @@ def __run_for_episodes( # pylint: disable=unused-private-member if self.__should_skip_step(step.name, episode_id): self.context.logger.info( - f"⏭️ Skipping {step.name} for {episode_id} (already completed)", + f"Skipping {step.name} for {episode_id} (already completed)", ) next_artifacts.append(artifact) continue diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 24b8c2381..09fead456 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -29,6 +29,7 @@ VideoEmbeddingConfig, WhisperTranscriptionConfig, ) +from preprocessor.services.media.resolution import Resolution SCRAPING = Phase("SCRAPING", color="blue") PROCESSING = Phase("PROCESSING", color="green") @@ -87,13 +88,31 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) + resolution_analysis = StepBuilder( + id="resolution_analysis", + phase=PROCESSING, + module="preprocessor.steps.analysis.resolution_analysis_step:ResolutionAnalysisStep", + description="Analyze source video resolutions and warn if upscaling 
required", + produces=[], + needs=[], + config=TranscodeConfig( + video_bitrate_mbps=series_config.processing.transcode.video_bitrate_mbps, + minrate_mbps=series_config.processing.transcode.minrate_mbps, + maxrate_mbps=series_config.processing.transcode.maxrate_mbps, + bufsize_mbps=series_config.processing.transcode.bufsize_mbps, + gop_size=series_config.processing.transcode.gop_size, + force_deinterlace=series_config.processing.transcode.force_deinterlace, + resolution=Resolution.from_string(series_config.processing.transcode.resolution), + ), + ) + transcoded_videos = StepBuilder( id="transcode", phase=PROCESSING, module="preprocessor.steps.video.transcoding:VideoTranscoderStep", description=f"Conversion to {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} with adaptive bitrate", produces=["transcoded_videos/{season}/{episode}.mp4"], - needs=[], + needs=[resolution_analysis], config=TranscodeConfig( video_bitrate_mbps=series_config.processing.transcode.video_bitrate_mbps, minrate_mbps=series_config.processing.transcode.minrate_mbps, @@ -301,6 +320,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(characters_metadata) pipeline.register(character_references) + pipeline.register(resolution_analysis) pipeline.register(transcoded_videos) pipeline.register(scene_data) pipeline.register(exported_frames) diff --git a/preprocessor/app/step_builder.py b/preprocessor/app/step_builder.py index 01380f30f..a95a890d2 100644 --- a/preprocessor/app/step_builder.py +++ b/preprocessor/app/step_builder.py @@ -10,7 +10,7 @@ ) if TYPE_CHECKING: - from preprocessor.core.base_step import PipelineStep + pass @dataclass diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index e92009507..b53f4d68f 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -1,10 +1,13 @@ +import asyncio from pathlib import Path +import sys from typing import ( Callable, 
Tuple, ) import click +from elasticsearch import AsyncElasticsearch from preprocessor.app.pipeline_builder import PipelineExecutor from preprocessor.app.pipeline_factory import ( @@ -12,9 +15,15 @@ visualize, ) from preprocessor.cli.helpers import setup_pipeline_context +from preprocessor.cli.search_handler import ( + SearchCommandHandler, + SearchFilters, +) from preprocessor.cli.skip_list_builder import SkipListBuilder from preprocessor.config.series_config import SeriesConfig -from preprocessor.services.io.path_resolver import PathResolver +from preprocessor.services.io.path_service import PathService +from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.services.search.clients.embedding_service import EmbeddingService @click.group() @@ -46,10 +55,10 @@ def __run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: skip_list = SkipListBuilder.build(skip, series_config, setup.logger) plan = pipeline.get_execution_order(skip=skip_list) - source_path = PathResolver.get_input_base() / series + source_path = PathService.get_input_base() / series - setup.logger.info(f"📋 Execution plan: {' → '.join(plan)}") - setup.logger.info(f"📂 Source: {source_path}") + setup.logger.info(f"Execution plan: {' -> '.join(plan)}") + setup.logger.info(f"Source: {source_path}") executor = PipelineExecutor(setup.context) executor.execute_steps( @@ -60,9 +69,9 @@ def __run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: ) setup.logger.info("=" * 80) - setup.logger.info("🎉 Pipeline completed successfully!") + setup.logger.info("Pipeline completed successfully!") except KeyboardInterrupt: - setup.logger.info("\n🛑 Interrupted by user") + setup.logger.info("\nInterrupted by user") raise finally: setup.logger.finalize() @@ -81,15 +90,15 @@ def __step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> N deps = step.dependency_ids if deps: - setup.logger.info(f"📦 Dependencies: {', 
'.join(deps)}") + setup.logger.info(f"Dependencies: {', '.join(deps)}") for dep_id in deps: if not setup.context.state_manager.is_step_completed(dep_id, "*"): setup.logger.warning( - f"⚠️ Dependency '{dep_id}' may not be completed. " + f"Dependency '{dep_id}' may not be completed. " f"Run it first or use --force-rerun.", ) - source_path = PathResolver.get_input_base() / series + source_path = PathService.get_input_base() / series executor = PipelineExecutor(setup.context) executor.execute_step( @@ -99,9 +108,9 @@ def __step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> N episode_manager=setup.episode_manager, ) - setup.logger.info(f"✅ Step '{_step_id}' completed successfully") + setup.logger.info(f"Step '{_step_id}' completed successfully") except KeyboardInterrupt: - setup.logger.info("\n🛑 Interrupted by user") + setup.logger.info("\nInterrupted by user") raise finally: setup.logger.finalize() @@ -109,25 +118,43 @@ def __step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> N return __step_command +@cli.command(name="analyze-resolution") +@click.option("--series", required=True, help="Series name (e.g., ranczo, kiepscy)") +def __analyze_resolution(series: str) -> None: + pipeline = build_pipeline(series) + setup = setup_pipeline_context(series, "resolution_analysis", False, with_episode_manager=False) + + try: + step = pipeline.get_step("resolution_analysis") + step.execute(None, setup.context) + + setup.logger.info("Resolution analysis completed") + except KeyboardInterrupt: + setup.logger.info("\nInterrupted by user") + raise + finally: + setup.logger.finalize() + + @cli.command(name="search") @click.option("--series", required=True, help="Series name (e.g., ranczo, kiepscy)") -@click.option("--text", type=str, help="Full-text search po transkrypcjach") -@click.option("--text-semantic", type=str, help="Semantic search po text embeddings") -@click.option("--text-to-video", type=str, help="Cross-modal search: text query w 
video embeddings") -@click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search po video embeddings") -@click.option("--hash", "phash", type=str, help="Szukaj po perceptual hash (podaj hash string lub sciezke do obrazka)") -@click.option("--character", type=str, help="Szukaj po postaci") -@click.option("--emotion", type=str, help="Szukaj po emocji (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)") -@click.option("--object", "object_query", type=str, help="Szukaj po wykrytych obiektach (np. 'dog', 'person:5+', 'chair:2-4')") -@click.option("--episode-name", type=str, help="Fuzzy search po nazwach odcinkow") -@click.option("--episode-name-semantic", type=str, help="Semantic search po nazwach odcinkow") -@click.option("--list-characters", "list_chars_flag", is_flag=True, help="Lista wszystkich postaci") -@click.option("--list-objects", "list_objects_flag", is_flag=True, help="Lista wszystkich klas obiektow") -@click.option("--season", type=int, help="Filtruj po sezonie") -@click.option("--episode", type=int, help="Filtruj po odcinku") -@click.option("--limit", type=int, default=20, help="Limit wynikow") -@click.option("--stats", is_flag=True, help="Pokaz statystyki indeksow") -@click.option("--json-output", is_flag=True, help="Output w formacie JSON") +@click.option("--text", type=str, help="Full-text search by transcriptions") +@click.option("--text-semantic", type=str, help="Semantic search by text embeddings") +@click.option("--text-to-video", type=str, help="Cross-modal search: text query in video embeddings") +@click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search by video embeddings") +@click.option("--hash", "phash", type=str, help="Search by perceptual hash (provide hash string or image path)") +@click.option("--character", type=str, help="Search by character") +@click.option("--emotion", type=str, help="Search by emotion (neutral, happiness, surprise, sadness, anger, 
disgust, fear, contempt)") +@click.option("--object", "object_query", type=str, help="Search by detected objects (e.g., 'dog', 'person:5+', 'chair:2-4')") +@click.option("--episode-name", type=str, help="Fuzzy search by episode names") +@click.option("--episode-name-semantic", type=str, help="Semantic search by episode names") +@click.option("--list-characters", "list_chars_flag", is_flag=True, help="List all characters") +@click.option("--list-objects", "list_objects_flag", is_flag=True, help="List all object classes") +@click.option("--season", type=int, help="Filter by season") +@click.option("--episode", type=int, help="Filter by episode") +@click.option("--limit", type=int, default=20, help="Result limit") +@click.option("--stats", is_flag=True, help="Show index statistics") +@click.option("--json-output", is_flag=True, help="Output in JSON format") @click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements series: str, @@ -150,22 +177,11 @@ def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-state json_output: bool, host: str, ) -> None: - import asyncio # pylint: disable=import-outside-toplevel - import json # pylint: disable=import-outside-toplevel - import sys # pylint: disable=import-outside-toplevel - - from elasticsearch import AsyncElasticsearch # pylint: disable=import-outside-toplevel - - from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries # pylint: disable=import-outside-toplevel - from preprocessor.services.search.clients.embedding_service import EmbeddingService # pylint: disable=import-outside-toplevel - from preprocessor.services.search.clients.hash_service import HashService # pylint: disable=import-outside-toplevel - from preprocessor.services.search.clients.result_formatters import ResultFormatter # pylint: disable=import-outside-toplevel - if not any([ text, 
text_semantic, text_to_video, image, phash, character, emotion, object_query, episode_name, episode_name_semantic, list_chars_flag, list_objects_flag, stats, ]): - click.echo("Podaj przynajmniej jedna opcje wyszukiwania. Uzyj --help", err=True) + click.echo("Provide at least one search option. Use --help", err=True) sys.exit(1) series_config = SeriesConfig.load(series) @@ -173,27 +189,17 @@ def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-state hash_value = None if phash: - phash_path = Path(phash) - if phash_path.exists() and phash_path.is_file(): - click.echo(f"Computing perceptual hash from image: {phash}", err=True) - hash_svc = HashService() - hash_value = hash_svc.get_perceptual_hash(str(phash_path)) - if hash_value: - click.echo(f"Computed hash: {hash_value}", err=True) - else: - click.echo("Failed to compute hash from image", err=True) - sys.exit(1) - hash_svc.cleanup() - else: - hash_value = phash - - async def run() -> None: # pylint: disable=too-many-branches,too-many-statements + hash_value = SearchCommandHandler.compute_perceptual_hash(phash) + if hash_value is None: + sys.exit(1) + + async def __run() -> None: es_client = AsyncElasticsearch(hosts=[host], verify_certs=False) try: await es_client.ping() - except Exception: # pylint: disable=broad-except - click.echo(f"✗ Cannot connect to Elasticsearch at {host}", err=True) + except Exception: + click.echo(f"Cannot connect to Elasticsearch at {host}", err=True) click.echo("Make sure Elasticsearch is running:", err=True) click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) sys.exit(1) @@ -202,110 +208,45 @@ async def run() -> None: # pylint: disable=too-many-branches,too-many-statement queries = ElasticsearchQueries(embedding_svc, index_base) try: - if stats: - result = await queries.get_stats(es_client) - if json_output: - click.echo(json.dumps(result, indent=2)) - else: - click.echo("\nStatystyki:") - click.echo(f" Segments: {result['segments']:,}") - 
click.echo(f" Text Embeddings: {result['text_embeddings']:,}") - click.echo(f" Video Embeddings: {result['video_embeddings']:,}") - click.echo(f" Episode Names: {result['episode_names']:,}") + handler = SearchCommandHandler(es_client, embedding_svc, queries, json_output) + filters = SearchFilters(season, episode, character, limit) + result = None + if stats: + result = await handler.handle_stats() elif list_chars_flag: - chars = await queries.list_characters(es_client) - if json_output: - click.echo(json.dumps(chars, indent=2)) - else: - click.echo(f"\nZnaleziono {len(chars)} postaci:") - for char_name, count in sorted(chars, key=lambda x: -x[1]): - click.echo(f" {char_name}: {count:,} wystapien") - + result = await handler.handle_list_characters() elif list_objects_flag: - objects = await queries.list_objects(es_client) - if json_output: - click.echo(json.dumps(objects, indent=2)) - else: - click.echo(f"\nZnaleziono {len(objects)} klas obiektow:") - for obj_name, count in sorted(objects, key=lambda x: -x[1]): - click.echo(f" {obj_name}: {count:,} wystapien") - + result = await handler.handle_list_objects() elif text: - result = await queries.search_text_query(es_client, text, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "text") - + result = await handler.handle_text_search(text, filters) elif text_semantic: - result = await queries.search_text_semantic(es_client, text_semantic, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "text_semantic") - + result = await handler.handle_text_semantic_search(text_semantic, filters) elif text_to_video: - result = await queries.search_text_to_video(es_client, text_to_video, season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - + result 
= await handler.handle_text_to_video_search(text_to_video, filters) elif image: - result = await queries.search_video_semantic(es_client, str(image), season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - + result = await handler.handle_image_search(image, filters) elif emotion: - result = await queries.search_by_emotion(es_client, emotion, season, episode, character, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - + result = await handler.handle_emotion_search(emotion, filters) elif character: - result = await queries.search_by_character(es_client, character, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - + result = await handler.handle_character_search(character, filters) elif object_query: - result = await queries.search_by_object(es_client, object_query, season, episode, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - + result = await handler.handle_object_search(object_query, filters) elif hash_value: - result = await queries.search_perceptual_hash(es_client, hash_value, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "video") - + result = await handler.handle_hash_search(hash_value, filters) elif episode_name: - result = await queries.search_episode_name(es_client, episode_name, season, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "episode_name") - + result = await handler.handle_episode_name_search(episode_name, filters) elif episode_name_semantic: - result = await queries.search_episode_name_semantic(es_client, 
episode_name_semantic, season, limit) - if json_output: - click.echo(json.dumps(result["hits"], indent=2)) - else: - ResultFormatter.print_results(result, "episode_name") + result = await handler.handle_episode_name_semantic_search(episode_name_semantic, filters) + + if result: + click.echo(result) finally: embedding_svc.cleanup() await es_client.close() - asyncio.run(run()) + asyncio.run(__run()) _CLI_TEMPLATE_SERIES = "ranczo" diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index 05ea1b449..a8decd8c3 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -1,5 +1,4 @@ from dataclasses import dataclass -import logging from pathlib import Path from typing import Optional @@ -7,7 +6,7 @@ from preprocessor.core.state_manager import StateManager from preprocessor.services.core.logging import ErrorHandlingLogger from preprocessor.services.episodes.episode_manager import EpisodeManager -from preprocessor.services.io.path_resolver import PathResolver +from preprocessor.services.io.path_service import PathService @dataclass @@ -28,7 +27,7 @@ def build( with_episode_manager: bool = True, ) -> PipelineSetup: logger = PipelineContextFactory.__create_logger(logger_name) - base_dir = PathResolver.get_output_base() + base_dir = PathService.get_output_base() series_output_dir = PipelineContextFactory.__ensure_output_dir(base_dir, series) state_manager = PipelineContextFactory.__create_state_manager(series, series_output_dir) @@ -43,7 +42,7 @@ def build( episode_manager = None if with_episode_manager: - input_base = PathResolver.get_input_base() + input_base = PathService.get_input_base() episode_manager = PipelineContextFactory.__create_episode_manager( series, input_base, logger, ) @@ -64,7 +63,10 @@ def __create_episode_manager( episodes_json = None return EpisodeManager(episodes_json, series, logger) @staticmethod - def __create_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: + def __create_logger( + 
command_name: str, + loglevel: int = ErrorHandlingLogger.INFO, + ) -> ErrorHandlingLogger: return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) @staticmethod @@ -89,5 +91,5 @@ def setup_pipeline_context( return PipelineContextFactory.build(series, logger_name, force_rerun, with_episode_manager) -def __create_cli_logger(command_name: str, loglevel: int = logging.INFO) -> ErrorHandlingLogger: - return PipelineContextFactory.__create_logger(command_name, loglevel) +def create_cli_logger(command_name: str, loglevel: int = ErrorHandlingLogger.INFO) -> ErrorHandlingLogger: + return ErrorHandlingLogger(class_name=command_name, loglevel=loglevel, error_exit_code=1) diff --git a/preprocessor/cli/search_handler.py b/preprocessor/cli/search_handler.py new file mode 100644 index 000000000..a7cf50bfe --- /dev/null +++ b/preprocessor/cli/search_handler.py @@ -0,0 +1,235 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +import click +from elasticsearch import AsyncElasticsearch + +from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries +from preprocessor.services.search.clients.embedding_service import EmbeddingService +from preprocessor.services.search.clients.hash_service import HashService +from preprocessor.services.search.clients.result_formatters import ResultFormatter + + +class SearchFilters: + + def __init__( + self, + season: Optional[int] = None, + episode: Optional[int] = None, + character: Optional[str] = None, + limit: int = 20, + ) -> None: + self.season = season + self.episode = episode + self.character = character + self.limit = limit + + +class SearchCommandHandler: + + def __init__( + self, + es_client: AsyncElasticsearch, + embedding_service: EmbeddingService, + queries: ElasticsearchQueries, + json_output: bool, + ) -> None: + self._es = es_client + self._embedding = embedding_service + self._queries = queries + self._json_output = json_output + 
+ async def handle_stats(self) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.get_stats(self._es) + if self._json_output: + return json.dumps(result, indent=2) + + output = ["\nStatystyki:"] + output.append(f" Segments: {result['segments']:,}") + output.append(f" Text Embeddings: {result['text_embeddings']:,}") + output.append(f" Video Embeddings: {result['video_embeddings']:,}") + output.append(f" Episode Names: {result['episode_names']:,}") + return "\n".join(output) + + async def handle_list_characters(self) -> str: + import json # pylint: disable=import-outside-toplevel + + chars = await self._queries.list_characters(self._es) + if self._json_output: + return json.dumps(chars, indent=2) + + output = [f"\nZnaleziono {len(chars)} postaci:"] + for char_name, count in sorted(chars, key=lambda x: -x[1]): + output.append(f" {char_name}: {count:,} wystapien") + return "\n".join(output) + + async def handle_list_objects(self) -> str: + import json # pylint: disable=import-outside-toplevel + + objects = await self._queries.list_objects(self._es) + if self._json_output: + return json.dumps(objects, indent=2) + + output = [f"\nZnaleziono {len(objects)} klas obiektow:"] + for obj_name, count in sorted(objects, key=lambda x: -x[1]): + output.append(f" {obj_name}: {count:,} wystapien") + return "\n".join(output) + + async def handle_text_search(self, query: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_text_query( + self._es, query, filters.season, filters.episode, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "text") + + async def handle_text_semantic_search(self, query: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_text_semantic( + self._es, query, filters.season, 
filters.episode, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "text_semantic") + + async def handle_text_to_video_search(self, query: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_text_to_video( + self._es, query, filters.season, filters.episode, filters.character, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "video") + + async def handle_image_search(self, image_path: Path, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_video_semantic( + self._es, str(image_path), filters.season, filters.episode, filters.character, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "video") + + async def handle_emotion_search(self, emotion: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_by_emotion( + self._es, emotion, filters.season, filters.episode, filters.character, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "video") + + async def handle_character_search(self, character: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_by_character( + self._es, character, filters.season, filters.episode, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "video") + + async def handle_object_search(self, object_query: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = 
await self._queries.search_by_object( + self._es, object_query, filters.season, filters.episode, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "video") + + async def handle_hash_search(self, hash_value: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_perceptual_hash(self._es, hash_value, filters.limit) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "video") + + async def handle_episode_name_search(self, episode_name: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_episode_name( + self._es, episode_name, filters.season, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "episode_name") + + async def handle_episode_name_semantic_search(self, episode_name: str, filters: SearchFilters) -> str: + import json # pylint: disable=import-outside-toplevel + + result = await self._queries.search_episode_name_semantic( + self._es, episode_name, filters.season, filters.limit, + ) + if self._json_output: + return json.dumps(result["hits"], indent=2) + + return self._format_console_output(result, "episode_name") + + @staticmethod + def compute_perceptual_hash(phash_input: str) -> Optional[str]: + phash_path = Path(phash_input) + if phash_path.exists() and phash_path.is_file(): + click.echo(f"Computing perceptual hash from image: {phash_input}", err=True) + hash_svc = HashService() + hash_value = hash_svc.get_perceptual_hash(str(phash_path)) + if hash_value: + click.echo(f"Computed hash: {hash_value}", err=True) + else: + click.echo("Failed to compute hash from image", err=True) + return None + hash_svc.cleanup() + return hash_value + return phash_input + + @staticmethod + 
def _format_console_output(result: Dict[str, Any], result_type: str) -> str: + class __StringBuffer: + def __init__(self) -> None: + self.buffer: List[str] = [] + + def write(self, text: str) -> None: + self.buffer.append(text) + + def getvalue(self) -> str: + return ''.join(self.buffer) + + buffer = __StringBuffer() + + original_echo = click.echo + + def __buffer_echo(message: Optional[str] = None, **_kwargs: Any) -> None: + if message is not None: + buffer.write(str(message) + '\n') + + click.echo = __buffer_echo + try: + ResultFormatter.print_results(result, result_type) + finally: + click.echo = original_echo + + return buffer.getvalue().rstrip() diff --git a/preprocessor/cli/skip_list_builder.py b/preprocessor/cli/skip_list_builder.py index 5285861a7..826d4fbee 100644 --- a/preprocessor/cli/skip_list_builder.py +++ b/preprocessor/cli/skip_list_builder.py @@ -16,6 +16,6 @@ def build( ) -> List[str]: skip_list = list(cli_skip) if series_config.pipeline_mode == "selective" and series_config.skip_steps: - logger.info(f"🔧 Selective mode: auto-skipping {', '.join(series_config.skip_steps)}") + logger.info(f"Selective mode: auto-skipping {', '.join(series_config.skip_steps)}") skip_list.extend(series_config.skip_steps) return list(set(skip_list)) diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 06c95b672..f63e49a5a 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -8,13 +8,13 @@ ) -def __deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: +def _deep_merge(base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]: result: Dict[str, Any] = base.copy() for key, value in override.items(): if key.startswith('_'): continue if key in result and isinstance(result[key], dict) and isinstance(value, dict): - result[key] = __deep_merge(result[key], value) + result[key] = _deep_merge(result[key], value) else: result[key] = value return result @@ -197,6 
+197,6 @@ def __load_from_file(config_path: Path) -> 'SeriesConfig': if not k.startswith('_') } - merged_config: Dict[str, Any] = __deep_merge(defaults, series_filtered) + merged_config: Dict[str, Any] = _deep_merge(defaults, series_filtered) return SeriesConfig.__load_from_dict(merged_config) diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index 02dff410c..3400b24ec 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -4,12 +4,16 @@ ) from pathlib import Path from typing import ( + TYPE_CHECKING, Any, Dict, List, Optional, ) +if TYPE_CHECKING: + from preprocessor.services.episodes.episode_manager import EpisodeInfo + @dataclass(frozen=True) class Artifact: @@ -18,7 +22,7 @@ class Artifact: @dataclass(frozen=True) class EpisodeArtifact(Artifact): episode_id: str - episode_info: Any + episode_info: 'EpisodeInfo' @dataclass(frozen=True) class SourceVideo(EpisodeArtifact): @@ -112,4 +116,9 @@ class ValidationResult(Artifact): season: str validation_report_dir: Path +@dataclass(frozen=True) +class ResolutionAnalysisResult(Artifact): + total_files: int + upscaling_percentage: float + ProcessedEpisode = ElasticDocuments diff --git a/preprocessor/core/processing_metadata.py b/preprocessor/core/processing_metadata.py index 0ef038fb0..06e7a6484 100644 --- a/preprocessor/core/processing_metadata.py +++ b/preprocessor/core/processing_metadata.py @@ -23,10 +23,10 @@ class StepMetadata: start_time: Optional[datetime] = None status: str = 'pending' - def skip(self): + def skip(self) -> None: self.status = 'skipped' - def start(self): + def start(self) -> None: self.start_time = datetime.now() self.status = 'running' @@ -46,7 +46,7 @@ def to_dict(self) -> Dict[str, Any]: class ProcessingMetadata: - def __init__(self, series_name: str, params: Dict[str, Any]): + def __init__(self, series_name: str, params: Dict[str, Any]) -> None: self.series_name = series_name self.params = self.__sanitize_params(params) self.start_time 
= datetime.now() diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 2a34ed042..5bf62b80b 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -47,7 +47,7 @@ def to_dict(self) -> Dict[str, Any]: } @classmethod - def __from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': # pylint: disable=unused-private-member + def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': completed_steps = [ StepCheckpoint(**step) for step in data.get('completed_steps', []) ] @@ -90,7 +90,7 @@ def load_or_create_state(self) -> ProcessingState: console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') with open(self.__state_file, 'r', encoding='utf-8') as f: data = json.load(f) - self.__state = ProcessingState.__from_dict(data) + self.__state = ProcessingState._from_dict(data) console.print(f'[green]Loaded state for series: {self.__state.series_name}[/green]') console.print(f'[green]Completed steps: {len(self.__state.completed_steps)}[/green]') return self.__state @@ -116,7 +116,7 @@ def mark_step_completed(self, step: str, episode: str) -> None: self.__state.completed_steps.append(checkpoint) self.__state.in_progress = None self.__save_state() - console.print(f'[green]✓ Completed: {step} for {episode}[/green]') + console.print(f'[green]Completed: {step} for {episode}[/green]') def mark_step_started( self, step: str, episode: str, temp_files: Optional[List[str]] = None, @@ -132,24 +132,6 @@ def mark_step_started( self.__save_state() console.print(f'[cyan]Started: {step} for {episode}[/cyan]') - def __rollback_in_progress(self) -> None: # pylint: disable=unused-private-member - if self.__state is None or self.__state.in_progress is None: - return - console.print( - f'[yellow]Rolling back in-progress step: ' - f'{self.__state.in_progress.step}[/yellow]', - ) - for temp_file in self.__state.in_progress.temp_files: - temp_path = Path(temp_file) - if temp_path.exists(): - try: - 
temp_path.unlink() - console.print(f'[yellow]Removed temp file: {temp_file}[/yellow]') - except OSError as e: - console.print(f'[red]Failed to remove {temp_file}: {e}[/red]') - self.__state.in_progress = None - self.__save_state() - def __save_state(self) -> None: if self.__state is None: return diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index f3d0cbb06..923bdbf21 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -61,7 +61,7 @@ def __load_model(self) -> None: enforce_eager=True, disable_log_stats=True, ) - console.print('[green]✓ LLM loaded successfully (vLLM)[/green]') + console.print('[green]LLM loaded successfully (vLLM)[/green]') except Exception as e: console.print(f'[red]Failed to load model: {e}[/red]') raise @@ -94,7 +94,7 @@ def __init_client(self) -> None: base_url='https://generativelanguage.googleapis.com/v1beta/openai/', api_key=api_key, ) - console.print('[green]✓ Gemini 2.5 Flash initialized[/green]') + console.print('[green]Gemini 2.5 Flash initialized[/green]') except Exception as e: console.print(f'[red]Failed to initialize Gemini client: {e}[/red]') raise diff --git a/preprocessor/services/ai/models.py b/preprocessor/services/ai/models.py index ba8e7809b..6b7d06ef7 100644 --- a/preprocessor/services/ai/models.py +++ b/preprocessor/services/ai/models.py @@ -1,4 +1,5 @@ from typing import ( + Dict, List, Optional, ) @@ -19,9 +20,7 @@ class EpisodeInfo(BaseModel): @field_validator('viewership', mode='before') @classmethod - @staticmethod - def __convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: - # pylint: disable=unused-private-member + def __convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: # pylint: disable=unused-private-member if v is None: return None if isinstance(v, int): @@ -35,9 +34,7 @@ class SeasonMetadata(BaseModel): @model_validator(mode='before') @classmethod - @staticmethod - def __convert_old_format(cls, data: 
dict) -> dict: - # pylint: disable=unused-private-member # pylint: disable=unused-private-member + def __convert_old_format(cls, data: Dict) -> Dict: # pylint: disable=unused-private-member if isinstance(data, dict) and 'episodes' in data: for idx, episode in enumerate(data['episodes'], start=1): if isinstance(episode, dict) and 'episode_number' in episode and ('episode_in_season' not in episode): diff --git a/preprocessor/services/ai/provider.py b/preprocessor/services/ai/provider.py index e23b68ff0..6488c5635 100644 --- a/preprocessor/services/ai/provider.py +++ b/preprocessor/services/ai/provider.py @@ -96,8 +96,7 @@ def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserM cls.__instance = super().__new__(cls) return cls.__instance - def __extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: - # pylint: disable=unused-private-member + def __extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: # pylint: disable=unused-private-member return self.__process_llm_request( system_prompt=extract_episode_metadata_system.get(), user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), @@ -124,8 +123,7 @@ def __extract_json(content: str) -> Dict[str, Any]: console.print(f'[yellow]Raw content:\n{content}[/yellow]') raise - def __extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: - # pylint: disable=unused-private-member + def __extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: # pylint: disable=unused-private-member return self.__process_llm_request( system_prompt=extract_season_system.get(), user_prompt=extract_season_user.get().format(url=url, page_text=page_text), @@ -133,8 +131,7 @@ def __extract_season_episodes(self, page_text: str, url: str) -> Optional[Season error_context=f'extraction failed for {url}', ) - def __merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> 
EpisodeMetadata: - # pylint: disable=unused-private-member + def __merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: # pylint: disable=unused-private-member if not metadata_list: raise ValueError('No metadata to merge') if len(metadata_list) == 1: diff --git a/preprocessor/services/characters/face_detection.py b/preprocessor/services/characters/face_detection.py index 33ca131ca..a70a391bc 100644 --- a/preprocessor/services/characters/face_detection.py +++ b/preprocessor/services/characters/face_detection.py @@ -65,7 +65,7 @@ def init() -> FaceAnalysis: available_providers = ort.get_available_providers() console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") if 'CUDAExecutionProvider' not in available_providers: - console.print('[red]✗ CUDAExecutionProvider not available in onnxruntime[/red]') + console.print('[red]CUDAExecutionProvider not available in onnxruntime[/red]') console.print('[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]') raise RuntimeError('CUDA provider not available in onnxruntime') providers = [( @@ -86,16 +86,16 @@ def init() -> FaceAnalysis: face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) face_app.prepare(ctx_id=0, det_size=settings.face_recognition.detection_size, det_thresh=settings.character.face_detection_threshold) except Exception as e: - console.print('[red]✗ Failed to initialize face detection on GPU[/red]') + console.print('[red]Failed to initialize face detection on GPU[/red]') console.print(f'[red] Error: {e}[/red]') console.print('[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]') raise RuntimeError('GPU required but face detection initialization failed') from e actual_providers = face_app.models['detection'].session.get_providers() if 'CUDAExecutionProvider' not in actual_providers: - console.print('[red]✗ CUDA provider not active after 
initialization[/red]') + console.print('[red]CUDA provider not active after initialization[/red]') console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") raise RuntimeError('CUDA required but not available for face detection') - console.print(f'[green]✓ Face detection initialized ({settings.face_recognition.model_name})[/green]') + console.print(f'[green]Face detection initialized ({settings.face_recognition.model_name})[/green]') console.print('[dim] Device: GPU (CUDA)[/dim]') console.print(f'[dim] Detection size: {settings.face_recognition.detection_size}[/dim]') console.print(f'[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]') @@ -113,7 +113,7 @@ def load_character_references(characters_dir: Path, face_app: FaceAnalysis) -> D vector_file = char_dir / 'face_vector.npy' if vector_file.exists(): character_vectors[char_name] = np.load(vector_file) - console.print(f'[dim] ✓ {char_name}: loaded from face_vector.npy[/dim]') + console.print(f'[dim]{char_name}: loaded from face_vector.npy[/dim]') continue images = list(char_dir.glob('*.jpg')) if not images: @@ -127,8 +127,8 @@ def load_character_references(characters_dir: Path, face_app: FaceAnalysis) -> D mean_emb = np.mean(embeddings, axis=0) centroid = mean_emb / norm(mean_emb) character_vectors[char_name] = centroid - console.print(f'[green] ✓ {char_name}: {len(embeddings)} reference images[/green]') - console.print(f'[green]✓ Loaded {len(character_vectors)} characters[/green]') + console.print(f'[green]{char_name}: {len(embeddings)} reference images[/green]') + console.print(f'[green]Loaded {len(character_vectors)} characters[/green]') return character_vectors @staticmethod diff --git a/preprocessor/services/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py index fdbd8ab44..369d5f3ec 100644 --- a/preprocessor/services/characters/image_search/google_image_search.py +++ 
b/preprocessor/services/characters/image_search/google_image_search.py @@ -10,7 +10,7 @@ class GoogleImageSearch(BaseImageSearch): - def __init__(self, api_key: str, max_results: int=50): + def __init__(self, api_key: str, max_results: int=50) -> None: super().__init__(max_results) if not api_key: raise ValueError('SerpAPI key is required for Google Image Search') diff --git a/preprocessor/services/characters/image_search/image_search.py b/preprocessor/services/characters/image_search/image_search.py index fdc3305fb..d6bafe33a 100644 --- a/preprocessor/services/characters/image_search/image_search.py +++ b/preprocessor/services/characters/image_search/image_search.py @@ -10,7 +10,7 @@ class BaseImageSearch(ABC): - def __init__(self, max_results: int=50): + def __init__(self, max_results: int=50) -> None: self.max_results = max_results @property diff --git a/preprocessor/services/characters/models.py b/preprocessor/services/characters/models.py index 777447e9e..484aebebc 100644 --- a/preprocessor/services/characters/models.py +++ b/preprocessor/services/characters/models.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from pathlib import Path +from typing import List import numpy as np @@ -15,4 +16,4 @@ class FaceData: @dataclass class CandidateFace: avg_similarity: float - faces: list[FaceData] + faces: List[FaceData] diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index c42f23613..557078ab3 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -39,7 +39,7 @@ class CharacterReferenceDownloader(BaseProcessor): - def __init__(self, args: Dict[str, Any]): + def __init__(self, args: Dict[str, Any]) -> None: super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=8, loglevel=logging.DEBUG) self.characters_json: Path = self._args['characters_json'] self.series_name: str = 
self._args['series_name'] diff --git a/preprocessor/services/core/base_processor.py b/preprocessor/services/core/base_processor.py index 7b28fafff..4328b2385 100644 --- a/preprocessor/services/core/base_processor.py +++ b/preprocessor/services/core/base_processor.py @@ -35,6 +35,16 @@ class OutputSpec: path: Path required: bool = True + +@dataclass +class _FilterResult: + """Result of filtering processing items.""" + items_to_process: List[ProcessingItem] + skipped_count: int + skip_messages: List[str] + total_items: int + + class BaseProcessor(ABC): DESCRIPTION: str = '' PRIORITY: int = 100 @@ -75,15 +85,39 @@ def work(self) -> int: return self.logger.finalize() def _execute(self) -> None: + """Main execution flow - orchestration only.""" all_items = self._get_processing_items() if not all_items: console.print('[yellow]No items to process[/yellow]') return - items_to_process = [] + + filter_result = self.__filter_skipped_items(all_items) + + if not filter_result.items_to_process: + console.print( + f'[yellow]All items already processed ' + f'({filter_result.total_items} total, {filter_result.skipped_count} skipped)[/yellow]', + ) + return + + self.__display_processing_summary(filter_result) + self.__execute_processing(filter_result.items_to_process) + self._finalize() + + def __filter_skipped_items(self, all_items: List[ProcessingItem]) -> _FilterResult: + """ + Filters out items that should be skipped (cached). 
+ + Returns: + FilterResult with items to process and skip information + """ + items_to_process: List[ProcessingItem] = [] skipped_count = 0 - skip_messages = [] + skip_messages: List[str] = [] + for item in all_items: should_skip, missing_outputs, skip_message = self.__should_skip_item(item) + if should_skip: if skip_message: skip_messages.append(skip_message) @@ -91,20 +125,24 @@ def _execute(self) -> None: else: item.metadata['missing_outputs'] = missing_outputs items_to_process.append(item) - if not items_to_process: - console.print( - f'[yellow]All items already processed ' - f'({len(all_items)} total, {skipped_count} skipped)[/yellow]', - ) - return - for skip_message in skip_messages: + + return _FilterResult( + items_to_process=items_to_process, + skipped_count=skipped_count, + skip_messages=skip_messages, + total_items=len(all_items), + ) + + @staticmethod + def __display_processing_summary(result: _FilterResult) -> None: + """Displays summary of what will be processed and what was skipped.""" + for skip_message in result.skip_messages: console.print(skip_message) + console.print( - f'[blue]Processing {len(items_to_process)} items ' - f'(of {len(all_items)} total, {skipped_count} skipped)[/blue]', + f'[blue]Processing {len(result.items_to_process)} items ' + f'(of {result.total_items} total, {result.skipped_count} skipped)[/blue]', ) - self.__execute_processing(items_to_process) - self._finalize() @abstractmethod def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: diff --git a/preprocessor/services/core/logging.py b/preprocessor/services/core/logging.py index 5f8635001..54468c6db 100644 --- a/preprocessor/services/core/logging.py +++ b/preprocessor/services/core/logging.py @@ -9,10 +9,15 @@ class LoggerNotFinalizedException(Exception): - def __init__(self): + def __init__(self) -> None: super().__init__('Logger destroyed without finalize() being called.') class ErrorHandlingLogger: + DEBUG = 10 + INFO = 20 + WARNING = 30 + ERROR = 40 
+ CRITICAL = 50 def __init__(self, class_name: str, loglevel: int, error_exit_code: int) -> None: self.__class_name: str = class_name diff --git a/preprocessor/services/episodes/episode_manager.py b/preprocessor/services/episodes/episode_manager.py index 0504926cd..0f210861e 100644 --- a/preprocessor/services/episodes/episode_manager.py +++ b/preprocessor/services/episodes/episode_manager.py @@ -37,12 +37,12 @@ def episode_num(self) -> str: def season_code(self) -> str: return f'S{self.season:02d}' - def __is_special(self) -> bool: # pylint: disable=unused-private-member + def __is_special(self) -> bool: # pylint: disable=unused-private-member return self.season == 0 class EpisodeManager: - def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: Optional[ErrorHandlingLogger]=None): + def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: Optional[ErrorHandlingLogger]=None) -> None: self.series_name = series_name.lower() self.episodes_data: Optional[Dict[str, Any]] = None self.path_manager = PathManager(self.series_name) @@ -131,7 +131,7 @@ def __find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> return scene_file return None - def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]: # pylint: disable=unused-private-member + def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]: # pylint: disable=unused-private-member if not search_dir.exists(): return None season_dir_name = episode_info.season_code() @@ -148,7 +148,7 @@ def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, return None @staticmethod - def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: # pylint: disable=unused-private-member + def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: # pylint: 
disable=unused-private-member if not search_dir.exists(): return None if search_dir.is_file(): @@ -165,7 +165,7 @@ def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[P return video_file return None - def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-private-member + def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-private-member episodes: List[EpisodeInfo] = [] if not self.episodes_data: return episodes @@ -185,7 +185,7 @@ def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-pri return episodes @staticmethod - def __load_scene_timestamps( # pylint: disable=unused-private-member + def __load_scene_timestamps( # pylint: disable=unused-private-member episode_info: EpisodeInfo, search_dir: Optional[Path], _logger: Optional[ErrorHandlingLogger]=None, diff --git a/preprocessor/services/io/__init__.py b/preprocessor/services/io/__init__.py index 6eb13cfaf..c209731a0 100644 --- a/preprocessor/services/io/__init__.py +++ b/preprocessor/services/io/__init__.py @@ -1,5 +1,4 @@ from preprocessor.services.io.path_manager import PathManager -from preprocessor.services.io.path_resolver import PathResolver from preprocessor.services.io.path_service import PathService -__all__ = ['PathManager', 'PathResolver', 'PathService'] +__all__ = ['PathManager', 'PathService'] diff --git a/preprocessor/services/io/files.py b/preprocessor/services/io/files.py index 160bd2e53..e5dda10ad 100644 --- a/preprocessor/services/io/files.py +++ b/preprocessor/services/io/files.py @@ -34,7 +34,7 @@ def __atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: raise @staticmethod - def __atomic_write_text(path: Path, content: str) -> None: # pylint: disable=unused-private-member + def __atomic_write_text(path: Path, content: str) -> None: # pylint: disable=unused-private-member def __write(temp: Path) -> None: with open(temp, 'w', encoding='utf-8') as f: diff --git 
a/preprocessor/services/io/hashing.py b/preprocessor/services/io/hashing.py index f0756a9a6..e31f77ee3 100644 --- a/preprocessor/services/io/hashing.py +++ b/preprocessor/services/io/hashing.py @@ -1,48 +1,2 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from preprocessor.config.config import settings -from preprocessor.services.episodes import EpisodeInfo -from preprocessor.services.io.files import FileOperations -from preprocessor.services.io.metadata import MetadataBuilder -from preprocessor.services.io.path_manager import PathManager - - class HashStorage: - - @staticmethod - def __save_image_hashes_to_json( # pylint: disable=unused-private-member - episode_info: EpisodeInfo, - hash_results: List[Dict[str, Any]], - series_name: str, - device: str, - batch_size: int, - ) -> Path: - path_manager = PathManager(series_name) - episode_dir = path_manager.get_episode_dir( - episode_info, - settings.output_subdirs.image_hashes, - ) - episode_dir.mkdir(parents=True, exist_ok=True) - unique_hashes = len( - set(( - h.get('perceptual_hash') - for h in hash_results - if 'perceptual_hash' in h - )), - ) - hash_data = MetadataBuilder.create_processing_metadata( - episode_info=episode_info, - processing_params={'device': device, 'batch_size': batch_size, 'hash_size': 8}, - statistics={'total_hashes': len(hash_results), 'unique_hashes': unique_hashes}, - results_key='image_hashes', - results_data=hash_results, - ) - hash_filename = path_manager.build_filename(episode_info, extension='json', suffix='image_hashes') - output_path = episode_dir / hash_filename - FileOperations.atomic_write_json(output_path, hash_data) - return output_path + pass diff --git a/preprocessor/services/io/path_resolver.py b/preprocessor/services/io/path_resolver.py deleted file mode 100644 index efacbfb63..000000000 --- a/preprocessor/services/io/path_resolver.py +++ /dev/null @@ -1,17 +0,0 @@ -from pathlib import Path - -from preprocessor.services.io.path_service import 
PathService - - -class PathResolver: - - @staticmethod - def get_input_base() -> Path: - return PathService.get_input_base() - - @staticmethod - def get_output_base() -> Path: - return PathService.get_output_base() - @staticmethod - def _is_docker() -> bool: - return PathService._is_docker() diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index a82405f52..7801ab460 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -18,7 +18,7 @@ class FFmpegWrapper: __B_ADAPT = '1' __LEVEL = '4.1' __PIX_FMT = 'yuv420p' - __PROFILE = 'main' + __PROFILE = 'high' __RC_LOOKAHEAD = '32' __TWO_PASS = '1' @@ -102,6 +102,38 @@ def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]: return None return round(int(bit_rate) / 1000000, 2) + @staticmethod + def get_resolution(probe_data: Dict[str, Any]) -> Tuple[int, int]: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') + if not stream: + raise ValueError('No video streams found') + width = stream.get('width') + height = stream.get('height') + if not width or not height: + raise ValueError('Resolution not found') + return int(width), int(height) + + @staticmethod + def get_sample_aspect_ratio(probe_data: Dict[str, Any]) -> Tuple[int, int]: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') + if not stream: + return (1, 1) + sar = stream.get('sample_aspect_ratio', '1:1') + if sar == '0:1' or not sar: + return (1, 1) + try: + num, denom = [int(x) for x in sar.split(':')] + return (num, denom) + except (ValueError, AttributeError): + return (1, 1) + + @staticmethod + def get_field_order(probe_data: Dict[str, Any]) -> str: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') + if not stream: + return 'unknown' + return stream.get('field_order', 'unknown') + @staticmethod def probe_video(video_path: Path) -> Dict[str, Any]: cmd = ['ffprobe', '-v', 'error', '-show_streams', '-show_format', '-of', 
'json', str(video_path)] @@ -123,13 +155,14 @@ def transcode( # pylint: disable=too-many-arguments gop_size: int, target_fps: Optional[float] = None, deinterlace: bool = False, + is_upscaling: bool = False, ) -> None: width, height = [int(x) for x in resolution.split(':')] - vf_filter = FFmpegWrapper.__build_video_filter(width, height, deinterlace) + vf_filter = FFmpegWrapper.__build_video_filter(width, height, deinterlace, is_upscaling) command = FFmpegWrapper.__build_base_command(input_path, codec, preset, target_fps) command.extend( FFmpegWrapper.__build_encoding_params( - video_bitrate, minrate, maxrate, bufsize, gop_size, + video_bitrate, minrate, maxrate, bufsize, gop_size, is_upscaling, ), ) command.extend( @@ -172,9 +205,14 @@ def __build_base_command( @staticmethod def __build_encoding_params( - video_bitrate: str, minrate: str, maxrate: str, bufsize: str, gop_size: int, + video_bitrate: str, + minrate: str, + maxrate: str, + bufsize: str, + gop_size: int, + is_upscaling: bool = False, ) -> List[str]: - return [ + params = [ '-rc', 'vbr_hq', '-b:v', video_bitrate, '-minrate', minrate, @@ -183,25 +221,41 @@ def __build_encoding_params( '-bf', FFmpegWrapper.__BF, '-b_adapt', FFmpegWrapper.__B_ADAPT, '-2pass', FFmpegWrapper.__TWO_PASS, - '-rc-lookahead', FFmpegWrapper.__RC_LOOKAHEAD, - '-aq-strength', FFmpegWrapper.__AQ_STRENGTH, + '-multipass', 'fullres', '-g', str(gop_size), '-spatial-aq', '1', '-temporal-aq', '1', - '-multipass', 'fullres', ] + if is_upscaling: + params.extend([ + '-rc-lookahead', '60', + '-aq-strength', '18', + '-b_ref_mode', 'middle', + ]) + else: + params.extend([ + '-rc-lookahead', FFmpegWrapper.__RC_LOOKAHEAD, + '-aq-strength', FFmpegWrapper.__AQ_STRENGTH, + ]) + + return params + @staticmethod - def __build_video_filter(width: int, height: int, deinterlace: bool = False) -> str: + def __build_video_filter( + width: int, height: int, deinterlace: bool = False, is_upscaling: bool = False, + ) -> str: filters = [] if deinterlace: 
filters.append('bwdif=mode=0') + scaler_flags = 'lanczos' if is_upscaling else 'bicubic' + filters.append( f"scale='iw*sar:ih',scale={width}:{height}:" - f"force_original_aspect_ratio=decrease,pad={width}:{height}:" - f"(ow-iw)/2:(oh-ih)/2:black,setsar=1", + f"force_original_aspect_ratio=decrease:flags={scaler_flags}," + f"pad={width}:{height}:(ow-iw)/2:(oh-ih)/2:black,setsar=1", ) return ','.join(filters) diff --git a/preprocessor/services/media/resolution.py b/preprocessor/services/media/resolution.py index fd255dccd..cfa0ef219 100644 --- a/preprocessor/services/media/resolution.py +++ b/preprocessor/services/media/resolution.py @@ -18,15 +18,15 @@ class Resolution(Enum): R480P = (854, 480) R720P = (1280, 720) - def __init__(self, width: int, height: int): + def __init__(self, width: int, height: int) -> None: self.width = width self.height = height - def __str__(self): + def __str__(self) -> str: return f'{self.height}p' @classmethod - def __from_str(cls: Type[T], init: str) -> T: # pylint: disable=unused-private-member + def from_string(cls: Type[T], init: str) -> T: init = init.strip() if not init[0].isalpha(): init = 'R' + init.upper() @@ -35,5 +35,9 @@ def __from_str(cls: Type[T], init: str) -> T: # pylint: disable=unused-private-m return cls[init] @classmethod - def __get_all_choices(cls) -> List[str]: # pylint: disable=unused-private-member + def __from_str(cls: Type[T], init: str) -> T: # pylint: disable=unused-private-member + return cls.from_string(init) + + @classmethod + def __get_all_choices(cls) -> List[str]: # pylint: disable=unused-private-member return [str(r) for r in cls] diff --git a/preprocessor/services/media/scene_detection.py b/preprocessor/services/media/scene_detection.py index 9c1e9329c..bfd70e3ca 100644 --- a/preprocessor/services/media/scene_detection.py +++ b/preprocessor/services/media/scene_detection.py @@ -15,7 +15,7 @@ class TransNetWrapper: - def __init__(self): + def __init__(self) -> None: self.model: Optional[TransNetV2] = 
None def cleanup(self) -> None: diff --git a/preprocessor/services/scraping/base_scraper.py b/preprocessor/services/scraping/base_scraper.py index 8f4f7ce1d..c520fb9ce 100644 --- a/preprocessor/services/scraping/base_scraper.py +++ b/preprocessor/services/scraping/base_scraper.py @@ -23,7 +23,7 @@ class BaseScraper(BaseProcessor): - def __init__(self, args: Dict[str, Any], error_exit_code: int=7): + def __init__(self, args: Dict[str, Any], error_exit_code: int=7) -> None: super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=error_exit_code, loglevel=logging.DEBUG) self.urls: List[str] = self._args['urls'] self.output_file: Path = self._args['output_file'] @@ -74,7 +74,7 @@ def __scrape_all_urls(self) -> List[Dict[str, Any]]: page_text = self.__scrape_url(url) if page_text: scraped_pages.append({'url': url, 'markdown': page_text}) - console.print(f'[green]✓[/green] {url}: {len(page_text)} chars') + console.print(f'[green][/green] {url}: {len(page_text)} chars') else: self.logger.error(f'Failed to scrape {url}') except Exception as e: diff --git a/preprocessor/services/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py index b4a56700e..0fa8a7b13 100644 --- a/preprocessor/services/scraping/base_scraper_step.py +++ b/preprocessor/services/scraping/base_scraper_step.py @@ -7,6 +7,7 @@ Any, Dict, Optional, + Type, TypeVar, ) @@ -56,7 +57,7 @@ def execute( return input_data @abstractmethod - def _get_scraper_class(self): + def _get_scraper_class(self) -> Type: pass @abstractmethod diff --git a/preprocessor/services/scraping/character_scraper.py b/preprocessor/services/scraping/character_scraper.py index c478649fd..505085ed6 100644 --- a/preprocessor/services/scraping/character_scraper.py +++ b/preprocessor/services/scraping/character_scraper.py @@ -21,5 +21,5 @@ def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: return result = {'sources': [item['url'] for item in scraped_pages], 
'characters': [char.model_dump() for char in characters]} self._save_result(result) - console.print(f'[green]✓ Extracted {len(characters)} characters[/green]') - console.print(f'[green]✓ Saved to: {self.output_file}[/green]') + console.print(f'[green]Extracted {len(characters)} characters[/green]') + console.print(f'[green]Saved to: {self.output_file}[/green]') diff --git a/preprocessor/services/scraping/episode_scraper.py b/preprocessor/services/scraping/episode_scraper.py index 703ee6083..166652bbc 100644 --- a/preprocessor/services/scraping/episode_scraper.py +++ b/preprocessor/services/scraping/episode_scraper.py @@ -27,8 +27,8 @@ def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: result = {'sources': [item['url'] for item in scraped_pages], 'seasons': [season.model_dump() for season in all_seasons]} self._save_result(result) total_episodes = sum((len(season.episodes) for season in all_seasons)) - console.print(f'[green]✓ Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]') - console.print(f'[green]✓ Saved to: {self.output_file}[/green]') + console.print(f'[green]Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]') + console.print(f'[green]Saved to: {self.output_file}[/green]') self.__validate_and_report_coverage(total_episodes) def __count_video_files(self, directory: Path) -> int: @@ -55,23 +55,23 @@ def __get_expected_episodes_count(self) -> Optional[int]: @staticmethod def __print_coverage_report(scraped: int, expected: int, status: str, message: str) -> None: coverage_pct = scraped / expected * 100 if expected > 0 else 0 - console.print('\n[yellow]⚠ Episode coverage validation:[/yellow]') + console.print('\n[yellow]Episode coverage validation:[/yellow]') console.print(f' [cyan]Scraped episodes: {scraped}[/cyan]') console.print(f' [cyan]Video files found: {expected}[/cyan]') console.print(f' [cyan]Coverage: {coverage_pct:.1f}%[/cyan]') if status == 'missing': - console.print(f'\n[red]✗ 
WARNING: {message}![/red]') + console.print(f'\n[red]WARNING: {message}![/red]') console.print(' [yellow]Consider adding more URLs to --scrape-urls[/yellow]') console.print(' [dim]Not all video files will have metadata available[/dim]\n') elif status == 'extra': - console.print(f'\n[yellow]⚠ Note: {message}[/yellow]') + console.print(f'\n[yellow]Note: {message}[/yellow]') console.print(' [dim]This is OK if you plan to add more videos later[/dim]\n') else: - console.print('\n[green]✓ Perfect coverage - all video files have metadata![/green]\n') + console.print('\n[green]Perfect coverage - all video files have metadata![/green]\n') @staticmethod def __print_no_validation_warning(scraped_count: int) -> None: - console.print('\n[yellow]⚠ Coverage validation:[/yellow]') + console.print('\n[yellow]Coverage validation:[/yellow]') console.print(f' [cyan]Scraped episodes: {scraped_count}[/cyan]') console.print(' [yellow]No video directory provided - unable to validate coverage[/yellow]') console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]') diff --git a/preprocessor/services/scraping/grid_visualizer.py b/preprocessor/services/scraping/grid_visualizer.py new file mode 100644 index 000000000..397245cea --- /dev/null +++ b/preprocessor/services/scraping/grid_visualizer.py @@ -0,0 +1,425 @@ +from dataclasses import dataclass +from datetime import datetime +import json +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +import cv2 +import numpy as np + +from preprocessor.config.config import settings + + +@dataclass +class GridDimensions: + face_size: int = 280 + faces_per_char: int = 3 + footer_height: int = 80 + header_height: int = 180 + header_row_height: int = 40 + label_col_width: int = 350 + padding: int = 15 + stats_col_width: int = 200 + + @property + def face_col_width(self) -> int: + return self.face_size + self.padding + + @property + def row_height(self) -> int: + return self.face_size + 
self.padding * 2 + + def total_height(self, num_chars: int) -> int: + return self.header_height + num_chars * self.row_height + self.footer_height + + def total_width(self) -> int: + return ( + self.label_col_width + + self.stats_col_width + + self.faces_per_char * self.face_col_width + + self.padding * 2 + ) + + +class CharacterGridVisualizer: + def __init__( + self, + dimensions: Optional[GridDimensions] = None, + similarity_threshold: float = 0.5, + ) -> None: + self._dims = dimensions or GridDimensions() + self._similarity_threshold = similarity_threshold + + def generate_grid( + self, + processed_chars_dir: Path, + output_path: Path, + ) -> Dict[str, Any]: + processed_chars = sorted([d for d in processed_chars_dir.iterdir() if d.is_dir()]) + + if not processed_chars: + return { + 'width': 0, + 'height': 0, + 'num_chars': 0, + 'avg_similarity': 0.0, + } + + canvas = self.__create_canvas(processed_chars) + metadata_all = self.__load_all_metadata(processed_chars) + avg_similarity = self.__calculate_avg_similarity(metadata_all) + + canvas = self.__render_header(canvas, len(processed_chars), avg_similarity) + canvas = self.__render_table_headers(canvas) + canvas = self.__render_character_rows(canvas, processed_chars) + canvas = self.__render_footer(canvas) + + cv2.imwrite( + str(output_path), + canvas, + [cv2.IMWRITE_PNG_COMPRESSION, 6], + ) + + return { + 'width': self._dims.total_width(), + 'height': self._dims.total_height(len(processed_chars)), + 'num_chars': len(processed_chars), + 'avg_similarity': avg_similarity, + } + + def __create_canvas(self, processed_chars: List[Path]) -> np.ndarray: + grid_width = self._dims.total_width() + grid_height = self._dims.total_height(len(processed_chars)) + bg_color = (250, 252, 255) + return np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) + + def __render_header( + self, + canvas: np.ndarray, + total_chars: int, + avg_similarity: float, + ) -> np.ndarray: + header_bg_color = (45, 55, 72) + cv2.rectangle( + 
canvas, + (0, 0), + (self._dims.total_width(), self._dims.header_height), + header_bg_color, + -1, + ) + + title_text = 'FACIAL REFERENCE VALIDATION REPORT' + cv2.putText( + canvas, + title_text, + (self._dims.padding * 3, 50), + cv2.FONT_HERSHEY_DUPLEX, + 1.1, + (255, 255, 255), + 2, + cv2.LINE_AA, + ) + + subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' + cv2.putText( + canvas, + subtitle, + (self._dims.padding * 3, 85), + cv2.FONT_HERSHEY_SIMPLEX, + 0.55, + (200, 210, 220), + 1, + cv2.LINE_AA, + ) + + stats_y = 115 + stats_items = [ + f'Total Subjects: {total_chars}', + f'Avg Similarity: {avg_similarity:.4f}', + f'Threshold: {self._similarity_threshold:.2f}', + ] + for idx, stat in enumerate(stats_items): + x_pos = self._dims.padding * 3 + idx * 280 + cv2.putText( + canvas, + stat, + (x_pos, stats_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (180, 200, 220), + 1, + cv2.LINE_AA, + ) + + return canvas + + def __render_table_headers(self, canvas: np.ndarray) -> np.ndarray: + table_header_y = self._dims.header_height + 1 + cv2.line( + canvas, + (0, table_header_y), + (self._dims.total_width(), table_header_y), + (180, 190, 200), + 2, + ) + + col_headers = [ + ('CHARACTER NAME', self._dims.label_col_width // 2, 0), + ('STATISTICS', self._dims.label_col_width + self._dims.stats_col_width // 2, 0), + ( + 'REFERENCE IMAGE 1', + self._dims.label_col_width + self._dims.stats_col_width + self._dims.face_col_width // 2, + 0, + ), + ( + 'REFERENCE IMAGE 2', + self._dims.label_col_width + self._dims.stats_col_width + self._dims.face_col_width * 3 // 2, + 0, + ), + ( + 'REFERENCE IMAGE 3', + self._dims.label_col_width + self._dims.stats_col_width + self._dims.face_col_width * 5 // 2, + 0, + ), + ] + + for text, x_center, _ in col_headers: + text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] + text_x = x_center - text_size[0] // 2 + cv2.putText( + canvas, + text, + (text_x, table_header_y + 25), + cv2.FONT_HERSHEY_SIMPLEX, 
+ 0.42, + (60, 70, 85), + 1, + cv2.LINE_AA, + ) + + cv2.line( + canvas, + (0, table_header_y + self._dims.header_row_height), + (self._dims.total_width(), table_header_y + self._dims.header_row_height), + (200, 210, 220), + 1, + ) + + return canvas + + def __render_character_rows( + self, + canvas: np.ndarray, + processed_chars: List[Path], + ) -> np.ndarray: + y_offset = self._dims.header_height + self._dims.header_row_height + self._dims.padding + bg_color = (250, 252, 255) + + for idx, char_dir in enumerate(processed_chars): + self.__render_character_row(canvas, char_dir, idx, y_offset, bg_color) + y_offset += self._dims.row_height + + return canvas + + def __render_character_row( + self, + canvas: np.ndarray, + char_dir: Path, + row_idx: int, + y_offset: int, + bg_color: Tuple[int, int, int], + ) -> None: + char_name = char_dir.name.replace('_', ' ').title() + row_bg = (245, 248, 252) if row_idx % 2 == 0 else bg_color + + cv2.rectangle( + canvas, + (0, y_offset - self._dims.padding), + (self._dims.total_width(), y_offset + self._dims.face_size + self._dims.padding), + row_bg, + -1, + ) + + cv2.putText( + canvas, + char_name, + (self._dims.padding * 2, y_offset + self._dims.face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, + 0.55, + (30, 40, 50), + 1, + cv2.LINE_AA, + ) + + self.__render_character_stats(canvas, char_dir, y_offset) + self.__render_character_faces(canvas, char_dir, y_offset) + + def __render_character_stats( + self, + canvas: np.ndarray, + char_dir: Path, + y_offset: int, + ) -> None: + metadata_file = char_dir / 'metadata.json' + if not metadata_file.exists(): + return + + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) + + similarity = metadata.get('average_similarity', 0.0) + method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') + faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) + + stats_x = self._dims.label_col_width + self._dims.padding + stats_y_base = 
y_offset + self._dims.face_size // 2 - 30 + + sim_color = (0, 150, 0) if similarity >= self._similarity_threshold else (180, 100, 0) + cv2.putText( + canvas, + f'Similarity: {similarity:.4f}', + (stats_x, stats_y_base), + cv2.FONT_HERSHEY_SIMPLEX, + 0.45, + sim_color, + 1, + cv2.LINE_AA, + ) + + method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) + cv2.putText( + canvas, + f'Method: {method}', + (stats_x, stats_y_base + 25), + cv2.FONT_HERSHEY_SIMPLEX, + 0.42, + method_color, + 1, + cv2.LINE_AA, + ) + + faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' + cv2.putText( + canvas, + f'Detected: {faces_str}', + (stats_x, stats_y_base + 50), + cv2.FONT_HERSHEY_SIMPLEX, + 0.38, + (100, 110, 120), + 1, + cv2.LINE_AA, + ) + + def __render_character_faces( + self, + canvas: np.ndarray, + char_dir: Path, + y_offset: int, + ) -> None: + face_files = sorted(char_dir.glob('face_*.jpg')) + for face_idx, face_file in enumerate(face_files[:self._dims.faces_per_char]): + face_img = cv2.imread(str(face_file)) + if face_img is None: + continue + + face_resized = self._safe_resize(face_img, (self._dims.face_size, self._dims.face_size)) + if face_resized is None: + continue + + x = ( + self._dims.label_col_width + + self._dims.stats_col_width + + face_idx * self._dims.face_col_width + + self._dims.padding + ) + canvas[y_offset:y_offset + self._dims.face_size, x:x + self._dims.face_size] = face_resized + + cv2.rectangle( + canvas, + (x - 1, y_offset - 1), + (x + self._dims.face_size + 1, y_offset + self._dims.face_size + 1), + (180, 190, 200), + 1, + ) + + def __render_footer(self, canvas: np.ndarray) -> np.ndarray: + grid_height = canvas.shape[0] + footer_y = grid_height - self._dims.footer_height + 20 + cv2.line( + canvas, + (0, footer_y - 20), + (self._dims.total_width(), footer_y - 20), + (200, 210, 220), + 1, + ) + + footer_text = ( + f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " + f"Model: 
{settings.face_recognition.model_name} | " + f"Normalized Size: {settings.character.normalized_face_size[0]}x" + f"{settings.character.normalized_face_size[1]}px" + ) + cv2.putText( + canvas, + footer_text, + (self._dims.padding * 3, footer_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + (120, 130, 140), + 1, + cv2.LINE_AA, + ) + + legend_y = footer_y + 30 + legend_items = [ + ('Automatic: Face found on all references', (50, 120, 200)), + ('Manual: User-selected reference', (180, 100, 50)), + ] + for idx, (text, color) in enumerate(legend_items): + x_pos = self._dims.padding * 3 + idx * 380 + cv2.circle(canvas, (x_pos, legend_y - 3), 5, color, -1) + cv2.putText( + canvas, + text, + (x_pos + 15, legend_y), + cv2.FONT_HERSHEY_SIMPLEX, + 0.38, + (100, 110, 120), + 1, + cv2.LINE_AA, + ) + + return canvas + + @staticmethod + def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: + metadata_all = [] + for char_dir in processed_chars: + metadata_file = char_dir / 'metadata.json' + if metadata_file.exists(): + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata_all.append(json.load(f)) + return metadata_all + + @staticmethod + def __calculate_avg_similarity(metadata_all: List[Dict[str, Any]]) -> float: + if not metadata_all: + return 0.0 + return float(np.mean([m.get('average_similarity', 0) for m in metadata_all])) + + @staticmethod + def _safe_resize(img: np.ndarray, target_size: Tuple[int, int]) -> Optional[np.ndarray]: + if img is None or img.size == 0: + return None + if img.shape[0] == 0 or img.shape[1] == 0: + return None + try: + return cv2.resize(img, target_size) + except cv2.error: + return None diff --git a/preprocessor/services/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py index 93d4584cc..d30ee5ec3 100644 --- a/preprocessor/services/scraping/reference_processor.py +++ b/preprocessor/services/scraping/reference_processor.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass from datetime 
import datetime import json import logging @@ -8,7 +7,6 @@ Dict, List, Optional, - Tuple, ) import warnings @@ -27,54 +25,26 @@ OutputSpec, ProcessingItem, ) +from preprocessor.services.scraping.grid_visualizer import CharacterGridVisualizer from preprocessor.services.ui.console import console warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') class CharacterReferenceProcessor(BaseProcessor): - @dataclass - class _GridDimensions: - face_size: int = 280 - faces_per_char: int = 3 - footer_height: int = 80 - header_height: int = 180 - header_row_height: int = 40 - label_col_width: int = 350 - padding: int = 15 - stats_col_width: int = 200 - - @property - def face_col_width(self) -> int: - return self.face_size + self.padding - - @property - def row_height(self) -> int: - return self.face_size + self.padding * 2 - - def total_height(self, num_chars: int) -> int: - return self.header_height + num_chars * self.row_height + self.footer_height - - def total_width(self) -> int: - return ( - self.label_col_width - + self.stats_col_width - + self.faces_per_char * self.face_col_width - + self.padding * 2 - ) - - def __init__(self, args: Dict[str, Any]): + def __init__(self, args: Dict[str, Any]) -> None: super().__init__(args=args, class_name='CharacterReferenceProcessor', error_exit_code=20, loglevel=logging.INFO) self.characters_dir = args['characters_dir'] self.output_dir = args['output_dir'] self.similarity_threshold = args['similarity_threshold'] self.interactive = args['interactive'] self.face_app: Optional[FaceAnalysis] = None + self._visualizer = CharacterGridVisualizer(similarity_threshold=self.similarity_threshold) def generate_validation_grid(self) -> None: output_path = self.output_dir / 'validation_grid.png' if output_path.exists(): - console.print(f'[dim]⊘ Skipping validation grid (already exists): {output_path}[/dim]') + console.print(f'[dim]Skipping validation grid (already exists): 
{output_path}[/dim]') return console.print('\n[blue]Generating validation grid...[/blue]') @@ -88,37 +58,15 @@ def generate_validation_grid(self) -> None: console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') return - dims = self._GridDimensions() - grid_width = dims.total_width() - grid_height = dims.total_height(len(processed_chars)) - bg_color = (250, 252, 255) - grid = np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) - - metadata_all = self.__load_all_metadata(processed_chars) - avg_similarity = ( - np.mean([m.get('average_similarity', 0) for m in metadata_all]) if metadata_all else 0 + stats = self._visualizer.generate_grid( + processed_chars_dir=self.output_dir, + output_path=output_path, ) - self.__render_header(grid, dims, len(processed_chars), avg_similarity, self.similarity_threshold) - self.__render_table_headers(grid, dims) - - y_offset = dims.header_height + dims.header_row_height + dims.padding - for idx, char_dir in enumerate(processed_chars): - self.__render_character_row(grid, dims, char_dir, idx, y_offset, bg_color) - y_offset += dims.row_height - - self.__render_footer(grid, dims, grid_height) - - cv2.imwrite( - str(output_path), - grid, - [cv2.IMWRITE_PNG_COMPRESSION, 6], - ) - - console.print(f'[green]✓ Validation grid saved to: {output_path}[/green]') - console.print(f'[green] Grid size: {grid_width}x{grid_height}px[/green]') - console.print(f'[green] Characters: {len(processed_chars)}[/green]') - console.print(f'[green] Average similarity: {avg_similarity:.4f}[/green]') + console.print(f'[green]Validation grid saved to: {output_path}[/green]') + console.print(f'[green] Grid size: {stats["width"]}x{stats["height"]}px[/green]') + console.print(f'[green] Characters: {stats["num_chars"]}[/green]') + console.print(f'[green] Average similarity: {stats["avg_similarity"]:.4f}[/green]') def get_output_subdir(self) -> str: return 'character_references' @@ -169,7 +117,7 @@ def _process_item(self, item: 
ProcessingItem, _missing_outputs: List[OutputSpec] console.print(f'[yellow]Skipping {char_name}: could not identify common face[/yellow]') return self.__save_processed_references(char_name, selected_faces, reference_images) - console.print(f'[green]✓ Processed {char_name}[/green]') + console.print(f'[green]Processed {char_name}[/green]') def _validate_args(self, args: Dict[str, Any]) -> None: required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] @@ -253,7 +201,8 @@ def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: cv2.imwrite(str(output_path), grid) return output_path - def __create_candidates_grid(self, candidates: List[CandidateFace]) -> np.ndarray: + @staticmethod + def __create_candidates_grid(candidates: List[CandidateFace]) -> np.ndarray: num_refs = len(candidates[0].faces) num_candidates = len(candidates) face_size = 150 @@ -272,7 +221,7 @@ def __create_candidates_grid(self, candidates: List[CandidateFace]) -> np.ndarra y_base = label_height + padding + cand_idx * (face_size + label_height + padding) for face_idx, face_data in enumerate(candidate.faces): x = padding + face_idx * (face_size + padding) - face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) + face_resized = CharacterGridVisualizer._safe_resize(face_data.face_img, (face_size, face_size)) if face_resized is not None: grid[y_base:y_base + face_size, x:x + face_size] = face_resized @@ -281,7 +230,8 @@ def __create_candidates_grid(self, candidates: List[CandidateFace]) -> np.ndarra return grid - def __create_manual_selection_grid(self, faces_data: List[FaceData]) -> np.ndarray: + @staticmethod + def __create_manual_selection_grid(faces_data: List[FaceData]) -> np.ndarray: num_faces = len(faces_data) cols = min(3, num_faces) rows = (num_faces + cols - 1) // cols @@ -296,7 +246,7 @@ def __create_manual_selection_grid(self, faces_data: List[FaceData]) -> np.ndarr col = idx % cols x = padding + col * (face_size + padding) y 
= padding + row * (face_size + padding) - face_resized = self.__safe_resize(face_data.face_img, (face_size, face_size)) + face_resized = CharacterGridVisualizer._safe_resize(face_data.face_img, (face_size, face_size)) if face_resized is not None: grid[y:y + face_size, x:x + face_size] = face_resized @@ -425,248 +375,7 @@ def __find_matching_faces_for_reference( return None return matched_faces - @staticmethod - def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: - metadata_all = [] - for char_dir in processed_chars: - metadata_file = char_dir / 'metadata.json' - if metadata_file.exists(): - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata_all.append(json.load(f)) - return metadata_all - - def __render_character_row( - self, - grid: np.ndarray, - dims: _GridDimensions, - char_dir: Path, - row_idx: int, - y_offset: int, - bg_color: Tuple[int, int, int], - ) -> None: - char_name = char_dir.name.replace('_', ' ').title() - row_bg = (245, 248, 252) if row_idx % 2 == 0 else bg_color - - cv2.rectangle( - grid, - (0, y_offset - dims.padding), - (dims.total_width(), y_offset + dims.face_size + dims.padding), - row_bg, - -1, - ) - - cv2.putText( - grid, - char_name, - (dims.padding * 2, y_offset + dims.face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (30, 40, 50), - 1, - cv2.LINE_AA, - ) - - self.__render_character_stats(grid, dims, char_dir, y_offset) - self.__render_character_faces(grid, dims, char_dir, y_offset) - def __render_character_stats( - self, grid: np.ndarray, dims: _GridDimensions, char_dir: Path, y_offset: int, - ) -> None: - metadata_file = char_dir / 'metadata.json' - if not metadata_file.exists(): - return - - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata = json.load(f) - - similarity = metadata.get('average_similarity', 0.0) - method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') - faces_detected = metadata.get('detection_stats', {}).get('total_faces_detected', []) 
- - stats_x = dims.label_col_width + dims.padding - stats_y_base = y_offset + dims.face_size // 2 - 30 - - sim_color = (0, 150, 0) if similarity >= self.similarity_threshold else (180, 100, 0) - cv2.putText( - grid, f'Similarity: {similarity:.4f}', (stats_x, stats_y_base), - cv2.FONT_HERSHEY_SIMPLEX, 0.45, sim_color, 1, cv2.LINE_AA, - ) - - method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) - cv2.putText( - grid, f'Method: {method}', (stats_x, stats_y_base + 25), - cv2.FONT_HERSHEY_SIMPLEX, 0.42, method_color, 1, cv2.LINE_AA, - ) - - faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' - cv2.putText( - grid, f'Detected: {faces_str}', (stats_x, stats_y_base + 50), - cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, cv2.LINE_AA, - ) - - def __render_character_faces( - self, grid: np.ndarray, dims: _GridDimensions, char_dir: Path, y_offset: int, - ) -> None: - face_files = sorted(char_dir.glob('face_*.jpg')) - for face_idx, face_file in enumerate(face_files[:dims.faces_per_char]): - face_img = cv2.imread(str(face_file)) - if face_img is None: - continue - - face_resized = self.__safe_resize(face_img, (dims.face_size, dims.face_size)) - if face_resized is None: - continue - - x = dims.label_col_width + dims.stats_col_width + face_idx * dims.face_col_width + dims.padding - grid[y_offset:y_offset + dims.face_size, x:x + dims.face_size] = face_resized - - cv2.rectangle( - grid, (x - 1, y_offset - 1), - (x + dims.face_size + 1, y_offset + dims.face_size + 1), - (180, 190, 200), 1, - ) - - @staticmethod - def __render_footer(grid: np.ndarray, dims: _GridDimensions, grid_height: int) -> None: - footer_y = grid_height - dims.footer_height + 20 - cv2.line(grid, (0, footer_y - 20), (dims.total_width(), footer_y - 20), (200, 210, 220), 1) - - footer_text = ( - f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " - f"Model: {settings.face_recognition.model_name} | " - f"Normalized Size: 
{settings.character.normalized_face_size[0]}x" - f"{settings.character.normalized_face_size[1]}px" - ) - cv2.putText( - grid, - footer_text, - (dims.padding * 3, footer_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (120, 130, 140), - 1, - cv2.LINE_AA, - ) - - legend_y = footer_y + 30 - legend_items = [ - ('Automatic: Face found on all references', (50, 120, 200)), - ('Manual: User-selected reference', (180, 100, 50)), - ] - for idx, (text, color) in enumerate(legend_items): - x_pos = dims.padding * 3 + idx * 380 - cv2.circle(grid, (x_pos, legend_y - 3), 5, color, -1) - cv2.putText( - grid, - text, - (x_pos + 15, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, - ) - - @staticmethod - def __render_header( - grid: np.ndarray, - dims: _GridDimensions, - total_chars: int, - avg_similarity: float, - threshold: float, - ) -> None: - header_bg_color = (45, 55, 72) - cv2.rectangle(grid, (0, 0), (dims.total_width(), dims.header_height), header_bg_color, -1) - - title_text = 'FACIAL REFERENCE VALIDATION REPORT' - cv2.putText( - grid, - title_text, - (dims.padding * 3, 50), - cv2.FONT_HERSHEY_DUPLEX, - 1.1, - (255, 255, 255), - 2, - cv2.LINE_AA, - ) - - subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' - cv2.putText( - grid, - subtitle, - (dims.padding * 3, 85), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (200, 210, 220), - 1, - cv2.LINE_AA, - ) - - stats_y = 115 - stats_items = [ - f'Total Subjects: {total_chars}', - f'Avg Similarity: {avg_similarity:.4f}', - f'Threshold: {threshold:.2f}', - ] - for idx, stat in enumerate(stats_items): - x_pos = dims.padding * 3 + idx * 280 - cv2.putText( - grid, - stat, - (x_pos, stats_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (180, 200, 220), - 1, - cv2.LINE_AA, - ) - - @staticmethod - def __render_table_headers(grid: np.ndarray, dims: _GridDimensions) -> None: - table_header_y = dims.header_height + 1 - cv2.line(grid, (0, table_header_y), (dims.total_width(), table_header_y), 
(180, 190, 200), 2) - - col_headers = [ - ('CHARACTER NAME', dims.label_col_width // 2, 0), - ('STATISTICS', dims.label_col_width + dims.stats_col_width // 2, 0), - ('REFERENCE IMAGE 1', dims.label_col_width + dims.stats_col_width + dims.face_col_width // 2, 0), - ('REFERENCE IMAGE 2', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 3 // 2, 0), - ('REFERENCE IMAGE 3', dims.label_col_width + dims.stats_col_width + dims.face_col_width * 5 // 2, 0), - ] - - for text, x_center, _ in col_headers: - text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] - text_x = x_center - text_size[0] // 2 - cv2.putText( - grid, - text, - (text_x, table_header_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.42, - (60, 70, 85), - 1, - cv2.LINE_AA, - ) - - cv2.line( - grid, - (0, table_header_y + dims.header_row_height), - (dims.total_width(), table_header_y + dims.header_row_height), - (200, 210, 220), - 1, - ) - - @staticmethod - def __safe_resize(img: np.ndarray, target_size: tuple) -> Optional[np.ndarray]: - if img is None or img.size == 0: - return None - if img.shape[0] == 0 or img.shape[1] == 0: - return None - try: - return cv2.resize(img, target_size) - except cv2.error as e: - logging.error(f'OpenCV resize error: {e}') - return None def __save_processed_references( self, @@ -679,7 +388,10 @@ def __save_processed_references( face_vectors = [] for idx, face_data in enumerate(selected_faces): - face_normalized = self.__safe_resize(face_data.face_img, settings.character.normalized_face_size) + face_normalized = CharacterGridVisualizer._safe_resize( + face_data.face_img, + settings.character.normalized_face_size, + ) if face_normalized is None: self.logger.warning(f'Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)') continue diff --git a/preprocessor/services/search/embedding_model.py b/preprocessor/services/search/embedding_model.py index 64359cef0..2d0c9bb3f 100644 --- a/preprocessor/services/search/embedding_model.py +++ 
b/preprocessor/services/search/embedding_model.py @@ -18,6 +18,3 @@ def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[Li if isinstance(text, str): return self._service.get_text_embedding(text) return [self._service.get_text_embedding(t) for t in text] - - def __encode_image(self, image_path: str) -> List[float]: # pylint: disable=unused-private-member - return self._service.get_image_embedding(image_path) diff --git a/preprocessor/services/text/text_statistics.py b/preprocessor/services/text/text_statistics.py index 0731f3e0d..ba2b86f78 100644 --- a/preprocessor/services/text/text_statistics.py +++ b/preprocessor/services/text/text_statistics.py @@ -85,7 +85,7 @@ def to_dict(self) -> Dict[str, Any]: 'trigrams': self.trigrams, } - def __calculate(self) -> None: # pylint: disable=unused-private-member + def __calculate(self) -> None: # pylint: disable=unused-private-member self.__calculate_basic_stats() self.__calculate_character_stats() self.__calculate_word_stats() @@ -146,12 +146,6 @@ def __calculate_word_stats(self) -> None: self.avg_word_length = round(sum(word_lengths) / len(word_lengths), 2) if word_lengths else 0.0 self.word_frequency = [{'word': word, 'count': count} for word, count in word_counter.most_common(50)] - @classmethod - def __from_text(cls, text: str, language: str='pl') -> 'TextStatistics': # pylint: disable=unused-private-member - stats = cls(text=text, language=language) - stats.__calculate() - return stats - def __get_config(self) -> LanguageConfig: return POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG diff --git a/preprocessor/services/transcription/engines/whisper_engine.py b/preprocessor/services/transcription/engines/whisper_engine.py index 45ba55702..839ba4ccb 100644 --- a/preprocessor/services/transcription/engines/whisper_engine.py +++ b/preprocessor/services/transcription/engines/whisper_engine.py @@ -24,7 +24,7 @@ def __init__(self, model: str='large-v3-turbo', language: str='Polish', device: 
compute_type = 'float16' console.print(f'[cyan]Loading Whisper model: {model} on {device} with compute_type={compute_type}[/cyan]') self.model = WhisperModel(model, device=device, compute_type=compute_type) - console.print('[green]✓ Whisper model loaded[/green]') + console.print('[green]Whisper model loaded[/green]') def cleanup(self) -> None: console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') @@ -33,7 +33,7 @@ def cleanup(self) -> None: gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') + console.print('[green]Whisper model unloaded, GPU memory cleared[/green]') def get_name(self) -> str: return f'Whisper-{self.model_name}' @@ -45,5 +45,5 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: language_code = WhisperUtils.get_language_code(self.language) segments, info = self.model.transcribe(str(audio_path), language=language_code, beam_size=10, word_timestamps=True, condition_on_previous_text=False) result = WhisperUtils.build_transcription_result(segments, language=info.language) - console.print(f'[green]✓ Transcription completed: {audio_path.name}[/green]') + console.print(f'[green]Transcription completed: {audio_path.name}[/green]') return result diff --git a/preprocessor/services/transcription/generators/base_generator.py b/preprocessor/services/transcription/generators/base_generator.py index 5b04797d6..6d14a9994 100644 --- a/preprocessor/services/transcription/generators/base_generator.py +++ b/preprocessor/services/transcription/generators/base_generator.py @@ -14,7 +14,7 @@ class BaseTranscriptionGenerator(ABC): - def __init__(self, input_dir: Path, output_dir: Path, logger: ErrorHandlingLogger): + def __init__(self, input_dir: Path, output_dir: Path, logger: ErrorHandlingLogger) -> None: self.input_dir = input_dir self.output_dir = output_dir self.logger = logger diff --git 
a/preprocessor/services/transcription/generators/multi_format_generator.py b/preprocessor/services/transcription/generators/multi_format_generator.py index eaf58b1b4..5f4815f02 100644 --- a/preprocessor/services/transcription/generators/multi_format_generator.py +++ b/preprocessor/services/transcription/generators/multi_format_generator.py @@ -19,7 +19,7 @@ class MultiFormatGenerator: - def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_base_path: Path, logger: ErrorHandlingLogger, series_name: str=''): + def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_base_path: Path, logger: ErrorHandlingLogger, series_name: str='') -> None: self.jsons_dir = jsons_dir self.output_base_path = output_base_path self.logger = logger diff --git a/preprocessor/services/transcription/processors/audio_normalizer.py b/preprocessor/services/transcription/processors/audio_normalizer.py index 2e0c586e0..0964abc65 100644 --- a/preprocessor/services/transcription/processors/audio_normalizer.py +++ b/preprocessor/services/transcription/processors/audio_normalizer.py @@ -13,7 +13,7 @@ class AudioNormalizer: SUPPORTED_VIDEO_EXTENSIONS = BaseProcessor.SUPPORTED_VIDEO_EXTENSIONS - def __init__(self, input_videos: Path, output_dir: Path, logger: ErrorHandlingLogger, video_files: Optional[List[Path]]=None): + def __init__(self, input_videos: Path, output_dir: Path, logger: ErrorHandlingLogger, video_files: Optional[List[Path]]=None) -> None: self.__input_videos: Path = input_videos self.__output_dir: Path = output_dir self.__logger: ErrorHandlingLogger = logger diff --git a/preprocessor/services/transcription/processors/episode_info_processor.py b/preprocessor/services/transcription/processors/episode_info_processor.py index b4304d38a..d0e0f1730 100644 --- a/preprocessor/services/transcription/processors/episode_info_processor.py +++ b/preprocessor/services/transcription/processors/episode_info_processor.py @@ -12,7 +12,7 @@ class EpisodeInfoProcessor: - def 
__init__(self, jsons_dir: Path, episodes_info_json: Path, output_path: Path, logger: ErrorHandlingLogger, series_name: str=''): + def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_path: Path, logger: ErrorHandlingLogger, series_name: str='') -> None: self.__jsons_dir: Path = jsons_dir self.__output_path: Path = output_path self.__logger: ErrorHandlingLogger = logger diff --git a/preprocessor/services/transcription/utils.py b/preprocessor/services/transcription/utils.py index fec35909d..9d753c16e 100644 --- a/preprocessor/services/transcription/utils.py +++ b/preprocessor/services/transcription/utils.py @@ -38,7 +38,7 @@ def fix_transcription_file_unicode(file_path: Path) -> bool: return False @staticmethod - def __fix_unicode(file_path: Path) -> None: # pylint: disable=unused-private-member + def __fix_unicode(file_path: Path) -> None: # pylint: disable=unused-private-member if not file_path.exists(): return with open(file_path, 'r', encoding='utf-8') as f: diff --git a/preprocessor/services/transcription/whisper.py b/preprocessor/services/transcription/whisper.py index f7d65c9d8..1450ead68 100644 --- a/preprocessor/services/transcription/whisper.py +++ b/preprocessor/services/transcription/whisper.py @@ -29,7 +29,7 @@ def cleanup(self) -> None: self._model = None if torch.cuda.is_available(): torch.cuda.empty_cache() - console.print('[green]✓ Whisper model unloaded, GPU memory cleared[/green]') + console.print('[green]Whisper model unloaded, GPU memory cleared[/green]') def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') @@ -46,7 +46,7 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: temperature=self.temperature, ) result = WhisperUtils.build_transcription_result(segments, language=info.language) - console.print(f'[green]✓ Transcription completed: {audio_path.name}[/green]') + console.print(f'[green]Transcription completed: {audio_path.name}[/green]') 
return result def _load_model(self) -> WhisperModel: @@ -57,5 +57,5 @@ def _load_model(self) -> WhisperModel: compute_type = 'float16' console.print(f'[cyan]Loading Whisper model: {self.model_name} on {self.device} with compute_type={compute_type}[/cyan]') self._model = WhisperModel(self.model_name, device=self.device, compute_type=compute_type) - console.print('[green]✓ Whisper model loaded[/green]') + console.print('[green]Whisper model loaded[/green]') return self._model diff --git a/preprocessor/services/ui/console.py b/preprocessor/services/ui/console.py index 3e5a7c58d..926ab547f 100644 --- a/preprocessor/services/ui/console.py +++ b/preprocessor/services/ui/console.py @@ -25,12 +25,12 @@ def __get_console() -> Console: class SimpleProgress: - def __init__(self): + def __init__(self) -> None: self.tasks = {} self.task_counter = 0 self.console = console - def add_task(self, description: str, total: int): + def add_task(self, description: str, total: int) -> int: task_id = self.task_counter self.task_counter += 1 self.tasks[task_id] = { @@ -43,7 +43,7 @@ def add_task(self, description: str, total: int): self.__print_progress(task_id) return task_id - def advance(self, task_id: int, advance: int=1): + def advance(self, task_id: int, advance: int=1) -> None: if task_id not in self.tasks: return task = self.tasks[task_id] @@ -53,13 +53,13 @@ def advance(self, task_id: int, advance: int=1): self.__print_progress(task_id) task['last_print'] = current_time - def __enter__(self): + def __enter__(self) -> 'SimpleProgress': return self - def __exit__(self, exc_type, exc_val, exc_tb): + def __exit__(self, exc_type, exc_val, exc_tb) -> None: pass - def __print_progress(self, task_id: int): + def __print_progress(self, task_id: int) -> None: task = self.tasks[task_id] completed = task['completed'] total = task['total'] @@ -75,9 +75,9 @@ def __print_progress(self, task_id: int): bar_width = 30 filled = int(bar_width * completed / total) if total > 0 else 0 if filled < 
bar_width: - progress_bar = '━' * filled + '╸' + '─' * (bar_width - filled - 1) + progress_bar = '=' * filled + '>' + '-' * (bar_width - filled - 1) else: - progress_bar = '━' * bar_width + progress_bar = '=' * bar_width console.print( f"[bold blue]{task['description']}[/bold blue] " diff --git a/preprocessor/services/ui/progress.py b/preprocessor/services/ui/progress.py index 380ee82b7..aad14ee39 100644 --- a/preprocessor/services/ui/progress.py +++ b/preprocessor/services/ui/progress.py @@ -1,4 +1,3 @@ -from contextlib import contextmanager import time from typing import Optional @@ -8,40 +7,27 @@ class ProgressTracker: - def __init__(self): + def __init__(self) -> None: self.current_operation: Optional[str] = None self.start_time: Optional[float] = None - @contextmanager - def __track_operation(self, operation_name: str, total: int): # pylint: disable=unused-private-member - self.current_operation = operation_name - self.start_time = time.time() - console.print(f' [cyan]{operation_name} (total: {total})...[/cyan]') - tracker = OperationTracker(operation_name=operation_name, total=total, start_time=self.start_time) - try: - yield tracker - finally: - if tracker.completed > 0: - elapsed = time.time() - self.start_time - console.print(f' [green]✓ {operation_name} completed: {tracker.completed}/{total} in {TimeFormatter.format_human(elapsed)}[/green]') - class OperationTracker: - def __init__(self, operation_name: str, total: int, start_time: float): + def __init__(self, operation_name: str, total: int, start_time: float) -> None: self.operation_name = operation_name self.total = total self.completed = 0 self.start_time = start_time self.last_report = 0 - def update(self, completed: int, interval: int=10): + def update(self, completed: int, interval: int=10) -> None: self.completed = completed should_report = completed % interval == 0 or completed == self.total or completed == 1 if should_report and completed != self.last_report: self.__report_progress() 
self.last_report = completed - def __report_progress(self): + def __report_progress(self) -> None: elapsed = time.time() - self.start_time percent = self.completed / self.total * 100 if self.total > 0 else 0 if 0 < self.completed < self.total: diff --git a/preprocessor/services/validation/episode_stats.py b/preprocessor/services/validation/episode_stats.py index 1f0329c0b..4e8400642 100644 --- a/preprocessor/services/validation/episode_stats.py +++ b/preprocessor/services/validation/episode_stats.py @@ -2,31 +2,58 @@ dataclass, field, ) -import json -from pathlib import Path from typing import ( - Any, - Dict, List, Optional, Tuple, + TypedDict, ) -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) -from preprocessor.config.constants import ( - DEFAULT_VIDEO_EXTENSION, - OUTPUT_FILE_NAMES, - OUTPUT_FILE_PATTERNS, -) from preprocessor.services.episodes import EpisodeInfo -from preprocessor.services.io.path_manager import PathManager from preprocessor.services.validation.base_result import ValidationStatusMixin -from preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators import ( + CharacterValidator, + ElasticValidator, + FaceClusterValidator, + FrameValidator, + ImageHashValidator, + ObjectValidator, + SceneValidator, + TranscriptionValidator, + VideoValidator, +) +from preprocessor.services.validation.validators.base_validator import BaseValidator + + +class EpisodeStatsData(TypedDict, total=False): + """Type-safe dict for episode statistics data.""" + transcription_chars: Optional[int] + transcription_duration: Optional[float] + transcription_words: Optional[int] + exported_frames_count: Optional[int] + exported_frames_total_size_mb: Optional[float] + exported_frames_avg_resolution: Optional[Tuple[int, int]] + video_size_mb: Optional[float] + video_duration: Optional[float] + video_codec: Optional[str] + video_resolution: Optional[Tuple[int, int]] + scenes_count: 
Optional[int] + scenes_avg_duration: Optional[float] + image_hashes_count: Optional[int] + character_visualizations_count: Optional[int] + face_clusters_count: Optional[int] + face_clusters_total_faces: Optional[int] + object_detections_count: Optional[int] + object_visualizations_count: Optional[int] + + +class EpisodeStatsDict(TypedDict): + """Type-safe dict representation of EpisodeStats.""" + status: str + errors: List[str] + warnings: List[str] + stats: EpisodeStatsData -ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs @dataclass class EpisodeStats(ValidationStatusMixin): # pylint: disable=too-many-instance-attributes @@ -53,19 +80,24 @@ class EpisodeStats(ValidationStatusMixin): # pylint: disable=too-many-instance- video_size_mb: Optional[float] = None warnings: List[str] = field(default_factory=list) - def collect_stats(self): - self.__validate_transcription() - self.__validate_exported_frames() - self.__validate_video() - self.__validate_scenes() - self.__validate_image_hashes() - self.__validate_character_visualizations() - self.__validate_face_clusters() - self.__validate_object_detections() - self.__validate_object_visualizations() - self.__validate_other_files() + def __post_init__(self) -> None: + self._validators: List[BaseValidator] = [ + TranscriptionValidator(), + FrameValidator(), + VideoValidator(), + SceneValidator(), + ImageHashValidator(), + CharacterValidator(), + FaceClusterValidator(), + ObjectValidator(), + ElasticValidator(), + ] + + def collect_stats(self) -> None: + for validator in self._validators: + validator.validate(self) - def to_dict(self) -> Dict[str, Any]: + def to_dict(self) -> EpisodeStatsDict: return { 'status': self.status, 'errors': self.errors, @@ -91,381 +123,3 @@ def to_dict(self) -> Dict[str, Any]: 'object_visualizations_count': self.object_visualizations_count, }, } - - def __check_size_anomalies( - self, sizes: List[int], folder_name: str, threshold: float = 0.2, - ): - if len(sizes) < 2: - return - 
avg_size = sum(sizes) / len(sizes) - if avg_size == 0: - return - for i, size in enumerate(sizes): - deviation = abs(size - avg_size) / avg_size - if deviation > threshold: - warning_msg = ( - f'{folder_name} file #{i + 1} size deviation: ' - f'{deviation * 100:.1f}% from average' - ) - self.warnings.append(warning_msg) - - def __extract_transcription_stats(self, raw_transcription: Path): - data = self.__load_json_safely(raw_transcription) - if not data: - self.errors.append(f'Error reading transcription: {raw_transcription}') - return - text = data.get('text', '') - if not text: - segments = data.get('segments', []) - if segments: - text = ' '.join((seg.get('text', '') for seg in segments)) - self.transcription_chars = len(text) - self.transcription_words = len(text.split()) - words = data.get('words', []) - if words: - self.transcription_duration = words[-1].get('end', 0.0) - else: - segments = data.get('segments', []) - if segments and segments[-1].get('end'): - self.transcription_duration = segments[-1].get('end', 0.0) - - @staticmethod - def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: - try: - with open(file_path, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return None - - def __validate_character_visualizations(self): - self.__validate_visualizations(settings.output_subdirs.character_visualizations, 'character_visualizations_count', 'character visualization') - - def __validate_clean_transcription(self, clean_transcription_file): - if not clean_transcription_file.exists(): - self.warnings.append(f'Missing clean transcription file: {clean_transcription_file.name}') - return - result = FileValidator.validate_json_file(clean_transcription_file) - if not result.is_valid: - self.warnings.append(f'Invalid clean transcription JSON: {result.error_message}') - - def __validate_clean_txt(self, clean_txt_file): - if not clean_txt_file.exists(): - self.warnings.append(f'Missing clean transcription txt: 
{clean_txt_file.name}') - - def __validate_embedding_dimensions(self, jsonl_file, subdir: str): - embedding_fields = { - ELASTIC_SUBDIRS.text_embeddings: 'text_embedding', - ELASTIC_SUBDIRS.video_frames: 'video_embedding', - ELASTIC_SUBDIRS.episode_names: 'title_embedding', - ELASTIC_SUBDIRS.full_episode_embeddings: 'full_episode_embedding', - ELASTIC_SUBDIRS.sound_event_embeddings: 'sound_event_embedding', - } - if subdir not in embedding_fields: - return - embedding_field = embedding_fields[subdir] - expected_dim = settings.embedding_model.embedding_dim - try: - with open(jsonl_file, 'r', encoding='utf-8') as f: - for line_num, line in enumerate(f, 1): - if not line.strip(): - continue - doc = json.loads(line) - if embedding_field in doc: - embedding = doc[embedding_field] - if isinstance(embedding, list): - actual_dim = len(embedding) - if actual_dim != expected_dim: - error_msg = ( - f'{jsonl_file.name} line {line_num}: ' - f'{embedding_field} has {actual_dim} dimensions, ' - f'expected {expected_dim}' - ) - self.errors.append(error_msg) - return - except Exception as e: - self.errors.append(f'Error validating embeddings in {jsonl_file.name}: {e}') - - def __validate_exported_frames(self): - frames_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.frames) - if not frames_dir.exists(): - self.warnings.append(f'Missing {settings.output_subdirs.frames} directory: {frames_dir}') - return - frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS['frame'])) - if not frame_files: - self.warnings.append(f'No frames found in {settings.output_subdirs.frames}/') - return - self.exported_frames_count = len(frame_files) - total_size = 0 - resolutions = [] - invalid_count = 0 - for frame_file in frame_files: - result = FileValidator.validate_image_file(frame_file) - if result.is_valid: - total_size += result.metadata['size_mb'] - resolutions.append((result.metadata['width'], result.metadata['height'])) - else: - invalid_count += 
1 - self.errors.append(f'Invalid frame {frame_file.name}: {result.error_message}') - if invalid_count > 0: - self.warnings.append(f'{invalid_count} invalid frames found') - self.exported_frames_total_size_mb = round(total_size, 2) - if resolutions: - most_common_res = max(set(resolutions), key=resolutions.count) - self.exported_frames_avg_resolution = most_common_res - - def __validate_face_clusters(self): - clusters_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.face_clusters) - if not clusters_dir.exists(): - return - metadata_files = list(clusters_dir.glob('*_face_clusters.json')) - metadata_file = metadata_files[0] if metadata_files else None - if not metadata_file or not metadata_file.exists(): - self.warnings.append('Missing face clustering metadata file') - return - result = FileValidator.validate_json_file(metadata_file) - if not result.is_valid: - self.errors.append(f'Invalid face clustering metadata: {result.error_message}') - return - data = self.__load_json_safely(metadata_file) - if not data: - self.errors.append(f'Error reading face clustering metadata: {metadata_file}') - return - clusters = data.get('clusters', {}) - if isinstance(clusters, dict): - self.face_clusters_count = len(clusters) - total_faces = sum((cluster_info.get('face_count', 0) for cluster_info in clusters.values())) - elif isinstance(clusters, list): - self.face_clusters_count = len(clusters) - total_faces = sum((cluster_info.get('face_count', 0) for cluster_info in clusters)) - else: - self.warnings.append('Unexpected clusters format in face clustering metadata') - return - noise_info = data.get('noise', {}) - if noise_info: - total_faces += noise_info.get('face_count', 0) - self.face_clusters_total_faces = total_faces - - def __validate_image_hashes(self): - self.__validate_json_directory(settings.output_subdirs.image_hashes, 'image_hashes_count', 'image_hashes') - - @staticmethod - def __validate_images_in_directory( - directory: 
Path, - extensions: Tuple[str, ...] = ('*.jpg', '*.png'), - ) -> Tuple[int, int, List[str]]: - if not directory.exists(): - return 0, 0, [] - image_files = [] - for ext in extensions: - image_files.extend(directory.glob(ext)) - if not image_files: - return 0, 0, [] - invalid_count = 0 - errors = [] - for img_file in image_files: - result = FileValidator.validate_image_file(img_file) - if not result.is_valid: - invalid_count += 1 - errors.append(f'Invalid image {img_file.name}: {result.error_message}') - return len(image_files), invalid_count, errors - - def __validate_json_directory( - self, - subdir: str, - count_attr: Optional[str], - context_name: str, - exclude_pattern: Optional[str] = None, - check_anomalies: bool = True, - ): - dir_path = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) - count, sizes, errors = self.__validate_json_files_in_directory(dir_path, exclude_pattern) - if not dir_path.exists(): - self.warnings.append(f'Missing {subdir} directory') - return - if count == 0: - self.warnings.append(f'No JSON files in {subdir}/') - return - if count_attr: - setattr(self, count_attr, count) - self.errors.extend(errors) - if check_anomalies: - self.__check_size_anomalies(sizes, context_name) - - @staticmethod - def __validate_json_files_in_directory( - directory: Path, exclude_pattern: Optional[str] = None, - ) -> Tuple[int, List[int], List[str]]: - if not directory.exists(): - return 0, [], [] - json_files = [ - f for f in directory.glob('*.json') - if not exclude_pattern or exclude_pattern not in str(f) - ] - if not json_files: - return 0, [], [] - sizes = [] - errors = [] - for json_file in json_files: - result = FileValidator.validate_json_file(json_file) - if not result.is_valid: - errors.append(f'Invalid JSON {json_file.name}: {result.error_message}') - else: - sizes.append(json_file.stat().st_size) - return len(json_files), sizes, errors - - def __validate_object_detections(self): - self.__validate_json_directory( - 
settings.output_subdirs.object_detections, - 'object_detections_count', - 'object_detections', - exclude_pattern='visualizations', - ) - - def __validate_object_visualizations(self): - self.__validate_visualizations(settings.output_subdirs.object_visualizations, 'object_visualizations_count', 'visualization') - - def __validate_other_files(self): - char_detections_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.character_detections) - detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] - if detections_file.exists(): - result = FileValidator.validate_json_file(detections_file) - if not result.is_valid: - self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") - embeddings_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.embeddings) - if embeddings_dir.exists(): - embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] - if embeddings_file.exists(): - result = FileValidator.validate_json_file(embeddings_file) - if not result.is_valid: - self.errors.append(f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}: {result.error_message}") - elastic_subdirs = [ - ELASTIC_SUBDIRS.text_segments, - ELASTIC_SUBDIRS.text_embeddings, - ELASTIC_SUBDIRS.video_frames, - ELASTIC_SUBDIRS.episode_names, - ELASTIC_SUBDIRS.text_statistics, - ELASTIC_SUBDIRS.full_episode_embeddings, - ELASTIC_SUBDIRS.sound_events, - ELASTIC_SUBDIRS.sound_event_embeddings, - ] - found_elastic_docs = False - for subdir in elastic_subdirs: - elastic_base = settings.output_subdirs.elastic_documents - elastic_docs_dir = PathManager(self.series_name).get_episode_dir( - self.episode_info, f'{elastic_base}/{subdir}', - ) - if elastic_docs_dir.exists(): - found_elastic_docs = True - for jsonl_file in elastic_docs_dir.glob('*.jsonl'): - result = FileValidator.validate_jsonl_file(jsonl_file) - if not result.is_valid: - self.errors.append(f'Invalid JSONL 
{jsonl_file.name}: {result.error_message}') - else: - self.__validate_embedding_dimensions(jsonl_file, subdir) - if not found_elastic_docs: - self.warnings.append(f'Missing {settings.output_subdirs.elastic_documents} directory') - transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.transcriptions) - if transcriptions_dir.exists(): - clean_subdir = settings.output_subdirs.transcription_subdirs.clean - clean_dir = transcriptions_dir / clean_subdir - filename = f'{self.series_name}_{self.episode_info.episode_code()}_text_stats.json' - text_stats_file = clean_dir / filename - if text_stats_file.exists(): - result = FileValidator.validate_json_file(text_stats_file) - if not result.is_valid: - self.errors.append(f'Invalid text_stats JSON: {result.error_message}') - else: - self.warnings.append(f'Missing text statistics file: {text_stats_file.name}') - - def __validate_raw_transcription(self, transcription_files: Dict[str, Path]): - raw_transcription = None - for key in ('main', 'segmented', 'simple'): - if transcription_files[key].exists(): - raw_transcription = transcription_files[key] - break - if not raw_transcription: - self.warnings.append('Missing raw transcription file (checked: .json, _segmented.json, _simple.json)') - return - result = FileValidator.validate_json_file(raw_transcription) - if not result.is_valid: - self.errors.append(f'Invalid transcription JSON: {result.error_message}') - return - self.__extract_transcription_stats(raw_transcription) - - def __validate_scenes(self): - scenes_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.scenes) - scenes_file = scenes_dir / f"{self.series_name}_{self.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" - if not scenes_file.exists(): - self.errors.append(f'Missing scenes file: {scenes_file}') - return - result = FileValidator.validate_json_file(scenes_file) - if not result.is_valid: - 
self.errors.append(f'Invalid scenes JSON: {result.error_message}') - return - data = self.__load_json_safely(scenes_file) - if not data: - self.errors.append(f'Error reading scenes: {scenes_file}') - return - self.scenes_count = data.get('total_scenes', 0) - scenes = data.get('scenes', []) - if scenes: - durations = [scene.get('duration', 0) for scene in scenes] - self.scenes_avg_duration = round(sum(durations) / len(durations), 2) - - def __validate_sound_events(self, sound_events_file): - if not sound_events_file.exists(): - self.warnings.append(f'Missing sound events file: {sound_events_file.name}') - return - result = FileValidator.validate_json_file(sound_events_file) - if not result.is_valid: - self.warnings.append(f'Invalid sound events JSON: {result.error_message}') - - def __validate_transcription(self): - transcriptions_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, settings.output_subdirs.transcriptions) - base_name = f'{self.series_name}_{self.episode_info.episode_code()}' - raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events - transcription_files = { - 'main': raw_dir / f'{base_name}.json', - 'segmented': raw_dir / f'{base_name}_segmented.json', - 'simple': raw_dir / f'{base_name}_simple.json', - 'clean': clean_dir / f'{base_name}_clean_transcription.json', - 'clean_txt': clean_dir / f'{base_name}_clean_transcription.txt', - 'sound_events': sound_events_dir / f'{base_name}_sound_events.json', - } - if not any((f.exists() for f in transcription_files.values())): - self.errors.append('No transcription files found in any format') - return - self.__validate_raw_transcription(transcription_files) - self.__validate_clean_transcription(transcription_files['clean']) - 
self.__validate_clean_txt(transcription_files['clean_txt']) - self.__validate_sound_events(transcription_files['sound_events']) - - def __validate_video(self): - filename = f'{self.series_name.lower()}_{self.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' - season_dir = get_base_output_dir(self.series_name) / settings.output_subdirs.video / self.episode_info.season_code() - video_file = season_dir / filename - if not video_file.exists(): - self.warnings.append(f'Missing video file: {video_file}') - return - result = FileValidator.validate_video_file(video_file) - if not result.is_valid: - self.errors.append(f'Invalid video: {result.error_message}') - return - self.video_size_mb = result.metadata['size_mb'] - self.video_duration = result.metadata['duration'] - self.video_codec = result.metadata['codec'] - self.video_resolution = (result.metadata['width'], result.metadata['height']) - - def __validate_visualizations(self, subdir: str, count_attr: str, context_name: str): - viz_dir = PathManager(self.series_name).get_episode_dir(self.episode_info, subdir) - total_count, invalid_count, errors = self.__validate_images_in_directory(viz_dir) - if total_count == 0 and viz_dir.exists(): - self.warnings.append(f'No visualization images in {subdir}/') - return - if total_count > 0: - setattr(self, count_attr, total_count) - self.errors.extend(errors) - if invalid_count > 0: - self.warnings.append(f'{invalid_count} invalid {context_name} images found') diff --git a/preprocessor/services/validation/file_validators.py b/preprocessor/services/validation/file_validators.py index 75a9ea208..8058328f2 100644 --- a/preprocessor/services/validation/file_validators.py +++ b/preprocessor/services/validation/file_validators.py @@ -141,7 +141,7 @@ def __check_file_exists(path: Path) -> Optional[ValidationResult]: return None @staticmethod - def __validate_archive_file(path: Path) -> ValidationResult: # pylint: disable=unused-private-member + def __validate_archive_file(path: Path) 
-> ValidationResult: # pylint: disable=unused-private-member if error := FileValidator.__check_file_exists(path): return error try: diff --git a/preprocessor/services/validation/global_validator.py b/preprocessor/services/validation/global_validator.py index 4850e390e..f0a434aef 100644 --- a/preprocessor/services/validation/global_validator.py +++ b/preprocessor/services/validation/global_validator.py @@ -10,7 +10,7 @@ class GlobalValidationResult(BaseValidationResult): class GlobalValidator: - def __init__(self, series_name: str, base_output_dir: Path): + def __init__(self, series_name: str, base_output_dir: Path) -> None: self.series_name = series_name self.base_output_dir = base_output_dir self.result = GlobalValidationResult() @@ -29,7 +29,7 @@ def __get_character_images(char_folder: Path) -> List[Path]: image_files.extend(char_folder.glob(ext)) return image_files - def __validate_characters_folder(self): + def __validate_characters_folder(self) -> None: characters_dir = self.base_output_dir / 'characters' if not characters_dir.exists(): self.result.warnings.append('Missing characters/ directory') @@ -58,7 +58,7 @@ def __validate_characters_folder(self): if characters_without_images: self.result.warnings.append(f'{len(characters_without_images)} characters without reference images') - def __validate_json_file(self, file_path: Path, stats_key: str): + def __validate_json_file(self, file_path: Path, stats_key: str) -> None: if file_path.exists(): result = FileValidator.validate_json_file(file_path) if not result.is_valid: @@ -68,13 +68,13 @@ def __validate_json_file(self, file_path: Path, stats_key: str): else: self.result.warnings.append(f'Missing {file_path.name}') - def __validate_main_json_files(self): + def __validate_main_json_files(self) -> None: episodes_file = self.base_output_dir / f'{self.series_name}_episodes.json' self.__validate_json_file(episodes_file, 'episodes_json_valid') characters_file = self.base_output_dir / 
f'{self.series_name}_characters.json' self.__validate_json_file(characters_file, 'characters_json_valid') - def __validate_processing_metadata(self): + def __validate_processing_metadata(self) -> None: metadata_dir = self.base_output_dir / 'processing_metadata' if not metadata_dir.exists(): self.result.warnings.append('Missing processing_metadata/ directory') diff --git a/preprocessor/services/validation/report_generator.py b/preprocessor/services/validation/report_generator.py index e0b58bae6..20294c96a 100644 --- a/preprocessor/services/validation/report_generator.py +++ b/preprocessor/services/validation/report_generator.py @@ -4,6 +4,7 @@ from typing import ( Any, Dict, + Optional, ) from preprocessor.services.validation.episode_stats import EpisodeStats @@ -12,7 +13,7 @@ class ReportGenerator: - def __init__(self, season: str, anomaly_threshold: float): + def __init__(self, season: str, anomaly_threshold: float) -> None: self.season = season self.anomaly_threshold = anomaly_threshold self.timestamp = datetime.now().isoformat() @@ -22,7 +23,7 @@ def generate_report( episodes_stats: Dict[str, EpisodeStats], season_comparison: SeasonComparison, output_path: Path, - ): + ) -> Optional[Dict[str, Any]]: report = { 'validation_timestamp': self.timestamp, 'season': self.season, @@ -37,7 +38,7 @@ def generate_report( return report @staticmethod - def __save_report(report: Dict[str, Any], output_path: Path): + def __save_report(report: Dict[str, Any], output_path: Path) -> None: output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'w', encoding='utf-8') as f: json.dump(report, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/services/validation/season_comparator.py b/preprocessor/services/validation/season_comparator.py index 98a60c511..4c32554f6 100644 --- a/preprocessor/services/validation/season_comparator.py +++ b/preprocessor/services/validation/season_comparator.py @@ -36,7 +36,7 @@ class SeasonComparison: anomalies: List[Anomaly] 
= field(default_factory=list) metrics: Dict[str, MetricComparison] = field(default_factory=dict) - def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]): + def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]) -> None: metric_keys = [ 'transcription_duration', 'transcription_chars', @@ -74,7 +74,7 @@ def to_dict(self) -> Dict[str, Any]: ], } - def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeStats]): + def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeStats]) -> None: values = [] episode_values = {} for episode_id, stats in episodes_stats.items(): diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index f270cc90f..a535382c7 100644 --- a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -72,7 +72,7 @@ def __collect_episodes_stats(self, transcriptions_season_path: Path) -> Dict[str episodes_stats[episode_id] = stats return episodes_stats - def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]): + def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]) -> None: for stats in episodes_stats.values(): episode_report = { 'validation_timestamp': datetime.now().isoformat(), @@ -88,7 +88,7 @@ def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]): report_path = self.validation_reports_dir / report_filename FileOperations.atomic_write_json(report_path, episode_report) - def __print_summary(self, episodes_stats: Dict[str, EpisodeStats], season_comparison: SeasonComparison): + def __print_summary(self, episodes_stats: Dict[str, EpisodeStats], season_comparison: SeasonComparison) -> None: console.print(f'\n[bold]Validation Summary for {self.season}[/bold]') console.print(f'Total episodes: {len(episodes_stats)}') pass_count = sum((1 for stats in episodes_stats.values() if stats.status == 'PASS')) diff --git 
a/preprocessor/services/validation/validators/__init__.py b/preprocessor/services/validation/validators/__init__.py new file mode 100644 index 000000000..98c636063 --- /dev/null +++ b/preprocessor/services/validation/validators/__init__.py @@ -0,0 +1,23 @@ +from preprocessor.services.validation.validators.base_validator import BaseValidator +from preprocessor.services.validation.validators.character_validator import CharacterValidator +from preprocessor.services.validation.validators.elastic_validator import ElasticValidator +from preprocessor.services.validation.validators.face_cluster_validator import FaceClusterValidator +from preprocessor.services.validation.validators.frame_validator import FrameValidator +from preprocessor.services.validation.validators.image_hash_validator import ImageHashValidator +from preprocessor.services.validation.validators.object_validator import ObjectValidator +from preprocessor.services.validation.validators.scene_validator import SceneValidator +from preprocessor.services.validation.validators.transcription_validator import TranscriptionValidator +from preprocessor.services.validation.validators.video_validator import VideoValidator + +__all__ = [ + 'BaseValidator', + 'CharacterValidator', + 'ElasticValidator', + 'FaceClusterValidator', + 'FrameValidator', + 'ImageHashValidator', + 'ObjectValidator', + 'SceneValidator', + 'TranscriptionValidator', + 'VideoValidator', +] diff --git a/preprocessor/services/validation/validators/base_validator.py b/preprocessor/services/validation/validators/base_validator.py new file mode 100644 index 000000000..47a22466c --- /dev/null +++ b/preprocessor/services/validation/validators/base_validator.py @@ -0,0 +1,33 @@ +from abc import ( + ABC, + abstractmethod, +) +from pathlib import Path +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class BaseValidator(ABC): + + @abstractmethod + def validate(self, stats: 
'EpisodeStats') -> None: + pass + + @staticmethod + def _check_path_exists( + path: Path, stats: 'EpisodeStats', error_msg: str, + ) -> bool: + if not path.exists(): + stats.errors.append(error_msg) + return False + return True + + @staticmethod + def _add_warning(stats: 'EpisodeStats', message: str) -> None: + stats.warnings.append(message) + + @staticmethod + def _add_error(stats: 'EpisodeStats', message: str) -> None: + stats.errors.append(message) diff --git a/preprocessor/services/validation/validators/character_validator.py b/preprocessor/services/validation/validators/character_validator.py new file mode 100644 index 000000000..8b2a3a610 --- /dev/null +++ b/preprocessor/services/validation/validators/character_validator.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from preprocessor.config.config import settings +from preprocessor.services.validation.validators.base_validator import BaseValidator +from preprocessor.services.validation.validators.validation_helpers import VisualizationValidationHelper + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class CharacterValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + VisualizationValidationHelper.validate_visualizations( + stats, + settings.output_subdirs.character_visualizations, + 'character_visualizations_count', + 'character visualization', + ) diff --git a/preprocessor/services/validation/validators/elastic_validator.py b/preprocessor/services/validation/validators/elastic_validator.py new file mode 100644 index 000000000..2ad9c2eb9 --- /dev/null +++ b/preprocessor/services/validation/validators/elastic_validator.py @@ -0,0 +1,133 @@ +import json +from pathlib import Path +from typing import TYPE_CHECKING + +from preprocessor.config.config import settings +from preprocessor.config.constants import OUTPUT_FILE_NAMES +from preprocessor.services.io.path_manager import PathManager +from 
preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators.base_validator import BaseValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + +ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs + + +class ElasticValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + self.__validate_character_detections(stats) + self.__validate_embeddings(stats) + self.__validate_elastic_documents(stats) + self.__validate_text_statistics(stats) + + def __validate_character_detections(self, stats: 'EpisodeStats') -> None: + char_detections_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.character_detections, + ) + detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] + + if detections_file.exists(): + result = FileValidator.validate_json_file(detections_file) + if not result.is_valid: + self._add_error(stats, f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") + + def __validate_embeddings(self, stats: 'EpisodeStats') -> None: + embeddings_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.embeddings, + ) + + if embeddings_dir.exists(): + embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] + if embeddings_file.exists(): + result = FileValidator.validate_json_file(embeddings_file) + if not result.is_valid: + self._add_error(stats, f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}: {result.error_message}") + + def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: + elastic_subdirs = [ + ELASTIC_SUBDIRS.text_segments, + ELASTIC_SUBDIRS.text_embeddings, + ELASTIC_SUBDIRS.video_frames, + ELASTIC_SUBDIRS.episode_names, + ELASTIC_SUBDIRS.text_statistics, + ELASTIC_SUBDIRS.full_episode_embeddings, + ELASTIC_SUBDIRS.sound_events, + ELASTIC_SUBDIRS.sound_event_embeddings, 
+ ] + + found_elastic_docs = False + for subdir in elastic_subdirs: + elastic_base = settings.output_subdirs.elastic_documents + elastic_docs_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, f'{elastic_base}/{subdir}', + ) + + if elastic_docs_dir.exists(): + found_elastic_docs = True + for jsonl_file in elastic_docs_dir.glob('*.jsonl'): + result = FileValidator.validate_jsonl_file(jsonl_file) + if not result.is_valid: + self._add_error(stats, f'Invalid JSONL {jsonl_file.name}: {result.error_message}') + else: + self.__validate_embedding_dimensions(stats, jsonl_file, subdir) + + if not found_elastic_docs: + self._add_warning(stats, f'Missing {settings.output_subdirs.elastic_documents} directory') + + def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: + transcriptions_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.transcriptions, + ) + + if transcriptions_dir.exists(): + clean_subdir = settings.output_subdirs.transcription_subdirs.clean + clean_dir = transcriptions_dir / clean_subdir + filename = f'{stats.series_name}_{stats.episode_info.episode_code()}_text_stats.json' + text_stats_file = clean_dir / filename + + if text_stats_file.exists(): + result = FileValidator.validate_json_file(text_stats_file) + if not result.is_valid: + self._add_error(stats, f'Invalid text_stats JSON: {result.error_message}') + else: + self._add_warning(stats, f'Missing text statistics file: {text_stats_file.name}') + + def __validate_embedding_dimensions( + self, stats: 'EpisodeStats', jsonl_file: Path, subdir: str, + ) -> None: + embedding_fields = { + ELASTIC_SUBDIRS.text_embeddings: 'text_embedding', + ELASTIC_SUBDIRS.video_frames: 'video_embedding', + ELASTIC_SUBDIRS.episode_names: 'title_embedding', + ELASTIC_SUBDIRS.full_episode_embeddings: 'full_episode_embedding', + ELASTIC_SUBDIRS.sound_event_embeddings: 'sound_event_embedding', + } + + if subdir not in embedding_fields: + return + + 
embedding_field = embedding_fields[subdir] + expected_dim = settings.embedding_model.embedding_dim + + try: + with open(jsonl_file, 'r', encoding='utf-8') as f: + for line_num, line in enumerate(f, 1): + if not line.strip(): + continue + doc = json.loads(line) + if embedding_field in doc: + embedding = doc[embedding_field] + if isinstance(embedding, list): + actual_dim = len(embedding) + if actual_dim != expected_dim: + error_msg = ( + f'{jsonl_file.name} line {line_num}: ' + f'{embedding_field} has {actual_dim} dimensions, ' + f'expected {expected_dim}' + ) + self._add_error(stats, error_msg) + return + except Exception as e: + self._add_error(stats, f'Error validating embeddings in {jsonl_file.name}: {e}') diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py new file mode 100644 index 000000000..fd9cb580d --- /dev/null +++ b/preprocessor/services/validation/validators/face_cluster_validator.py @@ -0,0 +1,69 @@ +import json +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, +) + +from preprocessor.config.config import settings +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators.base_validator import BaseValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class FaceClusterValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + clusters_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.face_clusters, + ) + + if not clusters_dir.exists(): + return + + metadata_files = list(clusters_dir.glob('*_face_clusters.json')) + metadata_file = metadata_files[0] if metadata_files else None + + if not metadata_file or not metadata_file.exists(): + self._add_warning(stats, 
'Missing face clustering metadata file') + return + + result = FileValidator.validate_json_file(metadata_file) + if not result.is_valid: + self._add_error(stats, f'Invalid face clustering metadata: {result.error_message}') + return + + data = self.__load_json_safely(metadata_file) + if not data: + self._add_error(stats, f'Error reading face clustering metadata: {metadata_file}') + return + + clusters = data.get('clusters', {}) + if isinstance(clusters, dict): + stats.face_clusters_count = len(clusters) + total_faces = sum((cluster_info.get('face_count', 0) for cluster_info in clusters.values())) + elif isinstance(clusters, list): + stats.face_clusters_count = len(clusters) + total_faces = sum((cluster_info.get('face_count', 0) for cluster_info in clusters)) + else: + self._add_warning(stats, 'Unexpected clusters format in face clustering metadata') + return + + noise_info = data.get('noise', {}) + if noise_info: + total_faces += noise_info.get('face_count', 0) + + stats.face_clusters_total_faces = total_faces + + @staticmethod + def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None diff --git a/preprocessor/services/validation/validators/frame_validator.py b/preprocessor/services/validation/validators/frame_validator.py new file mode 100644 index 000000000..915439b8b --- /dev/null +++ b/preprocessor/services/validation/validators/frame_validator.py @@ -0,0 +1,51 @@ +from typing import TYPE_CHECKING + +from preprocessor.config.config import settings +from preprocessor.config.constants import OUTPUT_FILE_PATTERNS +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators.base_validator import BaseValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class 
FrameValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + frames_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.frames, + ) + + if not frames_dir.exists(): + self._add_warning( + stats, f'Missing {settings.output_subdirs.frames} directory: {frames_dir}', + ) + return + + frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS['frame'])) + if not frame_files: + self._add_warning(stats, f'No frames found in {settings.output_subdirs.frames}/') + return + + stats.exported_frames_count = len(frame_files) + total_size = 0 + resolutions = [] + invalid_count = 0 + + for frame_file in frame_files: + result = FileValidator.validate_image_file(frame_file) + if result.is_valid: + total_size += result.metadata['size_mb'] + resolutions.append((result.metadata['width'], result.metadata['height'])) + else: + invalid_count += 1 + self._add_error(stats, f'Invalid frame {frame_file.name}: {result.error_message}') + + if invalid_count > 0: + self._add_warning(stats, f'{invalid_count} invalid frames found') + + stats.exported_frames_total_size_mb = round(total_size, 2) + if resolutions: + most_common_res = max(set(resolutions), key=resolutions.count) + stats.exported_frames_avg_resolution = most_common_res diff --git a/preprocessor/services/validation/validators/image_hash_validator.py b/preprocessor/services/validation/validators/image_hash_validator.py new file mode 100644 index 000000000..e3bff0456 --- /dev/null +++ b/preprocessor/services/validation/validators/image_hash_validator.py @@ -0,0 +1,19 @@ +from typing import TYPE_CHECKING + +from preprocessor.config.config import settings +from preprocessor.services.validation.validators.base_validator import BaseValidator +from preprocessor.services.validation.validators.validation_helpers import JsonDirectoryValidationHelper + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class 
ImageHashValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + JsonDirectoryValidationHelper.validate_json_directory( + stats, + settings.output_subdirs.image_hashes, + 'image_hashes_count', + 'image_hashes', + ) diff --git a/preprocessor/services/validation/validators/object_validator.py b/preprocessor/services/validation/validators/object_validator.py new file mode 100644 index 000000000..41ceea434 --- /dev/null +++ b/preprocessor/services/validation/validators/object_validator.py @@ -0,0 +1,37 @@ +from typing import TYPE_CHECKING + +from preprocessor.config.config import settings +from preprocessor.services.validation.validators.base_validator import BaseValidator +from preprocessor.services.validation.validators.validation_helpers import ( + JsonDirectoryValidationHelper, + VisualizationValidationHelper, +) + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class ObjectValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + self.__validate_object_detections(stats) + self.__validate_object_visualizations(stats) + + @staticmethod + def __validate_object_detections(stats: 'EpisodeStats') -> None: + JsonDirectoryValidationHelper.validate_json_directory( + stats, + settings.output_subdirs.object_detections, + 'object_detections_count', + 'object_detections', + exclude_pattern='visualizations', + ) + + @staticmethod + def __validate_object_visualizations(stats: 'EpisodeStats') -> None: + VisualizationValidationHelper.validate_visualizations( + stats, + settings.output_subdirs.object_visualizations, + 'object_visualizations_count', + 'visualization', + ) diff --git a/preprocessor/services/validation/validators/scene_validator.py b/preprocessor/services/validation/validators/scene_validator.py new file mode 100644 index 000000000..37c693fa1 --- /dev/null +++ b/preprocessor/services/validation/validators/scene_validator.py @@ -0,0 +1,54 @@ +import json +from pathlib 
import Path +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, +) + +from preprocessor.config.config import settings +from preprocessor.config.constants import OUTPUT_FILE_PATTERNS +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators.base_validator import BaseValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class SceneValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + scenes_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.scenes, + ) + scenes_file = scenes_dir / f"{stats.series_name}_{stats.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" + + if not scenes_file.exists(): + self._add_error(stats, f'Missing scenes file: {scenes_file}') + return + + result = FileValidator.validate_json_file(scenes_file) + if not result.is_valid: + self._add_error(stats, f'Invalid scenes JSON: {result.error_message}') + return + + data = self.__load_json_safely(scenes_file) + if not data: + self._add_error(stats, f'Error reading scenes: {scenes_file}') + return + + stats.scenes_count = data.get('total_scenes', 0) + scenes = data.get('scenes', []) + if scenes: + durations = [scene.get('duration', 0) for scene in scenes] + stats.scenes_avg_duration = round(sum(durations) / len(durations), 2) + + @staticmethod + def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None diff --git a/preprocessor/services/validation/validators/transcription_validator.py b/preprocessor/services/validation/validators/transcription_validator.py new file mode 100644 index 000000000..1364e6380 --- /dev/null +++ 
b/preprocessor/services/validation/validators/transcription_validator.py @@ -0,0 +1,124 @@ +import json +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, +) + +from preprocessor.config.config import settings +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators.base_validator import BaseValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class TranscriptionValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + transcriptions_dir = PathManager(stats.series_name).get_episode_dir( + stats.episode_info, settings.output_subdirs.transcriptions, + ) + base_name = f'{stats.series_name}_{stats.episode_info.episode_code()}' + raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw + clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean + sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events + + transcription_files = { + 'main': raw_dir / f'{base_name}.json', + 'segmented': raw_dir / f'{base_name}_segmented.json', + 'simple': raw_dir / f'{base_name}_simple.json', + 'clean': clean_dir / f'{base_name}_clean_transcription.json', + 'clean_txt': clean_dir / f'{base_name}_clean_transcription.txt', + 'sound_events': sound_events_dir / f'{base_name}_sound_events.json', + } + + if not any((f.exists() for f in transcription_files.values())): + self._add_error(stats, 'No transcription files found in any format') + return + + self.__validate_raw_transcription(stats, transcription_files) + self.__validate_clean_transcription(stats, transcription_files['clean']) + self.__validate_clean_txt(stats, transcription_files['clean_txt']) + self.__validate_sound_events(stats, transcription_files['sound_events']) + + def 
__validate_raw_transcription( + self, stats: 'EpisodeStats', transcription_files: Dict[str, Path], + ) -> None: + raw_transcription = None + for key in ('main', 'segmented', 'simple'): + if transcription_files[key].exists(): + raw_transcription = transcription_files[key] + break + + if not raw_transcription: + self._add_warning( + stats, + 'Missing raw transcription file (checked: .json, _segmented.json, _simple.json)', + ) + return + + result = FileValidator.validate_json_file(raw_transcription) + if not result.is_valid: + self._add_error(stats, f'Invalid transcription JSON: {result.error_message}') + return + + self.__extract_transcription_stats(stats, raw_transcription) + + def __extract_transcription_stats(self, stats: 'EpisodeStats', raw_transcription: Path) -> None: + data = self.__load_json_safely(raw_transcription) + if not data: + self._add_error(stats, f'Error reading transcription: {raw_transcription}') + return + + text = data.get('text', '') + if not text: + segments = data.get('segments', []) + if segments: + text = ' '.join((seg.get('text', '') for seg in segments)) + + stats.transcription_chars = len(text) + stats.transcription_words = len(text.split()) + + words = data.get('words', []) + if words: + stats.transcription_duration = words[-1].get('end', 0.0) + else: + segments = data.get('segments', []) + if segments and segments[-1].get('end'): + stats.transcription_duration = segments[-1].get('end', 0.0) + + @staticmethod + def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None + + def __validate_clean_transcription(self, stats: 'EpisodeStats', clean_transcription_file: Path) -> None: + if not clean_transcription_file.exists(): + self._add_warning( + stats, f'Missing clean transcription file: {clean_transcription_file.name}', + ) + return + + result = FileValidator.validate_json_file(clean_transcription_file) + if not 
result.is_valid: + self._add_warning(stats, f'Invalid clean transcription JSON: {result.error_message}') + + def __validate_clean_txt(self, stats: 'EpisodeStats', clean_txt_file: Path) -> None: + if not clean_txt_file.exists(): + self._add_warning(stats, f'Missing clean transcription txt: {clean_txt_file.name}') + + def __validate_sound_events(self, stats: 'EpisodeStats', sound_events_file: Path) -> None: + if not sound_events_file.exists(): + self._add_warning(stats, f'Missing sound events file: {sound_events_file.name}') + return + + result = FileValidator.validate_json_file(sound_events_file) + if not result.is_valid: + self._add_warning(stats, f'Invalid sound events JSON: {result.error_message}') diff --git a/preprocessor/services/validation/validators/validation_helpers.py b/preprocessor/services/validation/validators/validation_helpers.py new file mode 100644 index 000000000..c19bb6b29 --- /dev/null +++ b/preprocessor/services/validation/validators/validation_helpers.py @@ -0,0 +1,137 @@ +from pathlib import Path +from typing import ( + TYPE_CHECKING, + List, + Optional, + Tuple, +) + +from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.validation.file_validators import FileValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class JsonDirectoryValidationHelper: + + @staticmethod + def validate_json_directory( + stats: 'EpisodeStats', + subdir: str, + count_attr: Optional[str], + context_name: str, + exclude_pattern: Optional[str] = None, + check_anomalies: bool = True, + ) -> None: + dir_path = PathManager(stats.series_name).get_episode_dir(stats.episode_info, subdir) + count, sizes, errors = JsonDirectoryValidationHelper._validate_json_files_in_directory( + dir_path, exclude_pattern, + ) + + if not dir_path.exists(): + stats.warnings.append(f'Missing {subdir} directory') + return + + if count == 0: + stats.warnings.append(f'No JSON files in {subdir}/') + return + 
+ if count_attr: + setattr(stats, count_attr, count) + + stats.errors.extend(errors) + + if check_anomalies: + JsonDirectoryValidationHelper._check_size_anomalies(stats, sizes, context_name) + + @staticmethod + def _validate_json_files_in_directory( + directory: Path, exclude_pattern: Optional[str] = None, + ) -> Tuple[int, List[int], List[str]]: + if not directory.exists(): + return 0, [], [] + + json_files = [ + f for f in directory.glob('*.json') + if not exclude_pattern or exclude_pattern not in str(f) + ] + + if not json_files: + return 0, [], [] + + sizes = [] + errors = [] + for json_file in json_files: + result = FileValidator.validate_json_file(json_file) + if not result.is_valid: + errors.append(f'Invalid JSON {json_file.name}: {result.error_message}') + else: + sizes.append(json_file.stat().st_size) + + return len(json_files), sizes, errors + + @staticmethod + def _check_size_anomalies( + stats: 'EpisodeStats', sizes: List[int], folder_name: str, threshold: float = 0.2, + ) -> None: + if len(sizes) < 2: + return + + avg_size = sum(sizes) / len(sizes) + if avg_size == 0: + return + + for i, size in enumerate(sizes): + deviation = abs(size - avg_size) / avg_size + if deviation > threshold: + warning_msg = ( + f'{folder_name} file #{i + 1} size deviation: ' + f'{deviation * 100:.1f}% from average' + ) + stats.warnings.append(warning_msg) + + +class VisualizationValidationHelper: + + @staticmethod + def validate_visualizations( + stats: 'EpisodeStats', subdir: str, count_attr: str, context_name: str, + ) -> None: + viz_dir = PathManager(stats.series_name).get_episode_dir(stats.episode_info, subdir) + total_count, invalid_count, errors = VisualizationValidationHelper._validate_images_in_directory(viz_dir) + + if total_count == 0 and viz_dir.exists(): + stats.warnings.append(f'No visualization images in {subdir}/') + return + + if total_count > 0: + setattr(stats, count_attr, total_count) + stats.errors.extend(errors) + if invalid_count > 0: + 
stats.warnings.append(f'{invalid_count} invalid {context_name} images found') + + @staticmethod + def _validate_images_in_directory( + directory: Path, + extensions: Tuple[str, ...] = ('*.jpg', '*.png'), + ) -> Tuple[int, int, List[str]]: + if not directory.exists(): + return 0, 0, [] + + image_files = [] + for ext in extensions: + image_files.extend(directory.glob(ext)) + + if not image_files: + return 0, 0, [] + + invalid_count = 0 + errors = [] + for img_file in image_files: + result = FileValidator.validate_image_file(img_file) + if not result.is_valid: + invalid_count += 1 + errors.append(f'Invalid image {img_file.name}: {result.error_message}') + + return len(image_files), invalid_count, errors diff --git a/preprocessor/services/validation/validators/video_validator.py b/preprocessor/services/validation/validators/video_validator.py new file mode 100644 index 000000000..a7d95c386 --- /dev/null +++ b/preprocessor/services/validation/validators/video_validator.py @@ -0,0 +1,34 @@ +from typing import TYPE_CHECKING + +from preprocessor.config.config import ( + get_base_output_dir, + settings, +) +from preprocessor.config.constants import DEFAULT_VIDEO_EXTENSION +from preprocessor.services.validation.file_validators import FileValidator +from preprocessor.services.validation.validators.base_validator import BaseValidator + +if TYPE_CHECKING: + from preprocessor.services.validation.episode_stats import EpisodeStats + + +class VideoValidator(BaseValidator): + + def validate(self, stats: 'EpisodeStats') -> None: + filename = f'{stats.series_name.lower()}_{stats.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' + season_dir = get_base_output_dir(stats.series_name) / settings.output_subdirs.video / stats.episode_info.season_code() + video_file = season_dir / filename + + if not video_file.exists(): + self._add_warning(stats, f'Missing video file: {video_file}') + return + + result = FileValidator.validate_video_file(video_file) + if not result.is_valid: + 
self._add_error(stats, f'Invalid video: {result.error_message}') + return + + stats.video_size_mb = result.metadata['size_mb'] + stats.video_duration = result.metadata['duration'] + stats.video_codec = result.metadata['codec'] + stats.video_resolution = (result.metadata['width'], result.metadata['height']) diff --git a/preprocessor/services/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py index 1f3a8e5bb..d28accb76 100644 --- a/preprocessor/services/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -42,7 +42,7 @@ def __clip_bbox( return x1, y1, x2, y2 @staticmethod - def __crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: # pylint: disable=unused-private-member + def __crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: # pylint: disable=unused-private-member try: x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) height, width = frame.shape[:2] @@ -55,7 +55,7 @@ def __crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray] return None @staticmethod - def __detect_batch( # pylint: disable=unused-private-member + def __detect_batch( # pylint: disable=unused-private-member face_images: List[np.ndarray], model: HSEmotionRecognizer, batch_size: int = 32, @@ -86,7 +86,7 @@ def __detect_batch( # pylint: disable=unused-private-member return results @staticmethod - def __init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: # pylint: disable=unused-private-member + def __init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: # pylint: disable=unused-private-member model_name = settings.emotion_detection.model_name if logger: logger.info(f'Loading HSEmotion model: {model_name}...') diff --git a/preprocessor/services/video/image_hasher.py b/preprocessor/services/video/image_hasher.py index 5255b1d39..b4dfcbb4d 100644 --- a/preprocessor/services/video/image_hasher.py +++ 
b/preprocessor/services/video/image_hasher.py @@ -23,7 +23,7 @@ def cleanup(self) -> None: if torch.cuda.is_available(): torch.cuda.empty_cache() - def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable=unused-private-member + def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable=unused-private-member if self.model is None: raise RuntimeError('Model not initialized or already cleaned up') with torch.no_grad(): diff --git a/preprocessor/services/video/strategies/scene_changes_strategy.py b/preprocessor/services/video/strategies/scene_changes_strategy.py index e2be552e1..db7abb2c9 100644 --- a/preprocessor/services/video/strategies/scene_changes_strategy.py +++ b/preprocessor/services/video/strategies/scene_changes_strategy.py @@ -12,7 +12,7 @@ class SceneChangesStrategy(BaseKeyframeStrategy): - def __init__(self, frames_per_scene: int): + def __init__(self, frames_per_scene: int) -> None: self.frames_per_scene = frames_per_scene def extract_frame_requests(self, video_path: Path, data: Dict[str, Any]) -> List[Dict[str, Any]]: diff --git a/preprocessor/steps/analysis/__init__.py b/preprocessor/steps/analysis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py new file mode 100644 index 000000000..fa9ad15b9 --- /dev/null +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -0,0 +1,162 @@ +from pathlib import Path +from typing import List + +from preprocessor.config.step_configs import TranscodeConfig +from preprocessor.core.artifacts import ResolutionAnalysisResult +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.services.io.path_service import PathService +from preprocessor.services.media.ffmpeg import FFmpegWrapper + + +class ResolutionAnalysisStep(PipelineStep[None, ResolutionAnalysisResult, 
TranscodeConfig]): + + def execute( + self, input_data: None, context: ExecutionContext, + ) -> ResolutionAnalysisResult: + context.logger.info('=' * 80) + context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') + context.logger.info('=' * 80) + + video_paths = self._find_video_files(context) + if not video_paths: + context.logger.warning('No video files found - skipping resolution analysis') + context.mark_step_completed(self.name, 'all') + return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) + + resolutions = self._scan_resolutions(video_paths, context) + if not resolutions: + context.logger.warning('Failed to analyze resolutions - skipping') + context.mark_step_completed(self.name, 'all') + return ResolutionAnalysisResult(total_files=len(video_paths), upscaling_percentage=0.0) + + upscaling_pct = self._analyze_and_report(resolutions, context) + + context.mark_step_completed(self.name, 'all') + return ResolutionAnalysisResult(total_files=len(resolutions), upscaling_percentage=upscaling_pct) + + @property + def name(self) -> str: + return 'resolution_analysis' + + @staticmethod + def _find_video_files(context: ExecutionContext) -> List[Path]: + input_base = PathService.get_input_base() + series_path = input_base / context.series_name + + if not series_path.exists(): + return [] + + video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.m4v'} + video_files = [ + p for p in series_path.rglob('*') + if p.is_file() and p.suffix.lower() in video_extensions + ] + + return sorted(video_files) + + @staticmethod + def _scan_resolutions( + video_paths: List[Path], context: ExecutionContext, + ) -> List[tuple[int, int, str]]: + resolutions = [] + + for video_path in video_paths: + try: + probe_data = FFmpegWrapper.probe_video(video_path) + width, height = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + + effective_width = int(width * sar_num / sar_denom) + 
resolutions.append((effective_width, height, video_path.name)) + + except Exception as e: # pylint: disable=broad-except + context.logger.warning(f'Failed to probe {video_path.name}: {e}') + continue + + return resolutions + + def _analyze_and_report( + self, resolutions: List[tuple[int, int, str]], context: ExecutionContext, + ) -> float: + from collections import Counter # pylint: disable=import-outside-toplevel + + resolution_counts = Counter((w, h) for w, h, _ in resolutions) + total_episodes = len(resolutions) + + target_width = self.config.resolution.width + target_height = self.config.resolution.height + target_pixels = target_width * target_height + + upscaling_count = sum( + 1 for w, h, _ in resolutions + if (w * h) < target_pixels + ) + upscaling_pct = (upscaling_count / total_episodes) * 100 if total_episodes > 0 else 0 + + context.logger.info('') + context.logger.info('Source Resolution Distribution:') + context.logger.info('-' * 60) + + for (width, height), count in resolution_counts.most_common(): + pct = (count / total_episodes) * 100 + label = self._get_resolution_label(width, height) + context.logger.info( + f' {width}x{height} ({label}): {count} episodes ({pct:.1f}%)', + ) + + context.logger.info('') + context.logger.info( + f'Target Resolution: {target_width}x{target_height} ' + f'({self._get_resolution_label(target_width, target_height)})', + ) + + if upscaling_pct > 50: + context.logger.warning('') + context.logger.warning('⚠' * 30) + context.logger.warning( + f'⚠ WARNING: {upscaling_pct:.1f}% of episodes will require UPSCALING!', + ) + context.logger.warning( + '⚠ Upscaling degrades quality. 
Consider using analyze-resolution CLI ' + 'to find optimal target resolution.', + ) + context.logger.warning('⚠' * 30) + elif upscaling_pct > 0: + context.logger.info( + f'Note: {upscaling_pct:.1f}% of episodes will be upscaled ' + '(enhanced quality params will be used)', + ) + + context.logger.info('=' * 80) + + return upscaling_pct + + @staticmethod + def _get_resolution_label(width: int, height: int) -> str: + resolution_labels = { + (7680, 4320): '8K', + (3840, 2160): '4K', + (2560, 1440): '1440p', + (1920, 1080): '1080p', + (1280, 720): '720p', + (854, 480): '480p', + (640, 360): '360p', + (426, 240): '240p', + (256, 144): '144p', + } + + if (width, height) in resolution_labels: + return resolution_labels[(width, height)] + + if height >= 2000: + return '4K+' + if height >= 1400: + return '2K' + if height >= 1000: + return 'Full HD' + if height >= 700: + return 'HD' + if height >= 450: + return 'SD' + return 'Low' diff --git a/preprocessor/steps/audio/separation_step.py b/preprocessor/steps/audio/separation_step.py index 6bcc3b730..de41a3383 100644 --- a/preprocessor/steps/audio/separation_step.py +++ b/preprocessor/steps/audio/separation_step.py @@ -36,7 +36,8 @@ def execute( ) -> TranscriptionData: output_paths = self._prepare_output_paths(input_data) - if self._should_skip_processing(output_paths, context, input_data): + clean_json = output_paths['clean_json'] + if self._check_cache_validity(clean_json, context, input_data.episode_id, 'cached'): return self._create_cached_result(output_paths, input_data) context.mark_step_started(self.name, input_data.episode_id) @@ -63,7 +64,8 @@ def execute( def name(self) -> str: return 'sound_separation' - def _prepare_output_paths(self, input_data: TranscriptionData) -> Dict[str, Path]: + @staticmethod + def _prepare_output_paths(input_data: TranscriptionData) -> Dict[str, Path]: base_name = input_data.path.stem.replace(FILE_SUFFIXES['segmented'], '') episode_dir = input_data.path.parent.parent clean_dir = episode_dir 
/ 'clean' @@ -82,19 +84,6 @@ def _prepare_output_paths(self, input_data: TranscriptionData) -> Dict[str, Path 'sound_srt': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}", } - def _should_skip_processing( - self, - output_paths: Dict[str, Path], - context: ExecutionContext, - input_data: TranscriptionData, - ) -> bool: - clean_json = output_paths['clean_json'] - sound_json = output_paths['sound_json'] - if clean_json.exists() and sound_json.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - return True - return False @staticmethod def _load_transcription_data(input_data: TranscriptionData) -> Dict[str, Any]: @@ -262,7 +251,7 @@ def __generate_txt_file(json_path: Path, txt_path: Path) -> None: f.write(' '.join(text_lines)) @staticmethod - def __is_sound_event_text(text: str) -> bool: # pylint: disable=unused-private-member + def __is_sound_event_text(text: str) -> bool: # pylint: disable=unused-private-member return bool(re.match(r'^\(.*\)$', text.strip())) @staticmethod diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index 126904dc1..7a50438fb 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -14,7 +14,7 @@ class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, Arch def execute(self, input_data: ProcessedEpisode, context: ExecutionContext) -> ArchiveArtifact: output_path = self._get_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached archive'): return self._create_archive_artifact(input_data, output_path) context.logger.info(f'Generating archive for {input_data.episode_id}') @@ -32,17 +32,6 @@ def _get_output_path(input_data: 
ProcessedEpisode, context: ExecutionContext) -> output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' return context.get_output_path(input_data.episode_info, 'archives', output_filename) - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: ProcessedEpisode, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached archive)') - return True - return False @staticmethod def _create_archive_artifact(input_data: ProcessedEpisode, output_path: Path) -> ArchiveArtifact: diff --git a/preprocessor/steps/scraping/character_scraper_step.py b/preprocessor/steps/scraping/character_scraper_step.py index c014c8d7c..c0275bda3 100644 --- a/preprocessor/steps/scraping/character_scraper_step.py +++ b/preprocessor/steps/scraping/character_scraper_step.py @@ -1,3 +1,5 @@ +from typing import Type + from preprocessor.config.step_configs import CharacterScraperConfig from preprocessor.services.scraping.base_scraper_step import BaseScraperStep from preprocessor.services.scraping.character_scraper import CharacterScraper @@ -5,7 +7,7 @@ class CharacterScraperStep(BaseScraperStep[CharacterScraperConfig]): - def _get_scraper_class(self): + def _get_scraper_class(self) -> Type[CharacterScraper]: return CharacterScraper def _get_metadata_type_name(self) -> str: diff --git a/preprocessor/steps/scraping/episode_scraper_step.py b/preprocessor/steps/scraping/episode_scraper_step.py index f7392538a..4de7f883a 100644 --- a/preprocessor/steps/scraping/episode_scraper_step.py +++ b/preprocessor/steps/scraping/episode_scraper_step.py @@ -2,6 +2,7 @@ from typing import ( Any, Dict, + Type, ) from preprocessor.config.step_configs import EpisodeScraperConfig @@ -12,7 +13,7 @@ class EpisodeScraperStep(BaseScraperStep[EpisodeScraperConfig]): - def _get_scraper_class(self): 
+ def _get_scraper_class(self) -> Type[EpisodeScraper]: return EpisodeScraper def _get_metadata_type_name(self) -> str: diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index 929f370ad..3f08b2346 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -24,7 +24,8 @@ def execute( characters_path, output_dir = self._get_paths() self._validate_characters_file(characters_path) - if self._should_skip_processing(output_dir, context): + if output_dir.exists() and any(output_dir.iterdir()) and not context.force_rerun: + context.logger.info(f"Character references already exist in: {output_dir}") self._executed = True return input_data @@ -50,12 +51,6 @@ def _validate_characters_file(characters_path: Path) -> None: f"Run scrape_characters first.", ) - @staticmethod - def _should_skip_processing(output_dir: Path, context: ExecutionContext) -> bool: - if output_dir.exists() and any(output_dir.iterdir()) and not context.force_rerun: - context.logger.info(f"Character references already exist in: {output_dir}") - return True - return False def _process_character_references( self, diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 119fc2062..d8d6f28ee 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -21,7 +21,7 @@ def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDoc episode_info, episode_id = self._extract_episode_info(input_data) output_dir = context.get_output_path(episode_info, 'elastic_documents', '') - if self._should_skip_processing(output_dir, context, episode_id): + if self._check_cache_validity(output_dir, context, episode_id, 'cached'): return self._create_empty_result(episode_id, episode_info, output_dir) 
context.logger.info(f'Generating Elasticsearch documents for {episode_id}') @@ -50,16 +50,6 @@ def _extract_episode_info(input_data: Artifact) -> tuple[Any, str]: episode_id = getattr(input_data, 'episode_id') return episode_info, episode_id - def _should_skip_processing( - self, - output_dir: Path, - context: ExecutionContext, - episode_id: str, - ) -> bool: - if output_dir.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, episode_id): - return True - return False @staticmethod def _create_empty_result(episode_id: str, episode_info: Any, output_dir: Path) -> ElasticDocuments: diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py index 8a7c42f01..dbb8cfb63 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -24,7 +24,7 @@ class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, Text def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> TextAnalysisResults: output_path = self._get_output_path(input_data) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): return self._load_cached_result(output_path, input_data) context.logger.info(f'Analyzing text for {input_data.episode_id}') @@ -53,17 +53,6 @@ def _get_output_path(input_data: TranscriptionData) -> Path: output_filename = input_data.path.stem + '_text_stats.json' return input_data.path.parent / output_filename - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: TranscriptionData, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - return True - return False @staticmethod def _load_cached_result(output_path: Path, input_data: TranscriptionData) 
-> TextAnalysisResults: diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 908846d63..1ffcdb1fe 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -31,7 +31,7 @@ def cleanup(self) -> None: def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData: output_path = self._get_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): return self._create_cached_result(output_path, input_data) self._ensure_whisper_loaded() @@ -60,17 +60,6 @@ def _get_output_path(input_data: AudioArtifact, context: ExecutionContext) -> Pa f'raw/{output_filename}', ) - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: AudioArtifact, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached transcription)') - return True - return False def _create_cached_result(self, output_path: Path, input_data: AudioArtifact) -> TranscriptionData: return TranscriptionData( diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 4a266df9b..304a74296 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -7,6 +7,7 @@ Any, Dict, List, + Tuple, ) from PIL import Image @@ -26,7 +27,7 @@ class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExportConfig]): - def __init__(self, config: FrameExportConfig): + def __init__(self, config: FrameExportConfig) -> None: super().__init__(config) decord.bridge.set_bridge('native') self.strategy = 
KeyframeStrategyFactory.create(self.config.keyframe_strategy, self.config.frames_per_scene) @@ -34,7 +35,7 @@ def __init__(self, config: FrameExportConfig): def execute(self, input_data: SceneCollection, context: ExecutionContext) -> FrameCollection: episode_dir, metadata_file = self._prepare_output_paths(input_data, context) - if self._should_skip_processing(metadata_file, context, input_data): + if self._check_cache_validity(metadata_file, context, input_data.episode_id, 'cached'): return self._load_cached_result(metadata_file, episode_dir, input_data) self._prepare_episode_directory(episode_dir, context) @@ -72,24 +73,12 @@ def name(self) -> str: def _prepare_output_paths( input_data: SceneCollection, context: ExecutionContext, - ) -> tuple[Path, Path]: + ) -> Tuple[Path, Path]: episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' metadata_file = episode_dir / metadata_filename return episode_dir, metadata_file - def _should_skip_processing( - self, - metadata_file: Path, - context: ExecutionContext, - input_data: SceneCollection, - ) -> bool: - if metadata_file.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - return True - return False - @staticmethod def _load_cached_result( metadata_file: Path, diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index 868aa7c4d..d88ec825f 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -21,7 +21,7 @@ class SceneDetectorStep(PipelineStep[TranscodedVideo, SceneCollection, SceneDetectionConfig]): - def __init__(self, config: SceneDetectionConfig): + def __init__(self, config: SceneDetectionConfig) -> None: super().__init__(config) 
self.transnet = TransNetWrapper() self._model_loaded = False @@ -34,7 +34,7 @@ def cleanup(self) -> None: def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> SceneCollection: output_path = self._get_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): return self._load_cached_result(output_path, input_data) self._ensure_model_loaded(context) @@ -56,18 +56,6 @@ def _get_output_path(input_data: TranscodedVideo, context: ExecutionContext) -> output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' return context.get_output_path(input_data.episode_info, 'scene_timestamps', output_filename) - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: TranscodedVideo, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - return True - return False - def _load_cached_result(self, output_path: Path, input_data: TranscodedVideo) -> SceneCollection: scenes_data = load_json(output_path) return SceneCollection( diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index fda8964cf..cd198e131 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -21,14 +21,17 @@ def execute( ) -> TranscodedVideo: output_path = self._get_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'output exists'): return self._create_result_artifact(output_path, input_data) probe_data = FFmpegWrapper.probe_video(input_data.path) target_fps = self._calculate_target_fps(probe_data, 
context) - video_bitrate, minrate, maxrate, bufsize = self._adjust_video_bitrate(probe_data, context) + is_upscaling, source_pixels, target_pixels = self._detect_upscaling(probe_data) + video_bitrate, minrate, maxrate, bufsize = self._adjust_video_bitrate( + probe_data, context, is_upscaling, source_pixels, target_pixels, + ) audio_bitrate = self._adjust_audio_bitrate(probe_data, context) - deinterlace = self._determine_deinterlace(input_data, context) + deinterlace = self._determine_deinterlace(input_data, context, probe_data) context.logger.info(f'Transcoding {input_data.episode_id}') self._perform_transcode( @@ -41,6 +44,7 @@ def execute( audio_bitrate, target_fps, deinterlace, + is_upscaling, context, input_data, ) @@ -57,18 +61,6 @@ def _get_output_path(input_data: SourceVideo, context: ExecutionContext) -> Path output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' return context.get_season_output_path(input_data.episode_info, 'transcoded_videos', output_filename) - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: SourceVideo, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - context.logger.info(f'Skipping {input_data.episode_id} (output exists)') - if not context.is_step_completed(self.name, input_data.episode_id): - context.mark_step_completed(self.name, input_data.episode_id) - return True - return False def _create_result_artifact(self, output_path: Path, input_data: SourceVideo) -> TranscodedVideo: resolution_str = f'{self.config.resolution.width}x{self.config.resolution.height}' @@ -94,11 +86,29 @@ def _calculate_target_fps( ) return target_fps + def _detect_upscaling(self, probe_data: Dict[str, Any]) -> tuple[bool, int, int]: + source_width, source_height = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + effective_width = int(source_width * sar_num / sar_denom) + + source_pixels = 
effective_width * source_height + target_pixels = self.config.resolution.width * self.config.resolution.height + + return source_pixels < target_pixels, source_pixels, target_pixels + def _adjust_video_bitrate( self, probe_data: Dict[str, Any], context: ExecutionContext, + is_upscaling: bool, + source_pixels: int, + target_pixels: int, ) -> tuple[float, float, float, float]: + if is_upscaling: + return self._calculate_upscale_bitrate( + probe_data, source_pixels, target_pixels, context, + ) + input_video_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) video_bitrate = self.config.video_bitrate_mbps minrate = self.config.minrate_mbps @@ -120,6 +130,57 @@ def _adjust_video_bitrate( return video_bitrate, minrate, maxrate, bufsize + def _calculate_upscale_bitrate( + self, + probe_data: Dict[str, Any], + source_pixels: int, + target_pixels: int, + context: ExecutionContext, + ) -> tuple[float, float, float, float]: + __MIN_BITRATE_FOR_RESOLUTION: Dict[tuple[int, int], float] = { + (7680, 4320): 35.0, + (3840, 2160): 15.0, + (2560, 1440): 8.0, + (1920, 1080): 3.5, + (1280, 720): 2.0, + (854, 480): 1.2, + (640, 360): 0.8, + (426, 240): 0.5, + (256, 144): 0.3, + } + + target_res = (self.config.resolution.width, self.config.resolution.height) + min_required = __MIN_BITRATE_FOR_RESOLUTION.get(target_res, 2.0) + + source_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) + pixel_ratio = target_pixels / source_pixels + + if source_bitrate: + calculated = source_bitrate * pixel_ratio * 1.2 + upscaled_bitrate = max(calculated, min_required) + else: + upscaled_bitrate = min_required * 1.2 + + max_allowed = self.config.video_bitrate_mbps * 1.3 + upscaled_bitrate = min(upscaled_bitrate, max_allowed) + + ratio = upscaled_bitrate / self.config.video_bitrate_mbps + + context.logger.warning( + f'⚠ UPSCALING: {source_pixels:,} px → {target_pixels:,} px ' + f'(+{((target_pixels/source_pixels)-1)*100:.1f}%). 
' + f'Bitrate: {source_bitrate or "N/A"} → {upscaled_bitrate:.2f} Mbps ' + f'(min for {target_res[0]}x{target_res[1]}: {min_required} Mbps). ' + f'Using Lanczos scaler + enhanced nvenc params.', + ) + + return ( + upscaled_bitrate, + round(self.config.minrate_mbps * ratio, 2), + round(self.config.maxrate_mbps * ratio, 2), + round(self.config.bufsize_mbps * ratio, 2), + ) + def _adjust_audio_bitrate( self, probe_data: Dict[str, Any], @@ -139,12 +200,23 @@ def _adjust_audio_bitrate( return audio_bitrate - def _determine_deinterlace(self, input_data: SourceVideo, context: ExecutionContext) -> bool: + def _determine_deinterlace( + self, input_data: SourceVideo, context: ExecutionContext, probe_data: Dict[str, Any], + ) -> bool: + field_order = FFmpegWrapper.get_field_order(probe_data) + if self.config.force_deinterlace: - context.logger.info( - f"Force deinterlacing enabled for {input_data.episode_id} - " - f"skipping interlace detection and applying bwdif filter unconditionally", - ) + if field_order == 'progressive': + context.logger.warning( + f"⚠ Force deinterlacing enabled for {input_data.episode_id} " + f"but video is progressive (field_order={field_order}). 
" + f"This may degrade quality unnecessarily.", + ) + else: + context.logger.info( + f"Force deinterlacing enabled for {input_data.episode_id} - " + f"skipping interlace detection and applying bwdif filter unconditionally", + ) return True context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") @@ -184,6 +256,7 @@ def _perform_transcode( # pylint: disable=too-many-arguments audio_bitrate: int, target_fps: float, deinterlace: bool, + is_upscaling: bool, context: ExecutionContext, input_data: SourceVideo, ) -> None: @@ -208,6 +281,7 @@ def _perform_transcode( # pylint: disable=too-many-arguments gop_size=int(target_fps * self.config.gop_size), target_fps=target_fps if target_fps < input_fps else None, deinterlace=deinterlace, + is_upscaling=is_upscaling, ) temp_path.replace(output_path) except BaseException: diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index 5c252d8a4..6269b1b62 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -38,7 +38,7 @@ def execute( ) -> DetectionResults: output_path = self._get_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached character detections'): return self._load_cached_result(output_path, input_data) self._ensure_model_loaded(context) @@ -73,17 +73,6 @@ def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> input_data.episode_info, 'character_detections', output_filename, ) - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: FrameCollection, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached character 
detections)') - return True - return False @staticmethod def _load_cached_result(output_path: Path, input_data: FrameCollection) -> DetectionResults: diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index 2b07adc98..b665d2d55 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -39,7 +39,7 @@ def execute( ) -> ImageHashCollection: output_path = self._get_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): return self._load_cached_result(output_path, input_data) frame_metadata, frame_requests = self._load_frame_metadata(input_data, context) @@ -75,17 +75,6 @@ def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> output_filename: str = f'{filename_base}_image_hashes.json' return context.get_output_path(input_data.episode_info, 'image_hashes', output_filename) - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: FrameCollection, - ) -> bool: - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached)') - return True - return False @staticmethod def _load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: From 5e8e75b3e73a109905e9940caea658323d2e11c8 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:21:15 +0100 Subject: [PATCH 22/89] Support global pipeline steps; drop _executed flags Refactor pipeline execution to support global steps by adding __run_global_step and __run_episode_step in PipelineExecutor and using a special 'all' episode_id for global completion tracking. 
Add an is_global property (default False) to PipelineStep and mark BaseScraperStep, ResolutionAnalysisStep, and CharacterReferenceStep as global. Remove per-instance _executed flags from scraper and reference steps so step execution and skipping rely on the pipeline's centralized skip/mark logic and context.force_rerun handling. Improve logging and error messages for global step execution. --- preprocessor/app/pipeline_builder.py | 73 ++++++++++++------- preprocessor/core/base_step.py | 4 + .../services/scraping/base_scraper_step.py | 13 +--- .../analysis/resolution_analysis_step.py | 4 + .../scraping/reference_processor_step.py | 13 +--- 5 files changed, 64 insertions(+), 43 deletions(-) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 6299a9f0a..ea11b3be0 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -98,35 +98,58 @@ def __run_for_episodes( # pylint: disable=unused-private-member ) for step in self.steps: - self.context.logger.info(f"=== Running Step: {step.name} ===") - next_artifacts = [] + if step.is_global: + self.__run_global_step(step) + else: + current_artifacts = self.__run_episode_step(step, current_artifacts) - for artifact in current_artifacts: - episode_id = artifact.episode_id + def __run_global_step(self, step: PipelineStep) -> None: + self.context.logger.info(f"=== Running Global Step: {step.name} ===") - if self.__should_skip_step(step.name, episode_id): - self.context.logger.info( - f"Skipping {step.name} for {episode_id} (already completed)", - ) - next_artifacts.append(artifact) - continue + if self.__should_skip_step(step.name, 'all'): + self.context.logger.info(f"Skipping {step.name} (already completed)") + return - try: - self.__mark_step_in_progress(step.name, episode_id) - result = step.execute(artifact, self.context) - self.__mark_step_completed(step.name, episode_id) - - if result: - next_artifacts.append(result) - else: - 
next_artifacts.append(artifact) - except Exception as e: - self.context.logger.error( - f"Step {step.name} failed for {artifact.episode_id}: {e}", - ) - raise + try: + self.__mark_step_in_progress(step.name, 'all') + step.execute(None, self.context) + self.__mark_step_completed(step.name, 'all') + except Exception as e: + self.context.logger.error(f"Global step {step.name} failed: {e}") + raise + + def __run_episode_step( + self, step: PipelineStep, current_artifacts: List[Any], + ) -> List[Any]: + self.context.logger.info(f"=== Running Step: {step.name} ===") + next_artifacts = [] + + for artifact in current_artifacts: + episode_id = artifact.episode_id + + if self.__should_skip_step(step.name, episode_id): + self.context.logger.info( + f"Skipping {step.name} for {episode_id} (already completed)", + ) + next_artifacts.append(artifact) + continue + + try: + self.__mark_step_in_progress(step.name, episode_id) + result = step.execute(artifact, self.context) + self.__mark_step_completed(step.name, episode_id) + + if result: + next_artifacts.append(result) + else: + next_artifacts.append(artifact) + except Exception as e: + self.context.logger.error( + f"Step {step.name} failed for {artifact.episode_id}: {e}", + ) + raise - current_artifacts = next_artifacts + return next_artifacts def __should_skip_step(self, step_name: str, episode_id: str) -> bool: if self.context.force_rerun: diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 9aece3655..33579b635 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -39,6 +39,10 @@ def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: def name(self) -> str: pass + @property + def is_global(self) -> bool: + return False + def _check_cache_validity( self, output_path: Path, diff --git a/preprocessor/services/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py index 0fa8a7b13..52fe473bd 100644 --- 
a/preprocessor/services/scraping/base_scraper_step.py +++ b/preprocessor/services/scraping/base_scraper_step.py @@ -22,21 +22,13 @@ class BaseScraperStep(PipelineStep[SourceVideo, SourceVideo, ConfigT], ABC): - def __init__(self, config: ConfigT) -> None: - super().__init__(config) - self._executed = False - def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> Optional[SourceVideo]: - if self._executed: - return input_data - output_path = Path(self.config.output_file) # type: ignore[attr-defined] if output_path.exists() and not context.force_rerun: context.logger.info(f"{self._get_metadata_type_name()} metadata already exists: {output_path}") - self._executed = True return input_data urls = self.config.urls # type: ignore[attr-defined] @@ -53,7 +45,6 @@ def execute( context.logger.info(f"{self._get_metadata_type_name()} metadata saved to: {output_path}") - self._executed = True return input_data @abstractmethod @@ -64,6 +55,10 @@ def _get_scraper_class(self) -> Type: def _get_metadata_type_name(self) -> str: pass + @property + def is_global(self) -> bool: + return True + def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> Dict[str, Any]: base_args: Dict[str, Any] = { "urls": self.config.urls, # type: ignore[attr-defined] diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index fa9ad15b9..eaf5f66a3 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -39,6 +39,10 @@ def execute( def name(self) -> str: return 'resolution_analysis' + @property + def is_global(self) -> bool: + return True + @staticmethod def _find_video_files(context: ExecutionContext) -> List[Path]: input_base = PathService.get_input_base() diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index 3f08b2346..e22692aa4 100644 
--- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -11,26 +11,17 @@ class CharacterReferenceStep( PipelineStep[SourceVideo, SourceVideo, CharacterReferenceConfig], ): - def __init__(self, config: CharacterReferenceConfig) -> None: - super().__init__(config) - self._executed = False - def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> Optional[SourceVideo]: - if self._executed: - return input_data - characters_path, output_dir = self._get_paths() self._validate_characters_file(characters_path) if output_dir.exists() and any(output_dir.iterdir()) and not context.force_rerun: context.logger.info(f"Character references already exist in: {output_dir}") - self._executed = True return input_data self._process_character_references(characters_path, output_dir, context) - self._executed = True return input_data @@ -38,6 +29,10 @@ def execute( def name(self) -> str: return "process_character_references" + @property + def is_global(self) -> bool: + return True + def _get_paths(self) -> tuple[Path, Path]: characters_path = Path(self.config.characters_file) output_dir = Path(self.config.output_dir) From deeb26b9775143755c1d17887a1e9498146ea2cd Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 12:37:38 +0100 Subject: [PATCH 23/89] Refactor resolution analysis and update step modules Update pipeline module paths to new *_step modules and refactor resolution analysis step. The ResolutionAnalysisStep now uses stricter private method names, improved imports/typing, and writes a detailed JSON report (including counts, labels, per-file info and upscaling metrics) to the output directory. Also replace CharacterReferenceProcessor with CharacterReferenceDownloader (parameter key renamed and series_name passed) and adjust logging/messages accordingly. 
--- preprocessor/app/pipeline_factory.py | 32 +++---- .../analysis/resolution_analysis_step.py | 91 ++++++++++++++++--- .../scraping/reference_processor_step.py | 13 +-- preprocessor/steps/video/transcoding_step.py | 20 +++- 4 files changed, 122 insertions(+), 34 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 09fead456..4d72dc99c 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -109,7 +109,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t transcoded_videos = StepBuilder( id="transcode", phase=PROCESSING, - module="preprocessor.steps.video.transcoding:VideoTranscoderStep", + module="preprocessor.steps.video.transcoding_step:VideoTranscoderStep", description=f"Conversion to {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} with adaptive bitrate", produces=["transcoded_videos/{season}/{episode}.mp4"], needs=[resolution_analysis], @@ -126,7 +126,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t scene_data = StepBuilder( id="detect_scenes", phase=PROCESSING, - module="preprocessor.steps.video.scene_detection:SceneDetectorStep", + module="preprocessor.steps.video.scene_detection_step:SceneDetectorStep", description="Detects scene changes using TransNetV2", produces=["scene_detections/{season}/{episode}.json"], needs=[transcoded_videos], @@ -139,7 +139,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t exported_frames = StepBuilder( id="export_frames", phase=PROCESSING, - module="preprocessor.steps.video.frame_export:FrameExporterStep", + module="preprocessor.steps.video.frame_export_step:FrameExporterStep", description="Exports frames (PNG) at scene boundaries", produces=["frames/{season}/{episode}/*.png"], needs=[scene_data], @@ -149,7 +149,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: 
disable=t transcription_data = StepBuilder( id="transcribe", phase=PROCESSING, - module="preprocessor.steps.text.transcription:TranscriptionStep", + module="preprocessor.steps.text.transcription_step:TranscriptionStep", description=f"Audio transcription using {series_config.processing.transcription.mode}", produces=["transcriptions/{season}/{episode}.json"], needs=[transcoded_videos], @@ -165,7 +165,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t separated_audio = StepBuilder( id="separate_sounds", phase=PROCESSING, - module="preprocessor.steps.audio.separation:SoundSeparationStep", + module="preprocessor.steps.audio.separation_step:SoundSeparationStep", description="Separates dialogue from sound effects", produces=["separated_audio/{season}/{episode}/"], needs=[transcription_data], @@ -175,7 +175,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t text_stats = StepBuilder( id="analyze_text", phase=PROCESSING, - module="preprocessor.steps.text.analysis:TextAnalysisStep", + module="preprocessor.steps.text.analysis_step:TextAnalysisStep", description="Analyzes text statistics (word frequency, sentiment)", produces=["text_analysis/{season}/{episode}.json"], needs=[transcription_data], @@ -185,7 +185,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t text_embeddings = StepBuilder( id="text_embeddings", phase=PROCESSING, - module="preprocessor.steps.text.embeddings:TextEmbeddingStep", + module="preprocessor.steps.text.embeddings_step:TextEmbeddingStep", description="Generates text embeddings using Qwen3-VL-Embedding", produces=["embeddings/text/{season}/{episode}.npy"], needs=[text_stats], @@ -201,7 +201,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t image_hashes = StepBuilder( id="image_hashing", phase=PROCESSING, - module="preprocessor.steps.vision.image_hashing:ImageHashStep", + 
module="preprocessor.steps.vision.image_hashing_step:ImageHashStep", description="Perceptual frame hashing (phash, dhash, wavelet)", produces=["hashes/{season}/{episode}.json"], needs=[exported_frames], @@ -211,7 +211,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t video_embeddings = StepBuilder( id="video_embeddings", phase=PROCESSING, - module="preprocessor.steps.vision.embeddings:VideoEmbeddingStep", + module="preprocessor.steps.vision.embeddings_step:VideoEmbeddingStep", description="Visual embeddings using Qwen3-VL-Embedding", produces=["embeddings/vision/{season}/{episode}.npy"], needs=[exported_frames, image_hashes], @@ -225,7 +225,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t character_detections = StepBuilder( id="detect_characters", phase=PROCESSING, - module="preprocessor.steps.vision.character_detection:CharacterDetectorStep", + module="preprocessor.steps.vision.character_detection_step:CharacterDetectorStep", description="Recognizes characters in frames using InsightFace", produces=["detections/characters/{season}/{episode}.json"], needs=[exported_frames], @@ -235,7 +235,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t emotion_data = StepBuilder( id="detect_emotions", phase=PROCESSING, - module="preprocessor.steps.vision.emotion_detection:EmotionDetectionStep", + module="preprocessor.steps.vision.emotion_detection_step:EmotionDetectionStep", description="Detects emotions on faces using EmoNet", produces=["detections/emotions/{season}/{episode}.json"], needs=[exported_frames], @@ -245,7 +245,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t face_clusters = StepBuilder( id="cluster_faces", phase=PROCESSING, - module="preprocessor.steps.vision.face_clustering:FaceClusteringStep", + module="preprocessor.steps.vision.face_clustering_step:FaceClusteringStep", description="Face clustering using HDBSCAN", 
produces=["clusters/faces/{season}/{episode}.json"], needs=[exported_frames], @@ -255,7 +255,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t object_detections = StepBuilder( id="detect_objects", phase=PROCESSING, - module="preprocessor.steps.vision.object_detection:ObjectDetectionStep", + module="preprocessor.steps.vision.object_detection_step:ObjectDetectionStep", description="General object detection using D-FINE", produces=["detections/objects/{season}/{episode}.json"], needs=[exported_frames], @@ -265,7 +265,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t elastic_documents = StepBuilder( id="generate_elastic_docs", phase=INDEXING, - module="preprocessor.steps.search.document_generation:DocumentGeneratorStep", + module="preprocessor.steps.search.document_generation_step:DocumentGeneratorStep", description="Combines all data into Elasticsearch documents", produces=["elastic_documents/{season}/{episode}.ndjson"], needs=[ @@ -282,7 +282,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t episode_archives = StepBuilder( id="generate_archives", phase=INDEXING, - module="preprocessor.steps.packaging.archives:ArchiveGenerationStep", + module="preprocessor.steps.packaging.archives_step:ArchiveGenerationStep", description="Creates ZIP archives per episode (all artifacts)", produces=["archives/{season}/{episode}.zip"], needs=[elastic_documents], @@ -292,7 +292,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t indexed_data = StepBuilder( id="index_to_elasticsearch", phase=INDEXING, - module="preprocessor.steps.search.indexing:ElasticsearchIndexerStep", + module="preprocessor.steps.search.indexing_step:ElasticsearchIndexerStep", description="Indexes documents into Elasticsearch", produces=[""], needs=[elastic_documents], diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py 
b/preprocessor/steps/analysis/resolution_analysis_step.py index eaf5f66a3..ab78ffb35 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -1,3 +1,6 @@ +from collections import Counter +from datetime import datetime +import json from pathlib import Path from typing import List @@ -18,19 +21,20 @@ def execute( context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') context.logger.info('=' * 80) - video_paths = self._find_video_files(context) + video_paths = self.__find_video_files(context) if not video_paths: context.logger.warning('No video files found - skipping resolution analysis') context.mark_step_completed(self.name, 'all') return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) - resolutions = self._scan_resolutions(video_paths, context) + resolutions = self.__scan_resolutions(video_paths, context) if not resolutions: context.logger.warning('Failed to analyze resolutions - skipping') context.mark_step_completed(self.name, 'all') return ResolutionAnalysisResult(total_files=len(video_paths), upscaling_percentage=0.0) - upscaling_pct = self._analyze_and_report(resolutions, context) + upscaling_pct = self.__analyze_and_report(resolutions, context) + self.__save_results_to_json(resolutions, upscaling_pct, context) context.mark_step_completed(self.name, 'all') return ResolutionAnalysisResult(total_files=len(resolutions), upscaling_percentage=upscaling_pct) @@ -44,7 +48,7 @@ def is_global(self) -> bool: return True @staticmethod - def _find_video_files(context: ExecutionContext) -> List[Path]: + def __find_video_files(context: ExecutionContext) -> List[Path]: input_base = PathService.get_input_base() series_path = input_base / context.series_name @@ -60,7 +64,7 @@ def _find_video_files(context: ExecutionContext) -> List[Path]: return sorted(video_files) @staticmethod - def _scan_resolutions( + def __scan_resolutions( video_paths: List[Path], 
context: ExecutionContext, ) -> List[tuple[int, int, str]]: resolutions = [] @@ -80,11 +84,9 @@ def _scan_resolutions( return resolutions - def _analyze_and_report( + def __analyze_and_report( self, resolutions: List[tuple[int, int, str]], context: ExecutionContext, ) -> float: - from collections import Counter # pylint: disable=import-outside-toplevel - resolution_counts = Counter((w, h) for w, h, _ in resolutions) total_episodes = len(resolutions) @@ -104,7 +106,7 @@ def _analyze_and_report( for (width, height), count in resolution_counts.most_common(): pct = (count / total_episodes) * 100 - label = self._get_resolution_label(width, height) + label = self.__get_resolution_label(width, height) context.logger.info( f' {width}x{height} ({label}): {count} episodes ({pct:.1f}%)', ) @@ -112,7 +114,7 @@ def _analyze_and_report( context.logger.info('') context.logger.info( f'Target Resolution: {target_width}x{target_height} ' - f'({self._get_resolution_label(target_width, target_height)})', + f'({self.__get_resolution_label(target_width, target_height)})', ) if upscaling_pct > 50: @@ -137,7 +139,7 @@ def _analyze_and_report( return upscaling_pct @staticmethod - def _get_resolution_label(width: int, height: int) -> str: + def __get_resolution_label(width: int, height: int) -> str: resolution_labels = { (7680, 4320): '8K', (3840, 2160): '4K', @@ -164,3 +166,70 @@ def _get_resolution_label(width: int, height: int) -> str: if height >= 450: return 'SD' return 'Low' + + def __save_results_to_json( + self, + resolutions: List[tuple[int, int, str]], + upscaling_pct: float, + context: ExecutionContext, + ) -> None: + output_base = PathService.get_output_base() + output_dir = output_base / context.series_name + output_dir.mkdir(parents=True, exist_ok=True) + output_file = output_dir / 'resolution_analysis.json' + + resolution_counts = Counter((w, h) for w, h, _ in resolutions) + total_episodes = len(resolutions) + + target_width = self.config.resolution.width + target_height = 
self.config.resolution.height + target_pixels = target_width * target_height + + upscaling_count = sum( + 1 for w, h, _ in resolutions + if (w * h) < target_pixels + ) + + source_resolutions = [ + { + 'width': width, + 'height': height, + 'count': count, + 'percentage': round((count / total_episodes) * 100, 1), + 'label': self.__get_resolution_label(width, height), + } + for (width, height), count in resolution_counts.most_common() + ] + + files_details = [ + { + 'filename': filename, + 'width': width, + 'height': height, + 'label': self.__get_resolution_label(width, height), + 'needs_upscaling': (width * height) < target_pixels, + } + for width, height, filename in sorted(resolutions, key=lambda x: x[2]) + ] + + result = { + 'analysis_date': datetime.now().isoformat(), + 'series_name': context.series_name, + 'target_resolution': { + 'width': target_width, + 'height': target_height, + 'label': self.__get_resolution_label(target_width, target_height), + }, + 'source_resolutions': source_resolutions, + 'total_files': total_episodes, + 'upscaling_required': { + 'count': upscaling_count, + 'percentage': round(upscaling_pct, 1), + }, + 'files': files_details, + } + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2, ensure_ascii=False) + + context.logger.info(f'Resolution analysis saved to: {output_file}') diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index e22692aa4..b17b53be2 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -5,7 +5,7 @@ from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor +from preprocessor.services.characters.reference_downloader import 
CharacterReferenceDownloader class CharacterReferenceStep( @@ -53,22 +53,23 @@ def _process_character_references( output_dir: Path, context: ExecutionContext, ) -> None: - context.logger.info(f"Processing character references from {characters_path}") + context.logger.info(f"Downloading character references from {characters_path}") - processor = CharacterReferenceProcessor( + downloader = CharacterReferenceDownloader( { - "characters_file": characters_path, + "characters_json": characters_path, "output_dir": output_dir, "search_engine": self.config.search_engine, "images_per_character": self.config.images_per_character, + "series_name": context.series_name, }, ) - exit_code = processor.work() + exit_code = downloader.work() if exit_code != 0: raise RuntimeError( - f"Character reference processor failed with exit code {exit_code}", + f"Character reference downloader failed with exit code {exit_code}", ) context.logger.info(f"Character references saved to: {output_dir}") diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index cd198e131..1a47328c5 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -16,7 +16,7 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): - def execute( + def execute( # pylint: disable=too-many-locals self, input_data: SourceVideo, context: ExecutionContext, ) -> TranscodedVideo: output_path = self._get_output_path(input_data, context) @@ -27,6 +27,24 @@ def execute( probe_data = FFmpegWrapper.probe_video(input_data.path) target_fps = self._calculate_target_fps(probe_data, context) is_upscaling, source_pixels, target_pixels = self._detect_upscaling(probe_data) + + source_width, source_height = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + effective_width = int(source_width * sar_num / sar_denom) + + if is_upscaling: + 
context.logger.info( + f'{input_data.episode_id}: Source {effective_width}x{source_height} ' + f'({source_pixels:,} px) → Target {self.config.resolution.width}x{self.config.resolution.height} ' + f'({target_pixels:,} px) - UPSCALING DETECTED', + ) + else: + context.logger.info( + f'{input_data.episode_id}: Source {effective_width}x{source_height} ' + f'({source_pixels:,} px) → Target {self.config.resolution.width}x{self.config.resolution.height} ' + f'({target_pixels:,} px) - No upscaling', + ) + video_bitrate, minrate, maxrate, bufsize = self._adjust_video_bitrate( probe_data, context, is_upscaling, source_pixels, target_pixels, ) From b7dc97e141a23599447519b80b30a476820504d2 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 13:16:34 +0100 Subject: [PATCH 24/89] Improve FFmpeg, interlace detection & transcode Multiple changes to improve video quality, interlacing analysis and transcoding consistency: - series config: disable forced deinterlace for kiepscy.json. - FFmpegWrapper: add default audio sample rate (48 kHz), tweak detect_interlacing (default 60s, command ordering), include -ar in transcode, enable sws_flags, embed color metadata and video_track_timescale, adjust aq-strength, enforce closed GOP (strict_gop/forced-idr/no-scenecut), and refine deinterlace/scaler filters for better results. - ResolutionAnalysisStep: refactor to return richer per-file video_info (width/height, field_order, idet stats, needs_deinterlace, metadata match), run idet per file, validate metadata vs idet, improve logging and include interlacing_analysis in the JSON output. - VideoTranscoderStep: add informational logs about forced parameters, force target FPS to 24.0, revise bitrate/upscaling heuristics (pixel_ratio/quality_boost and min bitrate adjustments), update deinterlace detection messaging to use idet (first 60s) and handle metadata mismatches, always pass target_fps to transcode, and small signature/lint cleanups. 
Overall these changes aim to produce more consistent, color-accurate outputs, more reliable deinterlacing decisions based on idet, and smarter bitrate scaling for upscaling scenarios. --- preprocessor/series_configs/kiepscy.json | 2 +- preprocessor/services/media/ffmpeg.py | 44 +++-- .../analysis/resolution_analysis_step.py | 152 ++++++++++++++---- preprocessor/steps/video/transcoding_step.py | 79 ++++++--- 4 files changed, 212 insertions(+), 65 deletions(-) diff --git a/preprocessor/series_configs/kiepscy.json b/preprocessor/series_configs/kiepscy.json index 1d7d5ee4f..cdcd91357 100644 --- a/preprocessor/series_configs/kiepscy.json +++ b/preprocessor/series_configs/kiepscy.json @@ -8,7 +8,7 @@ }, "processing": { "transcode": { - "force_deinterlace": true + "force_deinterlace": false } }, "scraping": { diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 7801ab460..4f5490dcf 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -14,6 +14,7 @@ class FFmpegWrapper: __AQ_STRENGTH = '15' __AUDIO_CHANNELS = '2' + __AUDIO_SAMPLE_RATE = '48000' __BF = '2' __B_ADAPT = '1' __LEVEL = '4.1' @@ -25,18 +26,16 @@ class FFmpegWrapper: @staticmethod def detect_interlacing( video_path: Path, - analysis_time: Optional[int] = None, + analysis_time: Optional[int] = 60, threshold: float = 0.15, ) -> Tuple[bool, Optional[Dict[str, Any]]]: - cmd = [ - 'ffmpeg', - '-i', str(video_path), - ] + cmd = ['ffmpeg'] - if analysis_time: + if analysis_time is not None: cmd.extend(['-t', str(analysis_time)]) cmd.extend([ + '-i', str(video_path), '-vf', 'idet', '-an', '-f', 'null', @@ -141,7 +140,7 @@ def probe_video(video_path: Path) -> Dict[str, Any]: return json.loads(result.stdout) @staticmethod - def transcode( # pylint: disable=too-many-arguments + def transcode( # pylint: disable=too-many-arguments,too-many-locals input_path: Path, output_path: Path, codec: str, @@ -156,6 +155,7 @@ def transcode( # pylint: 
disable=too-many-arguments target_fps: Optional[float] = None, deinterlace: bool = False, is_upscaling: bool = False, + log_command: bool = False, ) -> None: width, height = [int(x) for x in resolution.split(':')] vf_filter = FFmpegWrapper.__build_video_filter(width, height, deinterlace, is_upscaling) @@ -170,6 +170,16 @@ def transcode( # pylint: disable=too-many-arguments audio_bitrate, vf_filter, output_path, ), ) + + if log_command: + print('ffmpeg \\') + for i, arg in enumerate(command[1:], 1): + if i == len(command) - 1: + print(f' {arg}') + else: + print(f' {arg} \\') + print() + subprocess.run(command, check=True, capture_output=False) @staticmethod @@ -180,6 +190,7 @@ def __build_audio_and_output_params( '-c:a', 'aac', '-b:a', audio_bitrate, '-ac', FFmpegWrapper.__AUDIO_CHANNELS, + '-ar', FFmpegWrapper.__AUDIO_SAMPLE_RATE, '-vf', vf_filter, '-movflags', '+faststart', '-f', 'mp4', @@ -192,12 +203,18 @@ def __build_base_command( ) -> List[str]: command = [ 'ffmpeg', '-v', 'error', '-stats', '-hide_banner', '-y', + '-sws_flags', 'accurate_rnd+full_chroma_int+full_chroma_inp', '-i', str(input_path), '-c:v', codec, '-preset', preset, '-profile:v', FFmpegWrapper.__PROFILE, '-level', FFmpegWrapper.__LEVEL, '-pix_fmt', FFmpegWrapper.__PIX_FMT, + '-colorspace', 'bt709', + '-color_primaries', 'bt709', + '-color_trc', 'bt709', + '-color_range', 'tv', + '-video_track_timescale', '90000', ] if target_fps: command.extend(['-r', str(target_fps)]) @@ -230,7 +247,7 @@ def __build_encoding_params( if is_upscaling: params.extend([ '-rc-lookahead', '60', - '-aq-strength', '18', + '-aq-strength', '15', '-b_ref_mode', 'middle', ]) else: @@ -239,6 +256,12 @@ def __build_encoding_params( '-aq-strength', FFmpegWrapper.__AQ_STRENGTH, ]) + params.extend([ + '-strict_gop', '1', + '-forced-idr', '1', + '-no-scenecut', '1', + ]) + return params @staticmethod @@ -248,9 +271,10 @@ def __build_video_filter( filters = [] if deinterlace: - filters.append('bwdif=mode=0') + 
filters.append('bwdif=mode=0:parity=-1:deint=1') + filters.append('setfield=prog') - scaler_flags = 'lanczos' if is_upscaling else 'bicubic' + scaler_flags = 'spline36+accurate_rnd+full_chroma_int' if is_upscaling else 'bicubic' filters.append( f"scale='iw*sar:ih',scale={width}:{height}:" diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index ab78ffb35..f8d59d5a0 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -2,7 +2,11 @@ from datetime import datetime import json from pathlib import Path -from typing import List +from typing import ( + Dict, + List, + Optional, +) from preprocessor.config.step_configs import TranscodeConfig from preprocessor.core.artifacts import ResolutionAnalysisResult @@ -27,17 +31,17 @@ def execute( context.mark_step_completed(self.name, 'all') return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) - resolutions = self.__scan_resolutions(video_paths, context) - if not resolutions: - context.logger.warning('Failed to analyze resolutions - skipping') + video_info = self.__scan_resolutions(video_paths, context) + if not video_info: + context.logger.warning('Failed to analyze videos - skipping') context.mark_step_completed(self.name, 'all') return ResolutionAnalysisResult(total_files=len(video_paths), upscaling_percentage=0.0) - upscaling_pct = self.__analyze_and_report(resolutions, context) - self.__save_results_to_json(resolutions, upscaling_pct, context) + upscaling_pct = self.__analyze_and_report(video_info, context) + self.__save_results_to_json(video_info, upscaling_pct, context) context.mark_step_completed(self.name, 'all') - return ResolutionAnalysisResult(total_files=len(resolutions), upscaling_percentage=upscaling_pct) + return ResolutionAnalysisResult(total_files=len(video_info), upscaling_percentage=upscaling_pct) @property def name(self) -> str: @@ -66,40 
+70,71 @@ def __find_video_files(context: ExecutionContext) -> List[Path]: @staticmethod def __scan_resolutions( video_paths: List[Path], context: ExecutionContext, - ) -> List[tuple[int, int, str]]: - resolutions = [] + ) -> List[Dict[str, any]]: + video_info = [] for video_path in video_paths: try: probe_data = FFmpegWrapper.probe_video(video_path) width, height = FFmpegWrapper.get_resolution(probe_data) sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + field_order = FFmpegWrapper.get_field_order(probe_data) effective_width = int(width * sar_num / sar_denom) - resolutions.append((effective_width, height, video_path.name)) + + context.logger.info( + f'Analyzing interlacing for {video_path.name} ' + f'(field_order={field_order}, analyzing full video)...', + ) + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing( + video_path, analysis_time=None, + ) + + metadata_vs_reality = ResolutionAnalysisStep.__validate_field_order( + field_order, has_interlacing, idet_stats, + ) + + if metadata_vs_reality != 'match': + context.logger.warning( + f'⚠ {video_path.name}: field_order={field_order} but idet says {metadata_vs_reality}!', + ) + + video_info.append({ + 'filename': video_path.name, + 'width': effective_width, + 'height': height, + 'field_order': field_order, + 'needs_deinterlace': has_interlacing, + 'idet_stats': idet_stats, + 'metadata_match': metadata_vs_reality, + }) except Exception as e: # pylint: disable=broad-except context.logger.warning(f'Failed to probe {video_path.name}: {e}') continue - return resolutions + return video_info def __analyze_and_report( - self, resolutions: List[tuple[int, int, str]], context: ExecutionContext, + self, video_info: List[Dict[str, any]], context: ExecutionContext, ) -> float: - resolution_counts = Counter((w, h) for w, h, _ in resolutions) - total_episodes = len(resolutions) + resolution_counts = Counter((v['width'], v['height']) for v in video_info) + total_episodes = len(video_info) 
target_width = self.config.resolution.width target_height = self.config.resolution.height target_pixels = target_width * target_height upscaling_count = sum( - 1 for w, h, _ in resolutions - if (w * h) < target_pixels + 1 for v in video_info + if (v['width'] * v['height']) < target_pixels ) upscaling_pct = (upscaling_count / total_episodes) * 100 if total_episodes > 0 else 0 + needs_deinterlace_count = sum(1 for v in video_info if v['needs_deinterlace']) + progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) + metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') + context.logger.info('') context.logger.info('Source Resolution Distribution:') context.logger.info('-' * 60) @@ -134,10 +169,49 @@ def __analyze_and_report( '(enhanced quality params will be used)', ) + context.logger.info('') + context.logger.info('Interlacing Analysis (based on idet, not metadata):') + context.logger.info('-' * 60) + context.logger.info( + f' Progressive: {progressive_count} episodes ' + f'({(progressive_count/total_episodes)*100:.1f}%)', + ) + context.logger.info( + f' Interlaced (needs deinterlace): {needs_deinterlace_count} episodes ' + f'({(needs_deinterlace_count/total_episodes)*100:.1f}%)', + ) + + if metadata_mismatch_count > 0: + context.logger.warning('') + context.logger.warning( + f'⚠ WARNING: {metadata_mismatch_count} episodes have INCORRECT field_order metadata!', + ) + context.logger.warning( + '⚠ Using idet analysis instead of metadata for deinterlacing decisions.', + ) + context.logger.info('=' * 80) return upscaling_pct + @staticmethod + def __validate_field_order( + field_order: str, has_interlacing: bool, idet_stats: Optional[Dict[str, int]], + ) -> str: + if not idet_stats: + return 'unknown' + + metadata_says_progressive = field_order in {'progressive', 'unknown'} + idet_says_progressive = not has_interlacing + + if metadata_says_progressive and idet_says_progressive: + return 'match' + if not 
metadata_says_progressive and not idet_says_progressive: + return 'match' + if metadata_says_progressive and not idet_says_progressive: + return 'interlaced (metadata wrong)' + return 'progressive (metadata wrong)' + @staticmethod def __get_resolution_label(width: int, height: int) -> str: resolution_labels = { @@ -167,9 +241,9 @@ def __get_resolution_label(width: int, height: int) -> str: return 'SD' return 'Low' - def __save_results_to_json( + def __save_results_to_json( # pylint: disable=too-many-locals self, - resolutions: List[tuple[int, int, str]], + video_info: List[Dict[str, any]], upscaling_pct: float, context: ExecutionContext, ) -> None: @@ -178,18 +252,22 @@ def __save_results_to_json( output_dir.mkdir(parents=True, exist_ok=True) output_file = output_dir / 'resolution_analysis.json' - resolution_counts = Counter((w, h) for w, h, _ in resolutions) - total_episodes = len(resolutions) + resolution_counts = Counter((v['width'], v['height']) for v in video_info) + total_episodes = len(video_info) target_width = self.config.resolution.width target_height = self.config.resolution.height target_pixels = target_width * target_height upscaling_count = sum( - 1 for w, h, _ in resolutions - if (w * h) < target_pixels + 1 for v in video_info + if (v['width'] * v['height']) < target_pixels ) + needs_deinterlace_count = sum(1 for v in video_info if v['needs_deinterlace']) + progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) + metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') + source_resolutions = [ { 'width': width, @@ -203,13 +281,17 @@ def __save_results_to_json( files_details = [ { - 'filename': filename, - 'width': width, - 'height': height, - 'label': self.__get_resolution_label(width, height), - 'needs_upscaling': (width * height) < target_pixels, + 'filename': v['filename'], + 'width': v['width'], + 'height': v['height'], + 'label': self.__get_resolution_label(v['width'], v['height']), + 
'needs_upscaling': (v['width'] * v['height']) < target_pixels, + 'field_order': v['field_order'], + 'needs_deinterlace': v['needs_deinterlace'], + 'metadata_match': v['metadata_match'], + 'idet_stats': v['idet_stats'], } - for width, height, filename in sorted(resolutions, key=lambda x: x[2]) + for v in sorted(video_info, key=lambda x: x['filename']) ] result = { @@ -226,6 +308,20 @@ def __save_results_to_json( 'count': upscaling_count, 'percentage': round(upscaling_pct, 1), }, + 'interlacing_analysis': { + 'progressive': { + 'count': progressive_count, + 'percentage': round((progressive_count / total_episodes) * 100, 1), + }, + 'interlaced': { + 'count': needs_deinterlace_count, + 'percentage': round((needs_deinterlace_count / total_episodes) * 100, 1), + }, + 'metadata_mismatches': { + 'count': metadata_mismatch_count, + 'percentage': round((metadata_mismatch_count / total_episodes) * 100, 1), + }, + }, 'files': files_details, } diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 1a47328c5..a7bf35766 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -15,6 +15,7 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): + _command_logged = False def execute( # pylint: disable=too-many-locals self, input_data: SourceVideo, context: ExecutionContext, @@ -51,6 +52,14 @@ def execute( # pylint: disable=too-many-locals audio_bitrate = self._adjust_audio_bitrate(probe_data, context) deinterlace = self._determine_deinterlace(input_data, context, probe_data) + context.logger.info( + 'Video: SAR 1:1 (square pixels), timebase 1/90000, ' + 'colorspace bt709, color_range tv, closed GOP=12 frames (0.5s) with IDR keyframes ' + '(forced for frame-accurate cutting & concat)', + ) + context.logger.info( + f'Audio: AAC {audio_bitrate} kbps, 2 channels (stereo), 48 kHz sample rate (forced)', + ) context.logger.info(f'Transcoding 
{input_data.episode_id}') self._perform_transcode( input_data.path, @@ -96,12 +105,13 @@ def _calculate_target_fps( context: ExecutionContext, ) -> float: input_fps = FFmpegWrapper.get_framerate(probe_data) - target_fps = min(input_fps, 30.0) - if target_fps < input_fps: + target_fps = 24.0 + + if input_fps != target_fps: context.logger.info( - f'Input FPS ({input_fps}) > 30. ' - f'Limiting to {target_fps} FPS for compatibility and smaller file size.', + f'Input FPS ({input_fps:.2f}) → forcing {target_fps} FPS for consistency and cinematic quality.', ) + return target_fps def _detect_upscaling(self, probe_data: Dict[str, Any]) -> tuple[bool, int, int]: @@ -169,15 +179,21 @@ def _calculate_upscale_bitrate( target_res = (self.config.resolution.width, self.config.resolution.height) min_required = __MIN_BITRATE_FOR_RESOLUTION.get(target_res, 2.0) + pixel_ratio = target_pixels / source_pixels + + if pixel_ratio > 1.4: + min_required *= 1.25 + elif pixel_ratio > 1.2: + min_required *= 1.15 source_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) - pixel_ratio = target_pixels / source_pixels + quality_boost = 1.2 + max(0.0, (pixel_ratio - 1.1) * 0.4) if source_bitrate: - calculated = source_bitrate * pixel_ratio * 1.2 + calculated = source_bitrate * pixel_ratio * quality_boost upscaled_bitrate = max(calculated, min_required) else: - upscaled_bitrate = min_required * 1.2 + upscaled_bitrate = min_required * max(1.2, pixel_ratio * 0.9) max_allowed = self.config.video_bitrate_mbps * 1.3 upscaled_bitrate = min(upscaled_bitrate, max_allowed) @@ -186,10 +202,10 @@ def _calculate_upscale_bitrate( context.logger.warning( f'⚠ UPSCALING: {source_pixels:,} px → {target_pixels:,} px ' - f'(+{((target_pixels/source_pixels)-1)*100:.1f}%). ' + f'(+{((target_pixels/source_pixels)-1)*100:.1f}%, quality_boost={quality_boost:.2f}). ' f'Bitrate: {source_bitrate or "N/A"} → {upscaled_bitrate:.2f} Mbps ' f'(min for {target_res[0]}x{target_res[1]}: {min_required} Mbps). 
' - f'Using Lanczos scaler + enhanced nvenc params.', + f'Using Spline36 scaler (flicker-free) + enhanced nvenc params.', ) return ( @@ -224,27 +240,33 @@ def _determine_deinterlace( field_order = FFmpegWrapper.get_field_order(probe_data) if self.config.force_deinterlace: - if field_order == 'progressive': - context.logger.warning( - f"⚠ Force deinterlacing enabled for {input_data.episode_id} " - f"but video is progressive (field_order={field_order}). " - f"This may degrade quality unnecessarily.", - ) - else: - context.logger.info( - f"Force deinterlacing enabled for {input_data.episode_id} - " - f"skipping interlace detection and applying bwdif filter unconditionally", - ) + context.logger.info( + f"Force deinterlacing enabled for {input_data.episode_id} (field_order={field_order}) - " + f"skipping idet analysis and applying bwdif filter unconditionally", + ) return True - context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") + context.logger.info( + f"Detecting interlacing for {input_data.episode_id} " + f"(field_order={field_order}, analyzing first 60s)...", + ) has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) + if idet_stats: + metadata_says_progressive = field_order in {'progressive', 'unknown'} + idet_says_progressive = not has_interlacing + + if metadata_says_progressive != idet_says_progressive: + context.logger.warning( + f"⚠ {input_data.episode_id}: field_order={field_order} but idet detected " + f"{'interlaced' if has_interlacing else 'progressive'} content! 
Using idet result.", + ) + if has_interlacing and idet_stats: context.logger.info( f"Interlacing detected for {input_data.episode_id} " f"({idet_stats['ratio']*100:.1f}% interlaced frames: " - f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}) - " + f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}, Progressive={idet_stats['progressive']}) - " f"applying bwdif deinterlacing filter", ) elif idet_stats: @@ -282,8 +304,12 @@ def _perform_transcode( # pylint: disable=too-many-arguments context.mark_step_started(self.name, input_data.episode_id, [str(temp_path)]) try: - probe_data = FFmpegWrapper.probe_video(input_path) - input_fps = FFmpegWrapper.get_framerate(probe_data) + log_command = not VideoTranscoderStep._command_logged + if log_command: + VideoTranscoderStep._command_logged = True + context.logger.info('=' * 80) + context.logger.info('FFmpeg command example (showing once):') + context.logger.info('=' * 80) FFmpegWrapper.transcode( input_path=input_path, @@ -296,10 +322,11 @@ def _perform_transcode( # pylint: disable=too-many-arguments maxrate=f'{maxrate}M', bufsize=f'{bufsize}M', audio_bitrate=f'{audio_bitrate}k', - gop_size=int(target_fps * self.config.gop_size), - target_fps=target_fps if target_fps < input_fps else None, + gop_size=int(target_fps * 0.5), + target_fps=target_fps, deinterlace=deinterlace, is_upscaling=is_upscaling, + log_command=log_command, ) temp_path.replace(output_path) except BaseException: From 83ee83151a83baee9d2c12fa635c1780ae29a18f Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 13:18:49 +0100 Subject: [PATCH 25/89] Update kiepscy.json --- preprocessor/series_configs/kiepscy.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/series_configs/kiepscy.json b/preprocessor/series_configs/kiepscy.json index cdcd91357..8e002c480 100644 --- a/preprocessor/series_configs/kiepscy.json +++ b/preprocessor/series_configs/kiepscy.json @@ -3,7 +3,7 @@ 
"display_name": "\u015awiat wed\u0142ug Kiepskich", "indexing": { "elasticsearch": { - "index_name": "kiepscy_clips" + "index_name": "kiepscy" } }, "processing": { From 9043390390743682c943901c2672adf3d5bf2298 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 21:31:33 +0100 Subject: [PATCH 26/89] Refactor config, IO and search CLI Major refactor to decouple environment/output path handling and simplify CLI/search logic. Moves base output path logic into preprocessor.config.output_paths and introduces Environment.is_docker; adds OutputDirMixin to centralize per-feature output dirs. Introduces SettingsFactory and settings_instance for controlled Settings creation and injects settings into ExecutionContext. Replaces PathManager with PathService and removes several legacy IO modules (detection_io, hashing, path_manager); updates imports across services. Simplifies LLMProvider by unifying combined content building and client usage, and reduces duplication in SearchCommandHandler by adding a generic _execute_search helper; the CLI search command now builds a SearchConfig/params dataclass and delegates execution. FFmpeg wrapper updated to accept a TranscodeParams object and constants/flags were renamed/tuned. Miscellaneous plumbing and cleanup to align modules with the new config/IO patterns. 
--- preprocessor/app/pipeline_factory.py | 2 +- preprocessor/cli/cli_main.py | 169 +++++++++++------- preprocessor/cli/search_handler.py | 160 ++++++++--------- preprocessor/cli/search_params.py | 63 +++++++ preprocessor/config/config.py | 92 ++++------ preprocessor/config/mixins.py | 15 ++ preprocessor/config/output_paths.py | 15 ++ preprocessor/config/settings_factory.py | 18 ++ preprocessor/config/settings_instance.py | 4 + preprocessor/core/context.py | 13 ++ preprocessor/services/ai/clients.py | 2 +- preprocessor/services/ai/provider.py | 98 +++------- .../services/characters/face_detection.py | 2 +- .../characters/reference_downloader.py | 2 +- preprocessor/services/core/base_processor.py | 4 +- preprocessor/services/core/environment.py | 19 ++ .../services/episodes/episode_manager.py | 88 +-------- preprocessor/services/io/__init__.py | 3 +- preprocessor/services/io/detection_io.py | 30 ---- preprocessor/services/io/files.py | 14 -- preprocessor/services/io/hashing.py | 2 - preprocessor/services/io/path_manager.py | 20 --- preprocessor/services/io/path_service.py | 11 +- preprocessor/services/media/ffmpeg.py | 60 +++---- .../services/media/transcode_params.py | 33 ++++ .../services/scraping/base_scraper.py | 2 +- .../services/scraping/grid_visualizer.py | 2 +- .../services/scraping/reference_processor.py | 90 +++++++--- .../search/clients/elasticsearch_queries.py | 152 +++++++++------- .../search/clients/embedding_service.py | 2 +- .../engines/elevenlabs_engine.py | 2 +- preprocessor/services/validation/validator.py | 6 +- .../validation/validators/base_validator.py | 67 ++++++- .../validators/character_validator.py | 2 +- .../validators/elastic_validator.py | 30 ++-- .../validators/face_cluster_validator.py | 6 +- .../validation/validators/frame_validator.py | 6 +- .../validators/image_hash_validator.py | 2 +- .../validation/validators/object_validator.py | 2 +- .../validation/validators/scene_validator.py | 6 +- .../validators/transcription_validator.py | 
46 ++--- .../validators/validation_helpers.py | 6 +- preprocessor/services/video/emotion_utils.py | 12 +- preprocessor/steps/audio/separation_step.py | 10 +- .../steps/search/document_generation_step.py | 8 +- preprocessor/steps/text/analysis_step.py | 9 +- preprocessor/steps/text/embeddings_step.py | 17 +- preprocessor/steps/text/transcription_step.py | 4 +- preprocessor/steps/video/frame_export_step.py | 4 +- .../steps/video/scene_detection_step.py | 9 +- preprocessor/steps/video/transcoding_step.py | 33 ++-- .../steps/vision/character_detection_step.py | 26 +-- preprocessor/steps/vision/embeddings_step.py | 17 +- .../steps/vision/emotion_detection_step.py | 165 ++++++++++++++++- .../steps/vision/image_hashing_step.py | 12 +- 55 files changed, 956 insertions(+), 738 deletions(-) create mode 100644 preprocessor/cli/search_params.py create mode 100644 preprocessor/config/mixins.py create mode 100644 preprocessor/config/output_paths.py create mode 100644 preprocessor/config/settings_factory.py create mode 100644 preprocessor/config/settings_instance.py create mode 100644 preprocessor/services/core/environment.py delete mode 100644 preprocessor/services/io/detection_io.py delete mode 100644 preprocessor/services/io/hashing.py delete mode 100644 preprocessor/services/io/path_manager.py create mode 100644 preprocessor/services/media/transcode_params.py diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 4d72dc99c..fe75eefa3 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -5,7 +5,7 @@ Phase, StepBuilder, ) -from preprocessor.config.config import get_base_output_dir +from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.series_config import SeriesConfig from preprocessor.config.step_configs import ( ArchiveConfig, diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index b53f4d68f..527ef1df8 100644 --- 
a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -19,6 +19,11 @@ SearchCommandHandler, SearchFilters, ) +from preprocessor.cli.search_params import ( + SearchActionParams, + SearchConfig, + SearchQueryParams, +) from preprocessor.cli.skip_list_builder import SkipListBuilder from preprocessor.config.series_config import SeriesConfig from preprocessor.services.io.path_service import PathService @@ -136,6 +141,78 @@ def __analyze_resolution(series: str) -> None: setup.logger.finalize() +def _execute_search_command(config: SearchConfig) -> None: # pylint: disable=too-many-statements + """Execute search with config object. + + Args: + config: Complete search configuration. + """ + series_config = SeriesConfig.load(config.series) + index_base = series_config.indexing.elasticsearch.index_name + + hash_value = None + if config.query.phash: + hash_value = SearchCommandHandler.compute_perceptual_hash(config.query.phash) + if hash_value is None: + sys.exit(1) + + async def __run() -> None: + es_client = AsyncElasticsearch(hosts=[config.host], verify_certs=False) + + try: + await es_client.ping() + except Exception: + click.echo(f"Cannot connect to Elasticsearch at {config.host}", err=True) + click.echo("Make sure Elasticsearch is running:", err=True) + click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) + sys.exit(1) + + embedding_svc = EmbeddingService() + queries = ElasticsearchQueries(embedding_svc, index_base) + + try: + handler = SearchCommandHandler(es_client, embedding_svc, queries, config.json_output) + + result = None + if config.actions.stats: + result = await handler.handle_stats() + elif config.actions.list_chars_flag: + result = await handler.handle_list_characters() + elif config.actions.list_objects_flag: + result = await handler.handle_list_objects() + elif config.query.text: + result = await handler.handle_text_search(config.query.text, config.filters) + elif config.query.text_semantic: + result = await 
handler.handle_text_semantic_search(config.query.text_semantic, config.filters) + elif config.query.text_to_video: + result = await handler.handle_text_to_video_search(config.query.text_to_video, config.filters) + elif config.query.image: + result = await handler.handle_image_search(config.query.image, config.filters) + elif config.query.emotion: + result = await handler.handle_emotion_search(config.query.emotion, config.filters) + elif config.query.character: + result = await handler.handle_character_search(config.query.character, config.filters) + elif config.query.object_query: + result = await handler.handle_object_search(config.query.object_query, config.filters) + elif hash_value: + result = await handler.handle_hash_search(hash_value, config.filters) + elif config.query.episode_name: + result = await handler.handle_episode_name_search(config.query.episode_name, config.filters) + elif config.query.episode_name_semantic: + result = await handler.handle_episode_name_semantic_search( + config.query.episode_name_semantic, config.filters, + ) + + if result: + click.echo(result) + + finally: + embedding_svc.cleanup() + await es_client.close() + + asyncio.run(__run()) + + @cli.command(name="search") @click.option("--series", required=True, help="Series name (e.g., ranczo, kiepscy)") @click.option("--text", type=str, help="Full-text search by transcriptions") @@ -156,7 +233,7 @@ def __analyze_resolution(series: str) -> None: @click.option("--stats", is_flag=True, help="Show index statistics") @click.option("--json-output", is_flag=True, help="Output in JSON format") @click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") -def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements +def search( # pylint: disable=too-many-arguments,too-many-locals series: str, text: str, text_semantic: str, @@ -177,76 +254,32 @@ def search( # pylint: disable=too-many-arguments,too-many-locals,too-many-state json_output: 
bool, host: str, ) -> None: - if not any([ - text, text_semantic, text_to_video, image, phash, character, emotion, - object_query, episode_name, episode_name_semantic, list_chars_flag, list_objects_flag, stats, - ]): + """Search command entry point - Click requires all parameters.""" + config = SearchConfig( + series=series, + query=SearchQueryParams( + text=text, + text_semantic=text_semantic, + text_to_video=text_to_video, + image=image, + phash=phash, + character=character, + emotion=emotion, + object_query=object_query, + episode_name=episode_name, + episode_name_semantic=episode_name_semantic, + ), + filters=SearchFilters(season, episode, character, limit), + actions=SearchActionParams(list_chars_flag, list_objects_flag, stats), + json_output=json_output, + host=host, + ) + + if not config.has_any_operation(): click.echo("Provide at least one search option. Use --help", err=True) sys.exit(1) - series_config = SeriesConfig.load(series) - index_base = series_config.indexing.elasticsearch.index_name - - hash_value = None - if phash: - hash_value = SearchCommandHandler.compute_perceptual_hash(phash) - if hash_value is None: - sys.exit(1) - - async def __run() -> None: - es_client = AsyncElasticsearch(hosts=[host], verify_certs=False) - - try: - await es_client.ping() - except Exception: - click.echo(f"Cannot connect to Elasticsearch at {host}", err=True) - click.echo("Make sure Elasticsearch is running:", err=True) - click.echo(" docker-compose -f docker-compose.test.yml up -d", err=True) - sys.exit(1) - - embedding_svc = EmbeddingService() - queries = ElasticsearchQueries(embedding_svc, index_base) - - try: - handler = SearchCommandHandler(es_client, embedding_svc, queries, json_output) - filters = SearchFilters(season, episode, character, limit) - - result = None - if stats: - result = await handler.handle_stats() - elif list_chars_flag: - result = await handler.handle_list_characters() - elif list_objects_flag: - result = await handler.handle_list_objects() - 
elif text: - result = await handler.handle_text_search(text, filters) - elif text_semantic: - result = await handler.handle_text_semantic_search(text_semantic, filters) - elif text_to_video: - result = await handler.handle_text_to_video_search(text_to_video, filters) - elif image: - result = await handler.handle_image_search(image, filters) - elif emotion: - result = await handler.handle_emotion_search(emotion, filters) - elif character: - result = await handler.handle_character_search(character, filters) - elif object_query: - result = await handler.handle_object_search(object_query, filters) - elif hash_value: - result = await handler.handle_hash_search(hash_value, filters) - elif episode_name: - result = await handler.handle_episode_name_search(episode_name, filters) - elif episode_name_semantic: - result = await handler.handle_episode_name_semantic_search(episode_name_semantic, filters) - - if result: - click.echo(result) - - finally: - embedding_svc.cleanup() - await es_client.close() - - asyncio.run(__run()) + _execute_search_command(config) _CLI_TEMPLATE_SERIES = "ranczo" diff --git a/preprocessor/cli/search_handler.py b/preprocessor/cli/search_handler.py index a7cf50bfe..9f6fb4b54 100644 --- a/preprocessor/cli/search_handler.py +++ b/preprocessor/cli/search_handler.py @@ -1,6 +1,9 @@ +import json from pathlib import Path from typing import ( Any, + Awaitable, + Callable, Dict, List, Optional, @@ -44,9 +47,30 @@ def __init__( self._queries = queries self._json_output = json_output - async def handle_stats(self) -> str: - import json # pylint: disable=import-outside-toplevel + async def _execute_search( + self, + search_func: Callable[..., Awaitable[Dict[str, Any]]], + result_type: str, + result_key: str = "hits", + ) -> str: + """Generic search executor - reduces duplication. + + Args: + search_func: Async function that executes the search query. + result_type: Type of result for console formatting. 
+ result_key: Key to extract from result for JSON output (default: "hits"). + + Returns: + Formatted search results (JSON or console output). + """ + result = await search_func() + + if self._json_output: + return json.dumps(result.get(result_key, result), indent=2) + + return self._format_console_output(result, result_type) + async def handle_stats(self) -> str: result = await self._queries.get_stats(self._es) if self._json_output: return json.dumps(result, indent=2) @@ -59,8 +83,6 @@ async def handle_stats(self) -> str: return "\n".join(output) async def handle_list_characters(self) -> str: - import json # pylint: disable=import-outside-toplevel - chars = await self._queries.list_characters(self._es) if self._json_output: return json.dumps(chars, indent=2) @@ -71,8 +93,6 @@ async def handle_list_characters(self) -> str: return "\n".join(output) async def handle_list_objects(self) -> str: - import json # pylint: disable=import-outside-toplevel - objects = await self._queries.list_objects(self._es) if self._json_output: return json.dumps(objects, indent=2) @@ -83,112 +103,82 @@ async def handle_list_objects(self) -> str: return "\n".join(output) async def handle_text_search(self, query: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_text_query( - self._es, query, filters.season, filters.episode, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_text_query( + self._es, query, filters.season, filters.episode, filters.limit, + ), + result_type="text", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "text") async def handle_text_semantic_search(self, query: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_text_semantic( - self._es, query, filters.season, filters.episode, filters.limit, + 
return await self._execute_search( + search_func=lambda: self._queries.search_text_semantic( + self._es, query, filters.season, filters.episode, filters.limit, + ), + result_type="text_semantic", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "text_semantic") async def handle_text_to_video_search(self, query: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_text_to_video( - self._es, query, filters.season, filters.episode, filters.character, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_text_to_video( + self._es, query, filters.season, filters.episode, filters.character, filters.limit, + ), + result_type="video", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "video") async def handle_image_search(self, image_path: Path, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_video_semantic( - self._es, str(image_path), filters.season, filters.episode, filters.character, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_video_semantic( + self._es, str(image_path), filters.season, filters.episode, filters.character, filters.limit, + ), + result_type="video", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "video") async def handle_emotion_search(self, emotion: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_by_emotion( - self._es, emotion, filters.season, filters.episode, filters.character, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_by_emotion( + self._es, emotion, 
filters.season, filters.episode, filters.character, filters.limit, + ), + result_type="video", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "video") async def handle_character_search(self, character: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_by_character( - self._es, character, filters.season, filters.episode, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_by_character( + self._es, character, filters.season, filters.episode, filters.limit, + ), + result_type="video", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "video") async def handle_object_search(self, object_query: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_by_object( - self._es, object_query, filters.season, filters.episode, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_by_object( + self._es, object_query, filters.season, filters.episode, filters.limit, + ), + result_type="video", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "video") async def handle_hash_search(self, hash_value: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_perceptual_hash(self._es, hash_value, filters.limit) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "video") + return await self._execute_search( + search_func=lambda: self._queries.search_perceptual_hash(self._es, hash_value, filters.limit), + result_type="video", + ) async def handle_episode_name_search(self, episode_name: str, filters: 
SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_episode_name( - self._es, episode_name, filters.season, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_episode_name( + self._es, episode_name, filters.season, filters.limit, + ), + result_type="episode_name", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "episode_name") async def handle_episode_name_semantic_search(self, episode_name: str, filters: SearchFilters) -> str: - import json # pylint: disable=import-outside-toplevel - - result = await self._queries.search_episode_name_semantic( - self._es, episode_name, filters.season, filters.limit, + return await self._execute_search( + search_func=lambda: self._queries.search_episode_name_semantic( + self._es, episode_name, filters.season, filters.limit, + ), + result_type="episode_name", ) - if self._json_output: - return json.dumps(result["hits"], indent=2) - - return self._format_console_output(result, "episode_name") @staticmethod def compute_perceptual_hash(phash_input: str) -> Optional[str]: diff --git a/preprocessor/cli/search_params.py b/preprocessor/cli/search_params.py new file mode 100644 index 000000000..00b85905d --- /dev/null +++ b/preprocessor/cli/search_params.py @@ -0,0 +1,63 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +from preprocessor.cli.search_handler import SearchFilters + + +@dataclass +class SearchQueryParams: + + text: Optional[str] = None + text_semantic: Optional[str] = None + text_to_video: Optional[str] = None + image: Optional[Path] = None + phash: Optional[str] = None + character: Optional[str] = None + emotion: Optional[str] = None + object_query: Optional[str] = None + episode_name: Optional[str] = None + episode_name_semantic: Optional[str] = None + + def has_search_criteria(self) -> bool: + return 
any([ + self.text, + self.text_semantic, + self.text_to_video, + self.image, + self.phash, + self.character, + self.emotion, + self.object_query, + self.episode_name, + self.episode_name_semantic, + ]) + + +@dataclass +class SearchActionParams: + + list_chars_flag: bool = False + list_objects_flag: bool = False + stats: bool = False + + def has_action(self) -> bool: + return any([ + self.list_chars_flag, + self.list_objects_flag, + self.stats, + ]) + + +@dataclass +class SearchConfig: + + series: str + query: SearchQueryParams + filters: SearchFilters + actions: SearchActionParams + json_output: bool = False + host: str = "http://localhost:9200" + + def has_any_operation(self) -> bool: + return self.query.has_search_criteria() or self.actions.has_action() diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 223fc7c98..1dd6c62fa 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -6,6 +6,7 @@ from pathlib import Path from typing import ( Any, + ClassVar, Dict, List, Optional, @@ -14,19 +15,9 @@ from pydantic import SecretStr +from preprocessor.config.mixins import OutputDirMixin from preprocessor.services.media.resolution import Resolution -is_docker = os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' -BASE_OUTPUT_DIR = Path('/app/output_data') if is_docker else Path('preprocessor/output_data') - -def get_base_output_dir(series_name: Optional[str]=None) -> Path: - base = Path('/app/output_data') if is_docker else Path('preprocessor/output_data') - if series_name: - return base / series_name.lower() - return base - -def get_output_path(relative_path: str, series_name: Optional[str]=None) -> Path: - return get_base_output_dir(series_name) / relative_path @dataclass class ElasticDocumentSubdirs: @@ -73,26 +64,22 @@ def api_key(self) -> Optional[str]: return self._api_key.get_secret_value() if self._api_key else None @dataclass -class TranscodeSettings: +class TranscodeSettings(OutputDirMixin): + 
OUTPUT_SUBDIR: ClassVar[str] = 'transcoded_videos' + audio_bitrate_kbps: int = 128 codec: str = 'h264_nvenc' gop_size: float = 0.5 target_duration_seconds: float = 100.0 target_file_size_mb: float = 50.0 - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'transcoded_videos' - @dataclass -class SceneDetectionSettings: +class SceneDetectionSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'scene_timestamps' + min_scene_len: int = 10 threshold: float = 0.5 - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'scene_timestamps' - @dataclass class SceneChangesSettings: frames_per_scene: int = 1 @@ -103,23 +90,19 @@ class KeyframeExtractionSettings: strategy: str = 'scene_changes' @dataclass -class FrameExportSettings: - resolution: Resolution = Resolution.R1080P +class FrameExportSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'exported_frames' - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'exported_frames' + resolution: Resolution = Resolution.R1080P @dataclass -class TranscriptionSettings: +class TranscriptionSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'transcriptions' + device: str = 'cuda' language: str = 'Polish' model: str = 'large-v3-turbo' - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'transcriptions' - @dataclass class WhisperSettings: model: str = 'large-v3-turbo' @@ -163,32 +146,28 @@ class EmbeddingModelSettings: tensor_parallel_size: int = 1 @dataclass -class EmbeddingSettings: +class EmbeddingSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'embeddings' + batch_size: int = 32 generate_full_episode_embedding: bool = True prefetch_chunks: int = 2 progress_sub_batch_size: int = 100 text_batch_size: int = 64 - @staticmethod - def get_output_dir(series_name: str) -> Path: - return 
get_base_output_dir(series_name) / 'embeddings' - @dataclass class FaceRecognitionSettings: detection_size: Tuple[int, int] = (1280, 1280) model_name: str = 'buffalo_l' @dataclass -class FaceClusteringSettings: +class FaceClusteringSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'face_clusters' + min_cluster_size: int = 5 min_samples: int = 3 save_noise: bool = True - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'face_clusters' - @dataclass class EmotionDetectionSettings: model_name: str = 'enet_b2_8' @@ -199,32 +178,25 @@ def _from_env(cls) -> 'EmotionDetectionSettings': return cls(model_name=model_name) @dataclass -class CharacterSettings: +class CharacterSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'characters' + face_detection_threshold: float = 0.2 frame_detection_threshold: float = 0.55 normalized_face_size: Tuple[int, int] = (112, 112) reference_images_per_character: int = 3 reference_matching_threshold: float = 0.5 - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'characters' - @dataclass -class ObjectDetectionSettings: +class ObjectDetectionSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'object_detections' + conf_threshold: float = 0.3 model_name: str = 'ustc-community/dfine-xlarge-obj2coco' - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'object_detections' - @dataclass -class ImageHashSettings: - - @staticmethod - def get_output_dir(series_name: str) -> Path: - return get_base_output_dir(series_name) / 'image_hashes' +class ImageHashSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'image_hashes' @dataclass class ImageScraperSettings(BaseAPISettings): @@ -249,11 +221,8 @@ def _from_env(cls) -> 'ImageScraperSettings': return cls(_api_key=api_key) @dataclass -class ScraperSettings: - - @staticmethod - def get_output_dir(series_name: 
str) -> Path: - return get_base_output_dir(series_name) / 'scraped_pages' +class ScraperSettings(OutputDirMixin): + OUTPUT_SUBDIR: ClassVar[str] = 'scraped_pages' @dataclass class ElasticsearchSettings: @@ -386,4 +355,3 @@ class IndexConfig: def to_dict(self) -> Dict[str, Any]: return {'name': self.name, 'transcription_jsons': str(self.transcription_jsons), 'dry_run': self.dry_run, 'append': self.append} -settings = Settings._from_env() diff --git a/preprocessor/config/mixins.py b/preprocessor/config/mixins.py new file mode 100644 index 000000000..80f210050 --- /dev/null +++ b/preprocessor/config/mixins.py @@ -0,0 +1,15 @@ +from pathlib import Path +from typing import ClassVar + +from preprocessor.config.output_paths import get_base_output_dir + + +class OutputDirMixin: + + OUTPUT_SUBDIR: ClassVar[str] + + @classmethod + def get_output_dir(cls, series_name: str) -> Path: + if not hasattr(cls, 'OUTPUT_SUBDIR'): + raise NotImplementedError(f"{cls.__name__} must define OUTPUT_SUBDIR class variable") + return get_base_output_dir(series_name) / cls.OUTPUT_SUBDIR diff --git a/preprocessor/config/output_paths.py b/preprocessor/config/output_paths.py new file mode 100644 index 000000000..1012cdb30 --- /dev/null +++ b/preprocessor/config/output_paths.py @@ -0,0 +1,15 @@ +from pathlib import Path +from typing import Optional + +from preprocessor.services.core.environment import Environment + +BASE_OUTPUT_DIR = Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data') + +def get_base_output_dir(series_name: Optional[str]=None) -> Path: + base = Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data') + if series_name: + return base / series_name.lower() + return base + +def get_output_path(relative_path: str, series_name: Optional[str]=None) -> Path: + return get_base_output_dir(series_name) / relative_path diff --git a/preprocessor/config/settings_factory.py b/preprocessor/config/settings_factory.py new file 
mode 100644 index 000000000..55a5a003b --- /dev/null +++ b/preprocessor/config/settings_factory.py @@ -0,0 +1,18 @@ +from typing import Optional + +from preprocessor.config.config import Settings + + +class SettingsFactory: + + _instance: Optional[Settings] = None + + @staticmethod + def get_settings() -> Settings: + if SettingsFactory._instance is None: + SettingsFactory._instance = Settings._from_env() + return SettingsFactory._instance + + @staticmethod + def reset(new_settings: Optional[Settings] = None) -> None: + SettingsFactory._instance = new_settings diff --git a/preprocessor/config/settings_instance.py b/preprocessor/config/settings_instance.py new file mode 100644 index 000000000..e8e1ea29d --- /dev/null +++ b/preprocessor/config/settings_instance.py @@ -0,0 +1,4 @@ +from preprocessor.config.config import Settings +from preprocessor.config.settings_factory import SettingsFactory + +settings: Settings = SettingsFactory.get_settings() diff --git a/preprocessor/core/context.py b/preprocessor/core/context.py index 195ce21b3..95dceb05f 100644 --- a/preprocessor/core/context.py +++ b/preprocessor/core/context.py @@ -5,6 +5,8 @@ Optional, ) +from preprocessor.config.config import Settings +from preprocessor.config.settings_factory import SettingsFactory from preprocessor.services.core.logging import ErrorHandlingLogger if TYPE_CHECKING: @@ -20,12 +22,14 @@ def __init__( logger: ErrorHandlingLogger, state_manager: Optional['StateManager'] = None, force_rerun: bool = False, + settings: Optional[Settings] = None, ) -> None: self._series_name: str = series_name self._base_output_dir: Path = base_output_dir / series_name self._state_manager: Optional['StateManager'] = state_manager self._force_rerun: bool = force_rerun self._logger: ErrorHandlingLogger = logger + self._settings: Settings = settings or SettingsFactory.get_settings() @property def force_rerun(self) -> bool: @@ -73,6 +77,15 @@ def mark_step_started( def series_name(self) -> str: return 
self._series_name + @property + def settings(self) -> Settings: + """Get settings instance. + + Returns: + The active Settings instance for this context. + """ + return self._settings + @property def state_manager(self) -> Optional['StateManager']: return self._state_manager diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index 923bdbf21..862452d4b 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -14,7 +14,7 @@ SamplingParams, ) -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.ui.console import console diff --git a/preprocessor/services/ai/provider.py b/preprocessor/services/ai/provider.py index 6488c5635..9adbcc68b 100644 --- a/preprocessor/services/ai/provider.py +++ b/preprocessor/services/ai/provider.py @@ -15,12 +15,6 @@ extract_all_seasons_user, extract_characters_system, extract_characters_user, - extract_episode_metadata_system, - extract_episode_metadata_user, - extract_season_system, - extract_season_user, - merge_episode_data_system, - merge_episode_data_user, ) from preprocessor.services.ai.clients import ( BaseLLMClient, @@ -31,31 +25,23 @@ AllSeasonsMetadata, CharacterInfo, CharactersList, - EpisodeMetadata, SeasonMetadata, ) from preprocessor.services.ui.console import console class LLMProvider: - __client: Optional[BaseLLMClient] = None - __instance: Optional['LLMProvider'] = None def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> None: self._parser_mode = parser_mode or ParserMode.NORMAL - if self.__client is None: - if self._parser_mode == ParserMode.PREMIUM: - self.__client = GeminiClient() - else: - self.__client = VLLMClient(model_name=model_name) + if self._parser_mode == ParserMode.PREMIUM: + self._client: BaseLLMClient = GeminiClient() + else: + self._client: BaseLLMClient = VLLMClient(model_name=model_name) def 
extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[List[SeasonMetadata]]: - combined_content = '' - for i, page in enumerate(scraped_pages, 1): - url = page['url'] - markdown = page['markdown'] - combined_content += f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n' + combined_content = self.__build_combined_content(scraped_pages) result = self.__process_llm_request( system_prompt=extract_all_seasons_system.get(), @@ -73,11 +59,7 @@ def extract_characters( scraped_pages: List[Dict[str, Any]], series_name: str, ) -> Optional[List[CharacterInfo]]: - combined_content = '' - for i, page in enumerate(scraped_pages, 1): - url = page['url'] - markdown = page['markdown'] - combined_content += f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n' + combined_content = self.__build_combined_content(scraped_pages) result = self.__process_llm_request( system_prompt=extract_characters_system.get(), @@ -91,18 +73,24 @@ def extract_characters( ) return result.characters if result else None - def __new__(cls, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> 'LLMProvider': - if cls.__instance is None: - cls.__instance = super().__new__(cls) - return cls.__instance + @staticmethod + def __build_combined_content(scraped_pages: List[Dict[str, Any]]) -> str: + """Build combined markdown from scraped pages. + + Args: + scraped_pages: List of scraped page dictionaries with 'url' and 'markdown' keys. - def __extract_episode_metadata(self, page_text: str, url: str) -> Optional[EpisodeMetadata]: # pylint: disable=unused-private-member - return self.__process_llm_request( - system_prompt=extract_episode_metadata_system.get(), - user_prompt=extract_episode_metadata_user.get().format(url=url, page_text=page_text), - response_model=EpisodeMetadata, - error_context=f'extraction failed for {url}', - ) + Returns: + Combined content with source separators. 
+ """ + combined_parts: List[str] = [] + for i, page in enumerate(scraped_pages, 1): + url: str = page['url'] + markdown: str = page['markdown'] + combined_parts.append( + f'\n\n=== SOURCE {i}: {url} ===\n\n{markdown}\n', + ) + return ''.join(combined_parts) @staticmethod def __extract_json(content: str) -> Dict[str, Any]: @@ -123,41 +111,6 @@ def __extract_json(content: str) -> Dict[str, Any]: console.print(f'[yellow]Raw content:\n{content}[/yellow]') raise - def __extract_season_episodes(self, page_text: str, url: str) -> Optional[SeasonMetadata]: # pylint: disable=unused-private-member - return self.__process_llm_request( - system_prompt=extract_season_system.get(), - user_prompt=extract_season_user.get().format(url=url, page_text=page_text), - response_model=SeasonMetadata, - error_context=f'extraction failed for {url}', - ) - - def __merge_episode_data(self, metadata_list: List[EpisodeMetadata]) -> EpisodeMetadata: # pylint: disable=unused-private-member - if not metadata_list: - raise ValueError('No metadata to merge') - if len(metadata_list) == 1: - return metadata_list[0] - - combined_text = '\n\n---\n\n'.join([ - f'Source {i + 1}:\n' - f'Title: {m.title}\n' - f'Description: {m.description}\n' - f'Summary: {m.summary}\n' - f'Season: {m.season}\n' - f'Episode: {m.episode_number}' - for i, m in enumerate(metadata_list) - ]) - - result = self.__process_llm_request( - system_prompt=merge_episode_data_system.get(), - user_prompt=merge_episode_data_user.get().format( - num_sources=len(metadata_list), - combined_text=combined_text, - ), - response_model=EpisodeMetadata, - error_context='merge failed', - ) - return result if result else metadata_list[0] - def __process_llm_request( self, system_prompt: str, @@ -165,15 +118,12 @@ def __process_llm_request( response_model: Type[BaseModel], error_context: str, ) -> Optional[BaseModel]: - if self.__client is None: - raise RuntimeError('LLM client not initialized') - try: messages = [ {'role': 'system', 'content': 
system_prompt}, {'role': 'user', 'content': user_prompt}, ] - content = self.__client.generate(messages) + content = self._client.generate(messages) data = self.__extract_json(content) return response_model(**data) except Exception as e: diff --git a/preprocessor/services/characters/face_detection.py b/preprocessor/services/characters/face_detection.py index a70a391bc..9acb35b77 100644 --- a/preprocessor/services/characters/face_detection.py +++ b/preprocessor/services/characters/face_detection.py @@ -14,7 +14,7 @@ from numpy.linalg import norm import onnxruntime as ort -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.ui.console import console warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index 557078ab3..ca6da287b 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -22,7 +22,7 @@ sync_playwright, ) -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( BaseImageSearch, diff --git a/preprocessor/services/core/base_processor.py b/preprocessor/services/core/base_processor.py index 4328b2385..60ca635fc 100644 --- a/preprocessor/services/core/base_processor.py +++ b/preprocessor/services/core/base_processor.py @@ -16,7 +16,7 @@ from preprocessor.config.constants import SUPPORTED_VIDEO_EXTENSIONS from preprocessor.core.state_manager import StateManager from preprocessor.services.core.logging import ErrorHandlingLogger -from preprocessor.services.io.path_manager import PathManager +from 
preprocessor.services.io.path_service import PathService from preprocessor.services.ui.console import ( SimpleProgress, console, @@ -58,7 +58,7 @@ def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, self.logger = ErrorHandlingLogger(class_name=class_name, loglevel=loglevel, error_exit_code=error_exit_code) self.state_manager: Optional[StateManager] = args.get('state_manager') self.series_name: str = args.get('series_name', 'unknown') - self.path_manager: PathManager = args.get('path_manager', PathManager(self.series_name)) + self.path_manager: PathService = args.get('path_manager', PathService(self.series_name)) self.progress = args.get('progress_tracker', ProgressTracker()) def cleanup(self) -> None: diff --git a/preprocessor/services/core/environment.py b/preprocessor/services/core/environment.py new file mode 100644 index 000000000..c101a83f7 --- /dev/null +++ b/preprocessor/services/core/environment.py @@ -0,0 +1,19 @@ +import os +from typing import Optional + + +class Environment: + + _is_docker_cached: Optional[bool] = None + + @staticmethod + def is_docker() -> bool: + if Environment._is_docker_cached is None: + Environment._is_docker_cached = ( + os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' + ) + return Environment._is_docker_cached + + @staticmethod + def reset_cache() -> None: + Environment._is_docker_cached = None diff --git a/preprocessor/services/episodes/episode_manager.py b/preprocessor/services/episodes/episode_manager.py index 0f210861e..d1330ac0e 100644 --- a/preprocessor/services/episodes/episode_manager.py +++ b/preprocessor/services/episodes/episode_manager.py @@ -5,17 +5,15 @@ from typing import ( Any, Dict, - List, Optional, ) from preprocessor.config.constants import ( - SUPPORTED_VIDEO_EXTENSIONS, EpisodeMetadataKeys, EpisodesDataKeys, ) from preprocessor.services.core.logging import ErrorHandlingLogger -from preprocessor.services.io.path_manager import PathManager +from 
preprocessor.services.io.path_service import PathService @dataclass @@ -45,7 +43,7 @@ class EpisodeManager: def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: Optional[ErrorHandlingLogger]=None) -> None: self.series_name = series_name.lower() self.episodes_data: Optional[Dict[str, Any]] = None - self.path_manager = PathManager(self.series_name) + self.path_manager = PathService(self.series_name) self._logger: Optional[ErrorHandlingLogger] = logger if episodes_info_json and episodes_info_json.exists(): with open(episodes_info_json, 'r', encoding='utf-8') as f: @@ -120,85 +118,3 @@ def __create_episode_info( premiere_date=premiere_date, viewership=viewership, ) - - @staticmethod - def __find_scene_timestamps_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: - if not search_dir.exists(): - return None - episode_code = episode_info.episode_code() - pattern = f'**/*{episode_code}*_scenes.json' - for scene_file in search_dir.glob(pattern): - return scene_file - return None - - def __find_transcription_file(self, episode_info: EpisodeInfo, search_dir: Path, prefer_segmented: bool=True) -> Optional[Path]: # pylint: disable=unused-private-member - if not search_dir.exists(): - return None - season_dir_name = episode_info.season_code() - season_dir = search_dir / season_dir_name - if not season_dir.exists(): - return None - if prefer_segmented: - segmented = season_dir / self.path_manager.build_filename(episode_info, extension='json', suffix='segmented') - if segmented.exists(): - return segmented - regular = season_dir / self.path_manager.build_filename(episode_info, extension='json') - if regular.exists(): - return regular - return None - - @staticmethod - def __find_video_file(episode_info: EpisodeInfo, search_dir: Path) -> Optional[Path]: # pylint: disable=unused-private-member - if not search_dir.exists(): - return None - if search_dir.is_file(): - return search_dir - episode_code = episode_info.episode_code() - 
season_dir_name = episode_info.season_code() - search_dirs = [search_dir / season_dir_name, search_dir] - for dir_path in search_dirs: - if not dir_path.exists(): - continue - for ext in SUPPORTED_VIDEO_EXTENSIONS: - for video_file in dir_path.glob(f'*{ext}'): - if re.search(episode_code, video_file.name, re.IGNORECASE): - return video_file - return None - - def __list_all_episodes(self) -> List[EpisodeInfo]: # pylint: disable=unused-private-member - episodes: List[EpisodeInfo] = [] - if not self.episodes_data: - return episodes - for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): - season_num = season_data.get(EpisodesDataKeys.SEASON_NUMBER, 1) - season_episodes = sorted(season_data.get(EpisodesDataKeys.EPISODES, []), key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0)) - for idx, ep_data in enumerate(season_episodes): - episodes.append( - self.__create_episode_info( - season=season_num, - relative_episode=idx + 1, - title=ep_data.get(EpisodeMetadataKeys.TITLE), - premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), - viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), - ), - ) - return episodes - - @staticmethod - def __load_scene_timestamps( # pylint: disable=unused-private-member - episode_info: EpisodeInfo, - search_dir: Optional[Path], - _logger: Optional[ErrorHandlingLogger]=None, - ) -> Optional[List[Dict[str, Any]]]: - if not search_dir: - return None - scene_file = EpisodeManager.__find_scene_timestamps_file(episode_info, search_dir) - if not scene_file: - return None - try: - with open(scene_file, 'r', encoding='utf-8') as f: - return json.load(f) - except (OSError, json.JSONDecodeError) as e: - if _logger: - _logger.error(f'Failed to load scene timestamps: {e}') - return None diff --git a/preprocessor/services/io/__init__.py b/preprocessor/services/io/__init__.py index c209731a0..d9b335f39 100644 --- a/preprocessor/services/io/__init__.py +++ b/preprocessor/services/io/__init__.py @@ -1,4 +1,3 @@ -from 
preprocessor.services.io.path_manager import PathManager from preprocessor.services.io.path_service import PathService -__all__ = ['PathManager', 'PathService'] +__all__ = ['PathService'] diff --git a/preprocessor/services/io/detection_io.py b/preprocessor/services/io/detection_io.py deleted file mode 100644 index d7b660c91..000000000 --- a/preprocessor/services/io/detection_io.py +++ /dev/null @@ -1,30 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - List, -) - -from insightface.app import FaceAnalysis -import numpy as np - -from preprocessor.services.characters.face_detection import FaceDetector - - -def process_frames_for_detection( - frame_files: List[Path], - face_app: FaceAnalysis, - character_vectors: Dict[str, np.ndarray], - threshold: float, -) -> List[Dict[str, Any]]: - results: List[Dict[str, Any]] = [] - for frame_path in frame_files: - detections: List[Dict[str, Any]] = FaceDetector.detect_characters_in_frame( - frame_path, - face_app, - character_vectors, - threshold, - ) - if detections: - results.append({'frame': frame_path.name, 'faces': detections}) - return results diff --git a/preprocessor/services/io/files.py b/preprocessor/services/io/files.py index e5dda10ad..851480f8f 100644 --- a/preprocessor/services/io/files.py +++ b/preprocessor/services/io/files.py @@ -32,17 +32,3 @@ def __atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: if temp_path.exists(): temp_path.unlink() raise - - @staticmethod - def __atomic_write_text(path: Path, content: str) -> None: # pylint: disable=unused-private-member - - def __write(temp: Path) -> None: - with open(temp, 'w', encoding='utf-8') as f: - f.write(content) - FileOperations.__atomic_write(path, __write) - -def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None: - FileOperations.atomic_write_json(path, data, indent) - -def load_json(path: Path) -> Dict[str, Any]: - return FileOperations.load_json(path) diff --git 
a/preprocessor/services/io/hashing.py b/preprocessor/services/io/hashing.py deleted file mode 100644 index e31f77ee3..000000000 --- a/preprocessor/services/io/hashing.py +++ /dev/null @@ -1,2 +0,0 @@ -class HashStorage: - pass diff --git a/preprocessor/services/io/path_manager.py b/preprocessor/services/io/path_manager.py deleted file mode 100644 index 72c66a4fd..000000000 --- a/preprocessor/services/io/path_manager.py +++ /dev/null @@ -1,20 +0,0 @@ -from pathlib import Path -from typing import TYPE_CHECKING - -from preprocessor.services.io.path_service import PathService - -if TYPE_CHECKING: - from preprocessor.services.episodes.episode_manager import EpisodeInfo - - -class PathManager: - def __init__(self, series_name: str) -> None: - self._service: PathService = PathService(series_name) - - def build_filename( - self, episode_info: 'EpisodeInfo', extension: str = 'json', suffix: str = '', - ) -> str: - return self._service.build_filename(episode_info, extension, suffix) - - def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: - return self._service.get_episode_dir(episode_info, subdir) diff --git a/preprocessor/services/io/path_service.py b/preprocessor/services/io/path_service.py index 1de748476..1f6f808bd 100644 --- a/preprocessor/services/io/path_service.py +++ b/preprocessor/services/io/path_service.py @@ -1,8 +1,8 @@ -import os from pathlib import Path from typing import TYPE_CHECKING -from preprocessor.config.config import get_base_output_dir +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.services.core.environment import Environment if TYPE_CHECKING: from preprocessor.services.episodes.episode_manager import EpisodeInfo @@ -26,11 +26,8 @@ def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: @staticmethod def get_input_base() -> Path: - return Path('/input_data') if PathService._is_docker() else Path('preprocessor/input_data') + return Path('/input_data') if 
Environment.is_docker() else Path('preprocessor/input_data') @staticmethod def get_output_base() -> Path: - return Path('/app/output_data') if PathService._is_docker() else Path('preprocessor/output_data') - @staticmethod - def _is_docker() -> bool: - return os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' + return Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data') diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 4f5490dcf..ea1d25176 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -10,13 +10,15 @@ Tuple, ) +from preprocessor.services.media.transcode_params import TranscodeParams + class FFmpegWrapper: - __AQ_STRENGTH = '15' + __ADAPTIVE_QUANTIZATION_STRENGTH = '15' __AUDIO_CHANNELS = '2' __AUDIO_SAMPLE_RATE = '48000' - __BF = '2' - __B_ADAPT = '1' + __B_FRAMES = '2' + __B_ADAPT_MODE = '1' __LEVEL = '4.1' __PIX_FMT = 'yuv420p' __PROFILE = 'high' @@ -140,38 +142,36 @@ def probe_video(video_path: Path) -> Dict[str, Any]: return json.loads(result.stdout) @staticmethod - def transcode( # pylint: disable=too-many-arguments,too-many-locals - input_path: Path, - output_path: Path, - codec: str, - preset: str, - resolution: str, - video_bitrate: str, - minrate: str, - maxrate: str, - bufsize: str, - audio_bitrate: str, - gop_size: int, - target_fps: Optional[float] = None, - deinterlace: bool = False, - is_upscaling: bool = False, - log_command: bool = False, - ) -> None: - width, height = [int(x) for x in resolution.split(':')] - vf_filter = FFmpegWrapper.__build_video_filter(width, height, deinterlace, is_upscaling) - command = FFmpegWrapper.__build_base_command(input_path, codec, preset, target_fps) + def transcode(params: TranscodeParams) -> None: + """Transcode video with parameter object. + + Args: + params: Transcoding parameters. 
+ """ + width, height = params.get_resolution_tuple() + vf_filter = FFmpegWrapper.__build_video_filter( + width, height, params.deinterlace, params.is_upscaling, + ) + command = FFmpegWrapper.__build_base_command( + params.input_path, params.codec, params.preset, params.target_fps, + ) command.extend( FFmpegWrapper.__build_encoding_params( - video_bitrate, minrate, maxrate, bufsize, gop_size, is_upscaling, + params.video_bitrate, + params.minrate, + params.maxrate, + params.bufsize, + params.gop_size, + params.is_upscaling, ), ) command.extend( FFmpegWrapper.__build_audio_and_output_params( - audio_bitrate, vf_filter, output_path, + params.audio_bitrate, vf_filter, params.output_path, ), ) - if log_command: + if params.log_command: print('ffmpeg \\') for i, arg in enumerate(command[1:], 1): if i == len(command) - 1: @@ -235,8 +235,8 @@ def __build_encoding_params( '-minrate', minrate, '-maxrate', maxrate, '-bufsize', bufsize, - '-bf', FFmpegWrapper.__BF, - '-b_adapt', FFmpegWrapper.__B_ADAPT, + '-bf', FFmpegWrapper.__B_FRAMES, + '-b_adapt', FFmpegWrapper.__B_ADAPT_MODE, '-2pass', FFmpegWrapper.__TWO_PASS, '-multipass', 'fullres', '-g', str(gop_size), @@ -253,7 +253,7 @@ def __build_encoding_params( else: params.extend([ '-rc-lookahead', FFmpegWrapper.__RC_LOOKAHEAD, - '-aq-strength', FFmpegWrapper.__AQ_STRENGTH, + '-aq-strength', FFmpegWrapper.__ADAPTIVE_QUANTIZATION_STRENGTH, ]) params.extend([ @@ -274,7 +274,7 @@ def __build_video_filter( filters.append('bwdif=mode=0:parity=-1:deint=1') filters.append('setfield=prog') - scaler_flags = 'spline36+accurate_rnd+full_chroma_int' if is_upscaling else 'bicubic' + scaler_flags = 'lanczos' if is_upscaling else 'bicubic' filters.append( f"scale='iw*sar:ih',scale={width}:{height}:" diff --git a/preprocessor/services/media/transcode_params.py b/preprocessor/services/media/transcode_params.py new file mode 100644 index 000000000..32e5b082e --- /dev/null +++ b/preprocessor/services/media/transcode_params.py @@ -0,0 +1,33 @@ 
+from dataclasses import dataclass +from pathlib import Path +from typing import ( + Optional, + Tuple, +) + + +@dataclass +class TranscodeParams: + + input_path: Path + output_path: Path + codec: str + preset: str + resolution: str + video_bitrate: str + minrate: str + maxrate: str + bufsize: str + audio_bitrate: str + gop_size: int + target_fps: Optional[float] = None + deinterlace: bool = False + is_upscaling: bool = False + log_command: bool = False + + def get_resolution_tuple(self) -> Tuple[int, int]: + try: + width, height = [int(x) for x in self.resolution.split(':')] + return width, height + except (ValueError, AttributeError) as e: + raise ValueError(f"Invalid resolution format: {self.resolution}") from e diff --git a/preprocessor/services/scraping/base_scraper.py b/preprocessor/services/scraping/base_scraper.py index c520fb9ce..01c4a4fce 100644 --- a/preprocessor/services/scraping/base_scraper.py +++ b/preprocessor/services/scraping/base_scraper.py @@ -9,11 +9,11 @@ Optional, ) -from preprocessor.config.config import settings from preprocessor.config.enums import ( ParserMode, ScraperMethod, ) +from preprocessor.config.settings_instance import settings from preprocessor.services.ai import LLMProvider from preprocessor.services.core.base_processor import BaseProcessor from preprocessor.services.scraping.clipboard import ScraperClipboard diff --git a/preprocessor/services/scraping/grid_visualizer.py b/preprocessor/services/scraping/grid_visualizer.py index 397245cea..fed26d590 100644 --- a/preprocessor/services/scraping/grid_visualizer.py +++ b/preprocessor/services/scraping/grid_visualizer.py @@ -13,7 +13,7 @@ import cv2 import numpy as np -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings @dataclass diff --git a/preprocessor/services/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py index d30ee5ec3..af6e8f246 100644 --- 
a/preprocessor/services/scraping/reference_processor.py +++ b/preprocessor/services/scraping/reference_processor.py @@ -7,6 +7,7 @@ Dict, List, Optional, + Tuple, ) import warnings @@ -14,7 +15,7 @@ from insightface.app import FaceAnalysis import numpy as np -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.models import ( CandidateFace, @@ -291,51 +292,90 @@ def __find_common_face( ) -> Optional[List[FaceData]]: first_image_faces = all_faces[0] remaining_images = all_faces[1:] - candidates = [] + + candidates = self.__find_all_face_candidates(first_image_faces, remaining_images, all_faces) + return self.__select_final_candidate( + candidates, first_image_faces, all_faces, char_name, reference_images, + ) + + def __find_all_face_candidates( + self, + first_image_faces: List[FaceData], + remaining_images: List[List[FaceData]], + all_faces: List[List[FaceData]], + ) -> List[CandidateFace]: + candidates: List[CandidateFace] = [] for first_face in first_image_faces: matched_faces = [first_face] - similarities = [] + similarities: List[float] = [] + for other_image_faces in remaining_images: if not other_image_faces: break - best_match = None - best_similarity: float = -1.0 - for other_face in other_image_faces: - similarity: float = float( - np.dot( - first_face.face_vector, - other_face.face_vector, - ), - ) - if similarity > best_similarity: - best_similarity = similarity - best_match = other_face + + best_match, best_similarity = self.__find_best_matching_face( + first_face, other_image_faces, + ) + if best_match: matched_faces.append(best_match) similarities.append(best_similarity) - if best_similarity < self.similarity_threshold: - console.print( - f'[yellow]Warning: Low similarity {best_similarity:.2f} < ' - f'{self.similarity_threshold:.2f}[/yellow]', - ) + 
self.__warn_if_low_similarity(best_similarity) else: break + if len(matched_faces) == len(all_faces): - avg_similarity = np.mean(similarities) if similarities else 1.0 + avg_similarity = float(np.mean(similarities)) if similarities else 1.0 candidates.append(CandidateFace(faces=matched_faces, avg_similarity=avg_similarity)) + + return candidates + + def __find_best_matching_face( + self, + reference_face: FaceData, + candidate_faces: List[FaceData], + ) -> Tuple[Optional[FaceData], float]: + best_match: Optional[FaceData] = None + best_similarity: float = -1.0 + + for candidate_face in candidate_faces: + similarity: float = float( + np.dot(reference_face.face_vector, candidate_face.face_vector), + ) + if similarity > best_similarity: + best_similarity = similarity + best_match = candidate_face + + return best_match, best_similarity + + def __warn_if_low_similarity(self, similarity: float) -> None: + if similarity < self.similarity_threshold: + console.print( + f'[yellow]Warning: Low similarity {similarity:.2f} < ' + f'{self.similarity_threshold:.2f}[/yellow]', + ) + + def __select_final_candidate( + self, + candidates: List[CandidateFace], + first_image_faces: List[FaceData], + all_faces: List[List[FaceData]], + char_name: str, + reference_images: List[Path], + ) -> Optional[List[FaceData]]: if len(candidates) == 0: if self.interactive: return self.__ask_user_to_select_initial_face( - first_image_faces, - all_faces, - char_name, - reference_images, + first_image_faces, all_faces, char_name, reference_images, ) return None + if len(candidates) == 1: return candidates[0].faces + if self.interactive: return self.__ask_user_to_select_candidate(candidates, char_name) + candidates.sort(key=lambda c: c.avg_similarity, reverse=True) return candidates[0].faces diff --git a/preprocessor/services/search/clients/elasticsearch_queries.py b/preprocessor/services/search/clients/elasticsearch_queries.py index 0a5288abb..3c55cd36c 100644 --- 
a/preprocessor/services/search/clients/elasticsearch_queries.py +++ b/preprocessor/services/search/clients/elasticsearch_queries.py @@ -131,81 +131,19 @@ async def search_by_object( self, es_client: AsyncElasticsearch, object_query: str, - season: Optional[int]=None, - episode: Optional[int]=None, - limit: int=20, + season: Optional[int] = None, + episode: Optional[int] = None, + limit: int = 20, ) -> Dict[str, Any]: filter_clauses = self.__build_episode_filters(season, episode) - must_clauses: List[Dict[str, Any]] = [] - if ':' in object_query: - object_class, count_filter = object_query.split(':', 1) - object_class = object_class.strip() - if count_filter.endswith('+'): - min_count = int(count_filter[:-1]) - must_clauses.append({ - 'nested': { - 'path': 'detected_objects', - 'query': { - 'bool': { - 'must': [ - {'term': {'detected_objects.class': object_class}}, - {'range': {'detected_objects.count': {'gte': min_count}}}, - ], - }, - }, - }, - }) - elif '-' in count_filter: - min_c, max_c = count_filter.split('-') - must_clauses.append({ - 'nested': { - 'path': 'detected_objects', - 'query': { - 'bool': { - 'must': [ - {'term': {'detected_objects.class': object_class}}, - {'range': {'detected_objects.count': {'gte': int(min_c), 'lte': int(max_c)}}}, - ], - }, - }, - }, - }) - else: - exact_count = int(count_filter) - must_clauses.append({ - 'nested': { - 'path': 'detected_objects', - 'query': { - 'bool': { - 'must': [ - {'term': {'detected_objects.class': object_class}}, - {'term': {'detected_objects.count': exact_count}}, - ], - }, - }, - }, - }) - else: - must_clauses.append({ - 'nested': { - 'path': 'detected_objects', - 'query': {'term': {'detected_objects.class': object_query.strip()}}, - }, - }) + object_class, count_filter = self.__parse_object_query(object_query) + must_clauses = [self.__build_object_nested_query(object_class, count_filter)] query_body = {'bool': {'must': must_clauses, 'filter': filter_clauses}} - object_class = 
object_query.split(':')[0].strip() if ':' in object_query else object_query.strip() + return await es_client.search( index=self.__video_frames_index, query=query_body, - sort=[{ - 'detected_objects.count': { - 'order': 'desc', - 'nested': { - 'path': 'detected_objects', - 'filter': {'term': {'detected_objects.class': object_class}}, - }, - }, - }], + sort=[self.__build_object_sort(object_class)], track_scores=True, size=limit, _source=[ @@ -214,6 +152,82 @@ async def search_by_object( ], ) + @staticmethod + def __parse_object_query(object_query: str) -> Tuple[str, Optional[str]]: + if ':' not in object_query: + return object_query.strip(), None + object_class, count_filter = object_query.split(':', 1) + return object_class.strip(), count_filter + + @staticmethod + def __build_object_nested_query(object_class: str, count_filter: Optional[str]) -> Dict[str, Any]: + if count_filter is None: + return { + 'nested': { + 'path': 'detected_objects', + 'query': {'term': {'detected_objects.class': object_class}}, + }, + } + + if count_filter.endswith('+'): + min_count = int(count_filter[:-1]) + return { + 'nested': { + 'path': 'detected_objects', + 'query': { + 'bool': { + 'must': [ + {'term': {'detected_objects.class': object_class}}, + {'range': {'detected_objects.count': {'gte': min_count}}}, + ], + }, + }, + }, + } + + if '-' in count_filter: + min_count, max_count = count_filter.split('-') + return { + 'nested': { + 'path': 'detected_objects', + 'query': { + 'bool': { + 'must': [ + {'term': {'detected_objects.class': object_class}}, + {'range': {'detected_objects.count': {'gte': int(min_count), 'lte': int(max_count)}}}, + ], + }, + }, + }, + } + + exact_count = int(count_filter) + return { + 'nested': { + 'path': 'detected_objects', + 'query': { + 'bool': { + 'must': [ + {'term': {'detected_objects.class': object_class}}, + {'term': {'detected_objects.count': exact_count}}, + ], + }, + }, + }, + } + + @staticmethod + def __build_object_sort(object_class: str) -> 
Dict[str, Any]: + return { + 'detected_objects.count': { + 'order': 'desc', + 'nested': { + 'path': 'detected_objects', + 'filter': {'term': {'detected_objects.class': object_class}}, + }, + }, + } + async def search_episode_name( self, es_client: AsyncElasticsearch, diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index 93c2c3d02..530bca286 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -14,7 +14,7 @@ AutoProcessor, ) -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings class EmbeddingService: diff --git a/preprocessor/services/transcription/engines/elevenlabs_engine.py b/preprocessor/services/transcription/engines/elevenlabs_engine.py index 26255910a..86f39265c 100644 --- a/preprocessor/services/transcription/engines/elevenlabs_engine.py +++ b/preprocessor/services/transcription/engines/elevenlabs_engine.py @@ -10,7 +10,7 @@ from elevenlabs.client import ElevenLabs from elevenlabs.core import ApiError -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.core.logging import ErrorHandlingLogger from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine from preprocessor.services.ui.console import console diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index a535382c7..9f9e34375 100644 --- a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -8,10 +8,10 @@ from rich.console import Console from rich.progress import track -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.episodes import EpisodeManager from preprocessor.services.io.files 
import FileOperations -from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.report_generator import ReportGenerator from preprocessor.services.validation.season_comparator import SeasonComparison @@ -83,7 +83,7 @@ def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]) -> 'warnings': stats.warnings, 'stats': stats.to_dict()['stats'], } - path_manager = PathManager(self.series_name) + path_manager = PathService(self.series_name) report_filename = path_manager.build_filename(stats.episode_info, extension='json') report_path = self.validation_reports_dir / report_filename FileOperations.atomic_write_json(report_path, episode_report) diff --git a/preprocessor/services/validation/validators/base_validator.py b/preprocessor/services/validation/validators/base_validator.py index 47a22466c..69d0fb587 100644 --- a/preprocessor/services/validation/validators/base_validator.py +++ b/preprocessor/services/validation/validators/base_validator.py @@ -2,8 +2,16 @@ ABC, abstractmethod, ) +import json from pathlib import Path -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, + Dict, + Optional, +) + +from preprocessor.services.validation.file_validators import FileValidator if TYPE_CHECKING: from preprocessor.services.validation.episode_stats import EpisodeStats @@ -31,3 +39,60 @@ def _add_warning(stats: 'EpisodeStats', message: str) -> None: @staticmethod def _add_error(stats: 'EpisodeStats', message: str) -> None: stats.errors.append(message) + + @staticmethod + def _validate_json_if_exists( + stats: 'EpisodeStats', + file_path: Path, + error_msg_prefix: str, + ) -> bool: + if not file_path.exists(): + return False + + result = FileValidator.validate_json_file(file_path) + if not result.is_valid: + BaseValidator._add_error(stats, 
f'{error_msg_prefix}: {result.error_message}') + return False + return True + + @staticmethod + def _validate_json_with_warning( + stats: 'EpisodeStats', + file_path: Path, + missing_msg: str, + invalid_msg_prefix: str, + ) -> bool: + if not file_path.exists(): + BaseValidator._add_warning(stats, missing_msg) + return False + + result = FileValidator.validate_json_file(file_path) + if not result.is_valid: + BaseValidator._add_warning(stats, f'{invalid_msg_prefix}: {result.error_message}') + return False + return True + + @staticmethod + def _validate_json_with_error( + stats: 'EpisodeStats', + file_path: Path, + missing_msg: str, + invalid_msg_prefix: str, + ) -> bool: + if not file_path.exists(): + BaseValidator._add_error(stats, missing_msg) + return False + + result = FileValidator.validate_json_file(file_path) + if not result.is_valid: + BaseValidator._add_error(stats, f'{invalid_msg_prefix}: {result.error_message}') + return False + return True + + @staticmethod + def _load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: + try: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) + except Exception: + return None diff --git a/preprocessor/services/validation/validators/character_validator.py b/preprocessor/services/validation/validators/character_validator.py index 8b2a3a610..aa1a9e99a 100644 --- a/preprocessor/services/validation/validators/character_validator.py +++ b/preprocessor/services/validation/validators/character_validator.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.validation.validators.base_validator import BaseValidator from preprocessor.services.validation.validators.validation_helpers import VisualizationValidationHelper diff --git a/preprocessor/services/validation/validators/elastic_validator.py b/preprocessor/services/validation/validators/elastic_validator.py index 
2ad9c2eb9..1e757eadb 100644 --- a/preprocessor/services/validation/validators/elastic_validator.py +++ b/preprocessor/services/validation/validators/elastic_validator.py @@ -2,9 +2,9 @@ from pathlib import Path from typing import TYPE_CHECKING -from preprocessor.config.config import settings from preprocessor.config.constants import OUTPUT_FILE_NAMES -from preprocessor.services.io.path_manager import PathManager +from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator @@ -23,27 +23,29 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__validate_text_statistics(stats) def __validate_character_detections(self, stats: 'EpisodeStats') -> None: - char_detections_dir = PathManager(stats.series_name).get_episode_dir( + char_detections_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.character_detections, ) detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] - if detections_file.exists(): - result = FileValidator.validate_json_file(detections_file) - if not result.is_valid: - self._add_error(stats, f"Invalid {OUTPUT_FILE_NAMES['detections']}: {result.error_message}") + self._validate_json_if_exists( + stats, + detections_file, + error_msg_prefix=f"Invalid {OUTPUT_FILE_NAMES['detections']}", + ) def __validate_embeddings(self, stats: 'EpisodeStats') -> None: - embeddings_dir = PathManager(stats.series_name).get_episode_dir( + embeddings_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.embeddings, ) if embeddings_dir.exists(): embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] - if embeddings_file.exists(): - result = FileValidator.validate_json_file(embeddings_file) - if not result.is_valid: - 
self._add_error(stats, f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}: {result.error_message}") + self._validate_json_if_exists( + stats, + embeddings_file, + error_msg_prefix=f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}", + ) def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: elastic_subdirs = [ @@ -60,7 +62,7 @@ def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: found_elastic_docs = False for subdir in elastic_subdirs: elastic_base = settings.output_subdirs.elastic_documents - elastic_docs_dir = PathManager(stats.series_name).get_episode_dir( + elastic_docs_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, f'{elastic_base}/{subdir}', ) @@ -77,7 +79,7 @@ def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: self._add_warning(stats, f'Missing {settings.output_subdirs.elastic_documents} directory') def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: - transcriptions_dir = PathManager(stats.series_name).get_episode_dir( + transcriptions_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.transcriptions, ) diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py index fd9cb580d..a9bdc6808 100644 --- a/preprocessor/services/validation/validators/face_cluster_validator.py +++ b/preprocessor/services/validation/validators/face_cluster_validator.py @@ -7,8 +7,8 @@ Optional, ) -from preprocessor.config.config import settings -from preprocessor.services.io.path_manager import PathManager +from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator @@ -19,7 +19,7 @@ class FaceClusterValidator(BaseValidator): def 
validate(self, stats: 'EpisodeStats') -> None: - clusters_dir = PathManager(stats.series_name).get_episode_dir( + clusters_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.face_clusters, ) diff --git a/preprocessor/services/validation/validators/frame_validator.py b/preprocessor/services/validation/validators/frame_validator.py index 915439b8b..fbe2ad06f 100644 --- a/preprocessor/services/validation/validators/frame_validator.py +++ b/preprocessor/services/validation/validators/frame_validator.py @@ -1,8 +1,8 @@ from typing import TYPE_CHECKING -from preprocessor.config.config import settings from preprocessor.config.constants import OUTPUT_FILE_PATTERNS -from preprocessor.services.io.path_manager import PathManager +from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator @@ -13,7 +13,7 @@ class FrameValidator(BaseValidator): def validate(self, stats: 'EpisodeStats') -> None: - frames_dir = PathManager(stats.series_name).get_episode_dir( + frames_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.frames, ) diff --git a/preprocessor/services/validation/validators/image_hash_validator.py b/preprocessor/services/validation/validators/image_hash_validator.py index e3bff0456..9e26e6806 100644 --- a/preprocessor/services/validation/validators/image_hash_validator.py +++ b/preprocessor/services/validation/validators/image_hash_validator.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.validation.validators.base_validator import BaseValidator from preprocessor.services.validation.validators.validation_helpers import 
JsonDirectoryValidationHelper diff --git a/preprocessor/services/validation/validators/object_validator.py b/preprocessor/services/validation/validators/object_validator.py index 41ceea434..58bd44b8d 100644 --- a/preprocessor/services/validation/validators/object_validator.py +++ b/preprocessor/services/validation/validators/object_validator.py @@ -1,6 +1,6 @@ from typing import TYPE_CHECKING -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.validation.validators.base_validator import BaseValidator from preprocessor.services.validation.validators.validation_helpers import ( JsonDirectoryValidationHelper, diff --git a/preprocessor/services/validation/validators/scene_validator.py b/preprocessor/services/validation/validators/scene_validator.py index 37c693fa1..f9b32eb3f 100644 --- a/preprocessor/services/validation/validators/scene_validator.py +++ b/preprocessor/services/validation/validators/scene_validator.py @@ -7,9 +7,9 @@ Optional, ) -from preprocessor.config.config import settings from preprocessor.config.constants import OUTPUT_FILE_PATTERNS -from preprocessor.services.io.path_manager import PathManager +from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator @@ -20,7 +20,7 @@ class SceneValidator(BaseValidator): def validate(self, stats: 'EpisodeStats') -> None: - scenes_dir = PathManager(stats.series_name).get_episode_dir( + scenes_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.scenes, ) scenes_file = scenes_dir / f"{stats.series_name}_{stats.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" diff --git a/preprocessor/services/validation/validators/transcription_validator.py 
b/preprocessor/services/validation/validators/transcription_validator.py index 1364e6380..cc1ff3a46 100644 --- a/preprocessor/services/validation/validators/transcription_validator.py +++ b/preprocessor/services/validation/validators/transcription_validator.py @@ -1,14 +1,11 @@ -import json from pathlib import Path from typing import ( TYPE_CHECKING, - Any, Dict, - Optional, ) -from preprocessor.config.config import settings -from preprocessor.services.io.path_manager import PathManager +from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator @@ -19,7 +16,7 @@ class TranscriptionValidator(BaseValidator): def validate(self, stats: 'EpisodeStats') -> None: - transcriptions_dir = PathManager(stats.series_name).get_episode_dir( + transcriptions_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.transcriptions, ) base_name = f'{stats.series_name}_{stats.episode_info.episode_code()}' @@ -69,7 +66,7 @@ def __validate_raw_transcription( self.__extract_transcription_stats(stats, raw_transcription) def __extract_transcription_stats(self, stats: 'EpisodeStats', raw_transcription: Path) -> None: - data = self.__load_json_safely(raw_transcription) + data = self._load_json_safely(raw_transcription) if not data: self._add_error(stats, f'Error reading transcription: {raw_transcription}') return @@ -91,34 +88,23 @@ def __extract_transcription_stats(self, stats: 'EpisodeStats', raw_transcription if segments and segments[-1].get('end'): stats.transcription_duration = segments[-1].get('end', 0.0) - @staticmethod - def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: - try: - with open(file_path, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return None def 
__validate_clean_transcription(self, stats: 'EpisodeStats', clean_transcription_file: Path) -> None: - if not clean_transcription_file.exists(): - self._add_warning( - stats, f'Missing clean transcription file: {clean_transcription_file.name}', - ) - return - - result = FileValidator.validate_json_file(clean_transcription_file) - if not result.is_valid: - self._add_warning(stats, f'Invalid clean transcription JSON: {result.error_message}') + self._validate_json_with_warning( + stats, + clean_transcription_file, + missing_msg=f'Missing clean transcription file: {clean_transcription_file.name}', + invalid_msg_prefix='Invalid clean transcription JSON', + ) def __validate_clean_txt(self, stats: 'EpisodeStats', clean_txt_file: Path) -> None: if not clean_txt_file.exists(): self._add_warning(stats, f'Missing clean transcription txt: {clean_txt_file.name}') def __validate_sound_events(self, stats: 'EpisodeStats', sound_events_file: Path) -> None: - if not sound_events_file.exists(): - self._add_warning(stats, f'Missing sound events file: {sound_events_file.name}') - return - - result = FileValidator.validate_json_file(sound_events_file) - if not result.is_valid: - self._add_warning(stats, f'Invalid sound events JSON: {result.error_message}') + self._validate_json_with_warning( + stats, + sound_events_file, + missing_msg=f'Missing sound events file: {sound_events_file.name}', + invalid_msg_prefix='Invalid sound events JSON', + ) diff --git a/preprocessor/services/validation/validators/validation_helpers.py b/preprocessor/services/validation/validators/validation_helpers.py index c19bb6b29..03865a708 100644 --- a/preprocessor/services/validation/validators/validation_helpers.py +++ b/preprocessor/services/validation/validators/validation_helpers.py @@ -6,7 +6,7 @@ Tuple, ) -from preprocessor.services.io.path_manager import PathManager +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.file_validators import FileValidator if 
TYPE_CHECKING: @@ -24,7 +24,7 @@ def validate_json_directory( exclude_pattern: Optional[str] = None, check_anomalies: bool = True, ) -> None: - dir_path = PathManager(stats.series_name).get_episode_dir(stats.episode_info, subdir) + dir_path = PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) count, sizes, errors = JsonDirectoryValidationHelper._validate_json_files_in_directory( dir_path, exclude_pattern, ) @@ -98,7 +98,7 @@ class VisualizationValidationHelper: def validate_visualizations( stats: 'EpisodeStats', subdir: str, count_attr: str, context_name: str, ) -> None: - viz_dir = PathManager(stats.series_name).get_episode_dir(stats.episode_info, subdir) + viz_dir = PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) total_count, invalid_count, errors = VisualizationValidationHelper._validate_images_in_directory(viz_dir) if total_count == 0 and viz_dir.exists(): diff --git a/preprocessor/services/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py index d28accb76..fd8b79e82 100644 --- a/preprocessor/services/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -8,7 +8,7 @@ from hsemotion_onnx.facial_emotions import HSEmotionRecognizer import numpy as np -from preprocessor.config.config import settings +from preprocessor.config.settings_instance import settings from preprocessor.services.core.logging import ErrorHandlingLogger EMOTION_LABELS = ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise'] @@ -42,7 +42,7 @@ def __clip_bbox( return x1, y1, x2, y2 @staticmethod - def __crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: # pylint: disable=unused-private-member + def _crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: try: x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) height, width = frame.shape[:2] @@ -55,13 +55,13 @@ def __crop_face(frame: np.ndarray, bbox: Dict[str, 
int]) -> Optional[np.ndarray] return None @staticmethod - def __detect_batch( # pylint: disable=unused-private-member + def _detect_batch( face_images: List[np.ndarray], model: HSEmotionRecognizer, batch_size: int = 32, logger: Optional[ErrorHandlingLogger] = None, - ) -> List[Tuple[str, float, Dict[str, float]]]: - results = [] + ) -> List[Optional[Tuple[str, float, Dict[str, float]]]]: + results: List[Optional[Tuple[str, float, Dict[str, float]]]] = [] total = len(face_images) for batch_start in range(0, total, batch_size): batch_end = min(batch_start + batch_size, total) @@ -86,7 +86,7 @@ def __detect_batch( # pylint: disable=unused-private-member return results @staticmethod - def __init_model(logger: Optional[ErrorHandlingLogger]=None) -> HSEmotionRecognizer: # pylint: disable=unused-private-member + def _init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecognizer: model_name = settings.emotion_detection.model_name if logger: logger.info(f'Loading HSEmotion model: {model_name}...') diff --git a/preprocessor/steps/audio/separation_step.py b/preprocessor/steps/audio/separation_step.py index de41a3383..ec13e4c6f 100644 --- a/preprocessor/steps/audio/separation_step.py +++ b/preprocessor/steps/audio/separation_step.py @@ -20,7 +20,7 @@ from preprocessor.core.artifacts import TranscriptionData from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import atomic_write_json +from preprocessor.services.io.files import FileOperations from preprocessor.services.transcription.sound_classification import ( classify_segment, is_sound_event, @@ -130,10 +130,10 @@ def _save_separated_data( clean_data = {'episode_info': episode_info_dict, 'segments': dialogue_segments} sound_data = {'episode_info': episode_info_dict, 'segments': sound_segments} - atomic_write_json(output_paths['clean_json'], clean_data) - atomic_write_json(output_paths['sound_json'], sound_data) - 
atomic_write_json(output_paths['clean_segmented'], clean_data) - atomic_write_json(output_paths['sound_segmented'], sound_data) + FileOperations.atomic_write_json(output_paths['clean_json'], clean_data) + FileOperations.atomic_write_json(output_paths['sound_json'], sound_data) + FileOperations.atomic_write_json(output_paths['clean_segmented'], clean_data) + FileOperations.atomic_write_json(output_paths['sound_segmented'], sound_data) def _generate_additional_formats( self, diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index d8d6f28ee..2897b5092 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -12,7 +12,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import load_json +from preprocessor.services.io.files import FileOperations class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): @@ -82,15 +82,15 @@ def __gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[st clean_filename = f'{context.series_name}_{episode_info.episode_code()}_clean_transcription.json' clean_path = context.get_output_path(episode_info, 'transcriptions/clean', clean_filename) if clean_path.exists(): - data['transcription'] = load_json(clean_path) + data['transcription'] = FileOperations.load_json(clean_path) text_emb_filename = f'{context.series_name}_{episode_info.episode_code()}_embeddings_text.json' text_emb_path = context.get_output_path(episode_info, 'embeddings', text_emb_filename) if text_emb_path.exists(): - data['text_embeddings'] = load_json(text_emb_path) + data['text_embeddings'] = FileOperations.load_json(text_emb_path) scene_filename = f'{context.series_name}_{episode_info.episode_code()}_scenes.json' scene_path = context.get_output_path(episode_info, 
'scene_timestamps', scene_filename) if scene_path.exists(): - data['scenes'] = load_json(scene_path) + data['scenes'] = FileOperations.load_json(scene_path) return data def __generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext) -> tuple[Path, int]: diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py index dbb8cfb63..820a8193b 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -12,10 +12,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import ( - atomic_write_json, - load_json, -) +from preprocessor.services.io.files import FileOperations from preprocessor.services.text.text_statistics import TextStatistics @@ -34,7 +31,7 @@ def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> T stats = self._analyze_text_statistics(txt_path) result_data = self._build_result_data(stats, txt_path, input_data) - atomic_write_json(output_path, result_data) + FileOperations.atomic_write_json(output_path, result_data) context.mark_step_completed(self.name, input_data.episode_id) return TextAnalysisResults( @@ -56,7 +53,7 @@ def _get_output_path(input_data: TranscriptionData) -> Path: @staticmethod def _load_cached_result(output_path: Path, input_data: TranscriptionData) -> TextAnalysisResults: - stats_data = load_json(output_path) + stats_data = FileOperations.load_json(output_path) return TextAnalysisResults( episode_id=input_data.episode_id, episode_info=input_data.episode_info, diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index d226a7c73..7a11abc8b 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -14,10 +14,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import 
ExecutionContext -from preprocessor.services.io.files import ( - atomic_write_json, - load_json, -) +from preprocessor.services.io.files import FileOperations from preprocessor.services.io.metadata import MetadataBuilder from preprocessor.services.search.embedding_model import EmbeddingModelWrapper @@ -84,7 +81,7 @@ def _load_cached_result( # pylint: disable=duplicate-code output_path: Path, input_data: TranscriptionData, ) -> EmbeddingCollection: - emb_data: Dict[str, Any] = load_json(output_path) + emb_data: Dict[str, Any] = FileOperations.load_json(output_path) return self._create_embedding_collection( input_data, output_path, @@ -159,8 +156,8 @@ def _batch_encode_chunks( batch_meta: List[Dict[str, Any]] = chunk_metadata[i:i + self.config.batch_size] batch_embeddings: List[List[float]] = self._model.encode_text(batch_texts) - for meta, emb in zip(batch_meta, batch_embeddings): - results.append({**meta, 'embedding': emb}) + for meta, embedding in zip(batch_meta, batch_embeddings): + results.append({**meta, 'embedding': embedding}) return results @@ -180,7 +177,7 @@ def _save_results( results_key='text_embeddings', results_data=results, ) - atomic_write_json(output_path, output_data) + FileOperations.atomic_write_json(output_path, output_data) def _create_embedding_collection( # pylint: disable=duplicate-code self, @@ -218,8 +215,8 @@ def __load_clean_transcription( raw_path.name.replace('.json', '_clean_transcription.json') ) if clean_path.exists(): - return load_json(clean_path) - return load_json(raw_path) + return FileOperations.load_json(clean_path) + return FileOperations.load_json(raw_path) @staticmethod def __split_into_sentences(text: str) -> List[str]: diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 1ffcdb1fe..2871b1bb8 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -13,7 +13,7 @@ from preprocessor.core.base_step import 
PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.services.episodes.episode_manager import EpisodeManager -from preprocessor.services.io.files import atomic_write_json +from preprocessor.services.io.files import FileOperations from preprocessor.services.transcription.whisper import Whisper @@ -89,7 +89,7 @@ def _transcribe_audio( try: result: Dict[str, Any] = self._whisper.transcribe(input_data.path) result['episode_info'] = EpisodeManager.get_metadata(input_data.episode_info) - atomic_write_json(output_path, result) + FileOperations.atomic_write_json(output_path, result) return result except Exception as e: context.logger.error( diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 304a74296..984821ef4 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -21,7 +21,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import atomic_write_json +from preprocessor.services.io.files import FileOperations from preprocessor.services.video.strategies.strategy_factory import KeyframeStrategyFactory @@ -273,4 +273,4 @@ def __write_metadata( }, 'frames': frames_with_paths, } - atomic_write_json(metadata_file, metadata, indent=2) + FileOperations.atomic_write_json(metadata_file, metadata, indent=2) diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index d88ec825f..e359e12e1 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -12,10 +12,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import ( - atomic_write_json, - load_json, -) +from preprocessor.services.io.files import FileOperations from 
preprocessor.services.media.scene_detection import TransNetWrapper @@ -57,7 +54,7 @@ def _get_output_path(input_data: TranscodedVideo, context: ExecutionContext) -> return context.get_output_path(input_data.episode_info, 'scene_timestamps', output_filename) def _load_cached_result(self, output_path: Path, input_data: TranscodedVideo) -> SceneCollection: - scenes_data = load_json(output_path) + scenes_data = FileOperations.load_json(output_path) return SceneCollection( path=output_path, video_path=input_data.path, @@ -93,7 +90,7 @@ def _save_results(self, scenes: List[Dict[str, Any]], video_path: Path, output_p }, 'scenes': scenes, } - atomic_write_json(output_path, output_data) + FileOperations.atomic_write_json(output_path, output_data) def _create_scene_collection( self, diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index a7bf35766..76a04387a 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -12,6 +12,7 @@ from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.services.media.ffmpeg import FFmpegWrapper +from preprocessor.services.media.transcode_params import TranscodeParams class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): @@ -312,21 +313,23 @@ def _perform_transcode( # pylint: disable=too-many-arguments context.logger.info('=' * 80) FFmpegWrapper.transcode( - input_path=input_path, - output_path=temp_path, - codec=self.config.codec, - preset=self.config.preset, - resolution=f'{self.config.resolution.width}:{self.config.resolution.height}', - video_bitrate=f'{video_bitrate}M', - minrate=f'{minrate}M', - maxrate=f'{maxrate}M', - bufsize=f'{bufsize}M', - audio_bitrate=f'{audio_bitrate}k', - gop_size=int(target_fps * 0.5), - target_fps=target_fps, - deinterlace=deinterlace, - is_upscaling=is_upscaling, - log_command=log_command, + 
TranscodeParams( + input_path=input_path, + output_path=temp_path, + codec=self.config.codec, + preset=self.config.preset, + resolution=f'{self.config.resolution.width}:{self.config.resolution.height}', + video_bitrate=f'{video_bitrate}M', + minrate=f'{minrate}M', + maxrate=f'{maxrate}M', + bufsize=f'{bufsize}M', + audio_bitrate=f'{audio_bitrate}k', + gop_size=int(target_fps * 0.5), + target_fps=target_fps, + deinterlace=deinterlace, + is_upscaling=is_upscaling, + log_command=log_command, + ), ) temp_path.replace(output_path) except BaseException: diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index 6269b1b62..11455c11a 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -15,11 +15,7 @@ from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.services.characters import FaceDetector -from preprocessor.services.io.detection_io import process_frames_for_detection -from preprocessor.services.io.files import ( - atomic_write_json, - load_json, -) +from preprocessor.services.io.files import FileOperations class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): @@ -76,13 +72,13 @@ def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> @staticmethod def _load_cached_result(output_path: Path, input_data: FrameCollection) -> DetectionResults: - det_data: Dict[str, Any] = load_json(output_path) + detection_data: Dict[str, Any] = FileOperations.load_json(output_path) return DetectionResults( episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path, detection_type='character', - detection_count=len(det_data.get('detections', [])), + detection_count=len(detection_data.get('detections', [])), ) def _ensure_model_loaded(self, context: ExecutionContext) -> 
None: @@ -127,9 +123,17 @@ def _create_empty_result( ) def _detect_characters(self, frame_files: List[Path]) -> List[Dict[str, Any]]: - return process_frames_for_detection( - frame_files, self._face_app, self._character_vectors, self.config.threshold, - ) + results: List[Dict[str, Any]] = [] + for frame_path in frame_files: + detections: List[Dict[str, Any]] = FaceDetector.detect_characters_in_frame( + frame_path, + self._face_app, + self._character_vectors, + self.config.threshold, + ) + if detections: + results.append({'frame': frame_path.name, 'faces': detections}) + return results def _save_results( self, @@ -150,7 +154,7 @@ def _save_results( }, 'detections': results, } - atomic_write_json(output_path, output_data) + FileOperations.atomic_write_json(output_path, output_data) @staticmethod def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index 2e78303fb..55c9b7bff 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -15,10 +15,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import ( - atomic_write_json, - load_json, -) +from preprocessor.services.io.files import FileOperations from preprocessor.services.io.metadata import MetadataBuilder from preprocessor.services.search.embedding_model import EmbeddingModelWrapper @@ -87,7 +84,7 @@ def _load_cached_result( # pylint: disable=duplicate-code output_path: Path, input_data: FrameCollection, ) -> EmbeddingCollection: - emb_data: Dict[str, Any] = load_json(output_path) + emb_data: Dict[str, Any] = FileOperations.load_json(output_path) return self._create_embedding_collection( input_data, output_path, @@ -99,7 +96,7 @@ def _load_frame_requests( input_data: FrameCollection, context: ExecutionContext, ) -> List[Dict[str, Any]]: - 
frame_metadata: Dict[str, Any] = load_json(input_data.metadata_path) + frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) if not frame_requests: context.logger.warning(f'No frames for embedding in {input_data.episode_id}') @@ -124,8 +121,8 @@ def _generate_embeddings( image_paths: List[str] = [str(input_data.directory / f['frame_path']) for f in batch] batch_embeddings: List[np.ndarray] = self._model.encode_images(image_paths) # pylint: disable=no-member - for request, emb in zip(batch, batch_embeddings): - res: Dict[str, Any] = {**request, 'embedding': emb.tolist()} + for request, embedding in zip(batch, batch_embeddings): + res: Dict[str, Any] = {**request, 'embedding': embedding.tolist()} frame_num: int = request.get('frame_number', -1) if frame_num in image_hashes: res['perceptual_hash'] = image_hashes[frame_num] @@ -152,7 +149,7 @@ def _save_results( results_key='video_embeddings', results_data=results, ) - atomic_write_json(output_path, output_data) + FileOperations.atomic_write_json(output_path, output_data) def _create_embedding_collection( # pylint: disable=duplicate-code self, @@ -179,7 +176,7 @@ def __load_image_hashes( if not hash_path.exists(): return {} try: - data: Dict[str, Any] = load_json(hash_path) + data: Dict[str, Any] = FileOperations.load_json(hash_path) return {h['frame_number']: h['perceptual_hash'] for h in data.get('hashes', [])} except Exception as e: context.logger.warning(f'Could not load image hashes from {hash_path}: {e}') diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index 8ec1bf245..d1db207d4 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -1,4 +1,15 @@ from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +import cv2 +from 
hsemotion_onnx.facial_emotions import HSEmotionRecognizer +import numpy as np from preprocessor.config.step_configs import EmotionDetectionConfig from preprocessor.core.artifacts import ( @@ -7,22 +18,162 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.services.io.files import FileOperations +from preprocessor.services.video.emotion_utils import EmotionDetector class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]): + def __init__(self, config: EmotionDetectionConfig) -> None: + super().__init__(config) + self._model: Optional[HSEmotionRecognizer] = None + + def cleanup(self) -> None: + self._model = None + def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData: - output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_emotions.json' - output_path: Path = context.get_output_path(input_data.episode_info, 'emotion_detections', output_filename) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached emotion detection)') - return EmotionData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + detections_path = self._get_character_detections_path(input_data, context) + + if self._check_cache_validity(detections_path, context, input_data.episode_id, 'cached emotion detection'): + return EmotionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=detections_path, + ) + + if not detections_path.exists(): + context.logger.warning( + f'No character detections found for emotion analysis: {detections_path}', + ) + return EmotionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=detections_path, + ) + context.logger.info(f'Detecting emotions for 
{input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) + + self._ensure_model_loaded(context) + + detections_data = FileOperations.load_json(detections_path) + self._process_emotions(detections_data, input_data, context) + FileOperations.atomic_write_json(detections_path, detections_data) + context.mark_step_completed(self.name, input_data.episode_id) - return EmotionData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + return EmotionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=detections_path, + ) @property def name(self) -> str: return 'emotion_detection' + + @staticmethod + def _get_character_detections_path( + input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' + output_filename: str = f'{filename}_character_detections.json' + return context.get_output_path( + input_data.episode_info, 'character_detections', output_filename, + ) + + def _ensure_model_loaded(self, context: ExecutionContext) -> None: + if self._model is None: + self._model = EmotionDetector._init_model(context.logger) + + def _process_emotions( + self, + detections_data: Dict[str, Any], + input_data: FrameCollection, + context: ExecutionContext, + ) -> None: + detections: List[Dict[str, Any]] = detections_data.get('detections', []) + + face_crops, face_metadata = self._collect_face_crops( + detections, input_data.directory, context, + ) + + if not face_crops: + context.logger.warning('No valid face crops found for emotion detection') + return + + context.logger.info(f'Processing {len(face_crops)} faces with HSEmotion model') + emotion_results = EmotionDetector._detect_batch( + face_crops, self._model, batch_size=32, logger=context.logger, + ) + + self._apply_emotion_results(detections, emotion_results, face_metadata, context) + + @staticmethod + def _collect_face_crops( + detections: 
List[Dict[str, Any]], + frames_dir: Path, + context: ExecutionContext, + ) -> Tuple[List[np.ndarray], List[Dict[str, int]]]: + face_crops: List[np.ndarray] = [] + face_metadata: List[Dict[str, int]] = [] + + total_faces = sum(len(d.get('faces', [])) for d in detections) + context.logger.info(f'Collecting {total_faces} faces for batch emotion analysis') + + for detection_idx, detection in enumerate(detections): + frame_file = detection.get('frame_file') + if not frame_file: + continue + + frame_path = frames_dir / frame_file + if not frame_path.exists(): + continue + + frame = cv2.imread(str(frame_path)) + if frame is None: + continue + + faces = detection.get('faces', []) + for face_idx, face in enumerate(faces): + bbox = face.get('bbox') + if not bbox: + continue + + face_crop = EmotionDetector._crop_face(frame, bbox) + if face_crop is None: + continue + + face_crops.append(face_crop) + face_metadata.append({ + 'detection_idx': detection_idx, + 'face_idx': face_idx, + }) + + return face_crops, face_metadata + + @staticmethod + def _apply_emotion_results( + detections: List[Dict[str, Any]], + emotion_results: List[Optional[Tuple[str, float, Dict[str, float]]]], + face_metadata: List[Dict[str, int]], + context: ExecutionContext, + ) -> None: + processed = 0 + for result, metadata in zip(emotion_results, face_metadata): + if result is None: + continue + + dominant_emotion, confidence, emotion_scores = result + detection_idx = metadata['detection_idx'] + face_idx = metadata['face_idx'] + + face = detections[detection_idx]['faces'][face_idx] + face['emotion'] = { + 'label': dominant_emotion, + 'confidence': confidence, + 'scores': emotion_scores, + } + processed += 1 + + total = len(face_metadata) + context.logger.info(f'Emotion analysis complete: {processed}/{total} faces processed') diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index b665d2d55..0b83d63e0 100644 --- 
a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -1,3 +1,4 @@ +# pylint: disable=cyclic-import # False positive - config uses import-outside-toplevel import gc from pathlib import Path from typing import ( @@ -16,10 +17,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.io.files import ( - atomic_write_json, - load_json, -) +from preprocessor.services.io.files import FileOperations from preprocessor.services.video.frame_utils import FrameLoader from preprocessor.services.video.image_hasher import PerceptualHasher @@ -78,7 +76,7 @@ def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> @staticmethod def _load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: - hash_data: Dict[str, Any] = load_json(output_path) + hash_data: Dict[str, Any] = FileOperations.load_json(output_path) return ImageHashCollection( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -91,7 +89,7 @@ def _load_frame_metadata( input_data: FrameCollection, context: ExecutionContext, ) -> tuple[Dict[str, Any], List[Dict[str, Any]]]: - frame_metadata: Dict[str, Any] = load_json(input_data.metadata_path) + frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) if not frame_requests: context.logger.warning(f'No frames to hash for {input_data.episode_id}') @@ -156,7 +154,7 @@ def _save_results( }, 'hashes': hash_results, } - atomic_write_json(output_path, output_data) + FileOperations.atomic_write_json(output_path, output_data) @staticmethod def __cleanup_memory() -> None: From 2c8b1659087a7225a49c41bbb43538f71f82d6ba Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Feb 2026 21:50:04 +0100 Subject: [PATCH 27/89] Update 
transcoding_step.py --- preprocessor/steps/video/transcoding_step.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 76a04387a..953b9873e 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -196,7 +196,7 @@ def _calculate_upscale_bitrate( else: upscaled_bitrate = min_required * max(1.2, pixel_ratio * 0.9) - max_allowed = self.config.video_bitrate_mbps * 1.3 + max_allowed = self.config.video_bitrate_mbps * 1.1 upscaled_bitrate = min(upscaled_bitrate, max_allowed) ratio = upscaled_bitrate / self.config.video_bitrate_mbps From 078fa85e4264265fe28fd246b3914a8e742722fa Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 10:40:04 +0100 Subject: [PATCH 28/89] Refactor pipeline, executor, and CLI internals Major refactor across pipeline, builder, config and CLI modules: - PipelineDefinition: made internal attributes private, added name property, registration/getters, full validate() that builds a DAG, checks missing deps and cycles, logging on success, execution ordering by topological sort, grouped steps by phase, improved ASCII output and repr. - PipelineExecutor / pipeline_builder: encapsulated context and steps, use run() to execute per-episode/global steps, forward context to step execution, implemented step progress/completion state updates, improved logging and error handling. - StepBuilder: freeze dataclass, validate step id and module path on post-init, keep eq/hash/repr. - Pipeline factory: organize phases with comments, use a single output_dir variable, update transcode config keys and description, minor config/formatting improvements, and rename helper to _get_step_configs. 
- CLI and search: make many attributes/methods private, freeze dataclasses for search params, refactor SearchFilters/SearchCommandHandler to use properties and private members, rename/clarify async runner, and tidy up click option formatting and small logging/messages. - Config dataclasses: make multiple config dataclasses frozen, hide API key in repr by using field(default=None, repr=False), remove unused imports. - Other small fixes: formatting, minor API and naming cleanups, and consistency improvements across modules. These changes improve encapsulation, add validation for pipeline definitions, and standardize config and CLI data structures. --- preprocessor/app/pipeline.py | 150 ++--- preprocessor/app/pipeline_builder.py | 80 ++- preprocessor/app/pipeline_factory.py | 57 +- preprocessor/app/step_builder.py | 28 +- preprocessor/cli/cli_main.py | 65 ++- preprocessor/cli/helpers.py | 4 +- preprocessor/cli/search_handler.py | 164 +++--- preprocessor/cli/search_params.py | 9 +- preprocessor/cli/skip_list_builder.py | 12 +- preprocessor/config/config.py | 154 ++---- preprocessor/config/constants.py | 3 + preprocessor/config/enums.py | 5 + preprocessor/config/mixins.py | 5 +- preprocessor/config/output_paths.py | 12 +- preprocessor/config/series_config.py | 20 +- preprocessor/config/settings_factory.py | 19 +- preprocessor/config/step_configs.py | 70 ++- preprocessor/config/step_defaults.py | 92 ++-- preprocessor/config/types/detection.py | 2 + preprocessor/config/types/episode.py | 4 + preprocessor/config/types/keys.py | 22 + preprocessor/config/types/scene.py | 3 + preprocessor/config/types/search.py | 16 +- preprocessor/config/types/transcription.py | 14 +- preprocessor/config/types/video.py | 1 + preprocessor/core/artifacts.py | 20 + preprocessor/core/base_step.py | 20 +- preprocessor/core/context.py | 94 ++-- preprocessor/core/processing_metadata.py | 90 ++- preprocessor/core/state_manager.py | 81 ++- preprocessor/series_configs/defaults.json | 11 +- 
preprocessor/services/ai/clients.py | 27 +- preprocessor/services/ai/models.py | 4 +- preprocessor/services/ai/provider.py | 46 +- preprocessor/services/audio/extraction.py | 77 ++- .../services/characters/face_detection.py | 249 ++++++--- .../image_search/duckduckgo_image_search.py | 5 +- .../image_search/google_image_search.py | 39 +- .../characters/image_search/image_search.py | 5 +- preprocessor/services/characters/models.py | 5 +- .../characters/reference_downloader.py | 301 ++++++----- preprocessor/services/core/base_processor.py | 204 +++---- preprocessor/services/core/environment.py | 11 +- preprocessor/services/core/logging.py | 34 +- preprocessor/services/core/time.py | 6 +- .../services/episodes/episode_manager.py | 139 +++-- preprocessor/services/io/files.py | 15 +- preprocessor/services/io/metadata.py | 10 +- preprocessor/services/io/path_service.py | 22 +- preprocessor/services/media/ffmpeg.py | 64 ++- preprocessor/services/media/resolution.py | 25 +- .../services/media/scene_detection.py | 78 +-- .../services/media/transcode_params.py | 7 +- .../services/scraping/base_scraper.py | 98 ++-- .../services/scraping/base_scraper_step.py | 37 +- .../services/scraping/character_scraper.py | 18 +- preprocessor/services/scraping/clipboard.py | 20 +- preprocessor/services/scraping/crawl4ai.py | 59 +- .../services/scraping/episode_scraper.py | 57 +- .../services/scraping/grid_visualizer.py | 349 +++++------- .../services/scraping/reference_processor.py | 491 +++++------------ .../search/clients/elasticsearch_queries.py | 427 +++------------ .../search/clients/embedding_service.py | 77 +-- .../services/search/clients/hash_service.py | 37 +- .../search/clients/result_formatters.py | 99 ++-- preprocessor/services/search/elasticsearch.py | 83 ++- .../services/search/embedding_model.py | 38 +- preprocessor/services/text/import_step.py | 195 ++++--- preprocessor/services/text/language_config.py | 34 +- preprocessor/services/text/text_statistics.py | 83 +-- 
.../transcription/engines/base_engine.py | 5 +- .../engines/elevenlabs_engine.py | 166 +++--- .../transcription/engines/whisper_engine.py | 54 +- .../generators/base_generator.py | 23 +- .../generators/json_generator.py | 77 +-- .../generators/multi_format_generator.py | 215 +++----- .../transcription/generators/srt_generator.py | 20 +- .../transcription/generators/txt_generator.py | 12 +- .../processors/audio_normalizer.py | 84 +-- .../processors/episode_info_processor.py | 87 +-- .../processors/normalized_audio_processor.py | 95 ++-- .../transcription/sound_classification.py | 8 +- preprocessor/services/transcription/utils.py | 27 +- .../services/transcription/whisper.py | 66 ++- preprocessor/services/ui/console.py | 129 +++-- preprocessor/services/ui/progress.py | 70 ++- .../services/validation/base_result.py | 9 +- .../services/validation/episode_stats.py | 96 +--- .../services/validation/file_validators.py | 145 ++--- .../services/validation/global_validator.py | 153 +++--- .../services/validation/report_generator.py | 29 +- .../services/validation/season_comparator.py | 117 ++-- preprocessor/services/validation/validator.py | 211 +++++--- .../validation/validators/base_validator.py | 1 - .../validators/character_validator.py | 1 - .../validators/elastic_validator.py | 103 ++-- .../validators/face_cluster_validator.py | 46 +- .../validation/validators/frame_validator.py | 29 +- .../validators/image_hash_validator.py | 1 - .../validation/validators/object_validator.py | 7 +- .../validation/validators/scene_validator.py | 49 +- .../validators/transcription_validator.py | 127 +++-- .../validators/validation_helpers.py | 75 +-- .../validation/validators/video_validator.py | 38 +- preprocessor/services/video/discovery.py | 12 +- preprocessor/services/video/emotion_utils.py | 95 ++-- preprocessor/services/video/frame_utils.py | 21 +- preprocessor/services/video/image_hasher.py | 54 +- .../video/strategies/base_strategy.py | 5 +- 
.../strategies/scene_changes_strategy.py | 102 +++- .../video/strategies/strategy_factory.py | 5 +- .../analysis/resolution_analysis_step.py | 371 ++++++++----- preprocessor/steps/audio/separation_step.py | 235 ++++---- preprocessor/steps/packaging/archives_step.py | 28 +- .../steps/scraping/character_scraper_step.py | 7 +- .../steps/scraping/episode_scraper_step.py | 7 +- .../scraping/reference_processor_step.py | 56 +- .../steps/search/document_generation_step.py | 166 ++++-- preprocessor/steps/search/indexing_step.py | 128 ++--- preprocessor/steps/text/analysis_step.py | 106 ++-- preprocessor/steps/text/embeddings_step.py | 151 +++--- preprocessor/steps/text/transcription_step.py | 102 ++-- .../steps/validation/validator_step.py | 30 +- preprocessor/steps/video/frame_export_step.py | 299 +++++----- .../steps/video/scene_detection_step.py | 121 +++-- preprocessor/steps/video/transcoding_step.py | 511 +++++++++--------- .../steps/vision/character_detection_step.py | 136 +++-- preprocessor/steps/vision/embeddings_step.py | 179 +++--- .../steps/vision/emotion_detection_step.py | 85 ++- .../steps/vision/face_clustering_step.py | 52 +- .../steps/vision/image_hashing_step.py | 122 +++-- .../steps/vision/object_detection_step.py | 52 +- 132 files changed, 5398 insertions(+), 5117 deletions(-) diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index 2be569b7e..f7211c111 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -16,70 +16,94 @@ class PipelineDefinition: def __init__(self, name: str) -> None: - self.name: str = name - self._steps: Dict[str, StepBuilder] = {} - self._graph: Optional[nx.DiGraph] = None + self.__name = name + self.__steps: Dict[str, StepBuilder] = {} + self.__graph: Optional[nx.DiGraph] = None + + @property + def name(self) -> str: + return self.__name + + def register(self, step: StepBuilder) -> None: + if step.id in self.__steps: + raise ValueError( + f"DUPLICATE STEP:\n" + f" Step '{step.id}' is 
already registered in the pipeline!\n" + f" Check build_pipeline() in pipeline_factory.py", + ) + self.__steps[step.id] = step + + def get_step(self, step_id: str) -> StepBuilder: + if step_id not in self.__steps: + raise KeyError( + f"Step '{step_id}' not found. Available: {list(self.__steps.keys())}", + ) + return self.__steps[step_id] def get_all_steps(self) -> Dict[str, StepBuilder]: - return dict(self._steps) + return dict(self.__steps) + + def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: + self.__graph = nx.DiGraph() + + for step_id, step in self.__steps.items(): + self.__graph.add_node(step_id, step=step) + + for step_id, step in self.__steps.items(): + for dep_id in step.dependency_ids: + if dep_id not in self.__steps: + self.__raise_missing_dependency_error(step_id, dep_id) + self.__graph.add_edge(dep_id, step_id) + + if not nx.is_directed_acyclic_graph(self.__graph): + self.__raise_cycle_error() + + message = ( + f"Pipeline '{self.__name}' validated successfully:\n" + f" - {len(self.__steps)} steps registered\n" + f" - DAG structure confirmed\n" + f" - No cyclic dependencies" + ) + + if logger: + logger.info(message) + else: + print(message) def get_execution_order( self, targets: Optional[List[str]] = None, skip: Optional[List[str]] = None, ) -> List[str]: - if not self._graph: + if not self.__graph: raise RuntimeError( "Pipeline not validated! 
Call pipeline.validate() first.", ) - full_order: List[str] = list(nx.topological_sort(self._graph)) + full_order: List[str] = list(nx.topological_sort(self.__graph)) if targets: required: Set[str] = set() for target in targets: - if target not in self._steps: - raise ValueError( - f"Target step '{target}' does not exist in pipeline", - ) + if target not in self.__steps: + raise ValueError(f"Target step '{target}' does not exist in pipeline") required.add(target) - required.update(nx.ancestors(self._graph, target)) + required.update(nx.ancestors(self.__graph, target)) full_order = [s for s in full_order if s in required] skip_set: Set[str] = set(skip or []) return [s for s in full_order if s not in skip_set] - def get_step(self, step_id: str) -> StepBuilder: - if step_id not in self._steps: - raise KeyError( - f"Step '{step_id}' not found. Available: {list(self._steps.keys())}", - ) - return self._steps[step_id] - - def register(self, step: StepBuilder) -> None: - if step.id in self._steps: - raise ValueError( - f"DUPLICATE STEP:\n" - f" Step '{step.id}' is already registered in the pipeline!\n" - f" Check build_pipeline() in pipeline_factory.py", - ) - self._steps[step.id] = step - def to_ascii_art(self) -> str: - if not self._graph: + if not self.__graph: self.validate() lines: List[str] = [ "=" * 80, - f"PIPELINE: {self.name}", + f"PIPELINE: {self.__name}", "=" * 80, "", ] - phases: Dict[str, List[StepBuilder]] = {} - for _, step in self._steps.items(): - phase_name: str = step.phase.name - if phase_name not in phases: - phases[phase_name] = [] - phases[phase_name].append(step) + phases: Dict[str, List[StepBuilder]] = self.__group_steps_by_phase() for phase_name in ("SCRAPING", "PROCESSING", "INDEXING"): if phase_name not in phases: @@ -89,51 +113,26 @@ def to_ascii_art(self) -> str: lines.append("-" * 80) for step in phases[phase_name]: - deps_str: str = "" - if step.dependency_ids: - deps_str = f" <- needs: {', '.join(step.dependency_ids)}" - + deps_str = f" 
<- needs: {', '.join(step.dependency_ids)}" if step.dependency_ids else "" lines.append(f" {step.id}{deps_str}") lines.append(f" -> produces: {', '.join(step.produces)}") - lines.append(f" -> {step.description}") - lines.append("") + lines.append(f" -> {step.description}\n") lines.append("=" * 80) return "\n".join(lines) - def validate(self, logger: Optional["ErrorHandlingLogger"] = None) -> None: - self._graph = nx.DiGraph() - - for step_id, step in self._steps.items(): - self._graph.add_node(step_id, step=step) - - for step_id, step in self._steps.items(): - for dep_id in step.dependency_ids: - if dep_id not in self._steps: - self.__raise_missing_dependency_error(step_id, dep_id) - self._graph.add_edge(dep_id, step_id) - - if not nx.is_directed_acyclic_graph(self._graph): - self.__raise_cycle_error() - - message = ( - f"Pipeline '{self.name}' validated successfully:\n" - f" - {len(self._steps)} steps registered\n" - f" - DAG structure confirmed\n" - f" - No cyclic dependencies" - ) - - if logger: - logger.info(message) - else: - print(message) - - def __repr__(self) -> str: - return f"PipelineDefinition(name='{self.name}', steps={len(self._steps)})" + def __group_steps_by_phase(self) -> Dict[str, List[StepBuilder]]: + phases: Dict[str, List[StepBuilder]] = {} + for step in self.__steps.values(): + phase_name = step.phase.name + if phase_name not in phases: + phases[phase_name] = [] + phases[phase_name].append(step) + return phases def __raise_cycle_error(self) -> None: - cycles: List[List[str]] = list(nx.simple_cycles(self._graph)) - cycle_path: str = " -> ".join(cycles[0]) + f" -> {cycles[0][0]}" + cycles = list(nx.simple_cycles(self.__graph)) + cycle_path = " -> ".join(cycles[0]) + f" -> {cycles[0][0]}" raise ValueError( f"\n{'=' * 80}\n" @@ -147,9 +146,7 @@ def __raise_cycle_error(self) -> None: f"\n{'=' * 80}\n", ) - def __raise_missing_dependency_error( - self, step_id: str, missing_dep_id: str, - ) -> None: + def __raise_missing_dependency_error(self, 
step_id: str, missing_dep_id: str) -> None: raise ValueError( f"\n{'=' * 80}\n" f"PIPELINE DEPENDENCY ERROR\n" @@ -163,3 +160,6 @@ def __raise_missing_dependency_error( f" 3. Or remove '{missing_dep_id}' from needs=[...] in definition of '{step_id}'\n" f"\n{'=' * 80}\n", ) + + def __repr__(self) -> str: + return f"PipelineDefinition(name='{self.__name}', steps={len(self.__steps)})" diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index ea11b3be0..d4eb8b773 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -17,20 +17,20 @@ class PipelineExecutor: def __init__(self, context: ExecutionContext) -> None: - self.context = context - self.steps: List[PipelineStep] = [] + self.__context = context + self.__steps: List[PipelineStep] = [] def add_step(self, step: PipelineStep) -> "PipelineExecutor": - self.steps.append(step) + self.__steps.append(step) return self def cleanup(self) -> None: - for step in self.steps: + for step in self.__steps: if hasattr(step, "cleanup"): try: step.cleanup() except Exception as e: - self.context.logger.error(f"Cleanup failed for step {step.name}: {e}") + self.__context.logger.error(f"Cleanup failed for step {step.name}: {e}") def execute_step( self, @@ -39,18 +39,18 @@ def execute_step( source_path: Path, episode_manager: EpisodeManager, ) -> None: - step = pipeline.get_step(step_id) - self.context.logger.info(f"Step: {step_id}") - self.context.logger.info(f"{step.description}") + step_def = pipeline.get_step(step_id) + self.__context.logger.info(f"Step: {step_id}") + self.__context.logger.info(f"{step_def.description}") - StepClass = step.load_class() - instance = StepClass(step.config) + StepClass = step_def.load_class() + instance = StepClass(step_def.config) - runner = PipelineExecutor(self.context) + runner = PipelineExecutor(self.__context) runner.add_step(instance) - runner.__run_for_episodes(source_path, episode_manager) + runner.run(source_path, 
episode_manager) - self.context.logger.info(f"Step '{step_id}' completed") + self.__context.logger.info(f"Step '{step_id}' completed") def execute_steps( self, @@ -60,24 +60,12 @@ def execute_steps( episode_manager: EpisodeManager, ) -> None: for step_id in step_ids: - self.context.logger.info(f"{'=' * 80}") + self.__context.logger.info(f"{'=' * 80}") self.execute_step(pipeline, step_id, source_path, episode_manager) - def __mark_step_completed(self, step_name: str, episode_id: str) -> None: - if self.context.state_manager is None: - return - self.context.state_manager.mark_step_completed(step_name, episode_id) - - def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None: - if self.context.state_manager is None: - return - self.context.state_manager.mark_step_started(step_name, episode_id) - - def __run_for_episodes( # pylint: disable=unused-private-member - self, source_path: Path, episode_manager: EpisodeManager, - ) -> None: + def run(self, source_path: Path, episode_manager: EpisodeManager) -> None: video_files = VideoDiscovery.discover(source_path) - self.context.logger.info( + self.__context.logger.info( f"Discovered {len(video_files)} video files in {source_path}", ) @@ -85,7 +73,7 @@ def __run_for_episodes( # pylint: disable=unused-private-member for video_file in video_files: episode_info = episode_manager.parse_filename(video_file) if not episode_info: - self.context.logger.warning(f"Cannot parse: {video_file}") + self.__context.logger.warning(f"Cannot parse: {video_file}") continue episode_id = episode_manager.get_episode_id_for_state(episode_info) @@ -97,38 +85,38 @@ def __run_for_episodes( # pylint: disable=unused-private-member ), ) - for step in self.steps: + for step in self.__steps: if step.is_global: self.__run_global_step(step) else: current_artifacts = self.__run_episode_step(step, current_artifacts) def __run_global_step(self, step: PipelineStep) -> None: - self.context.logger.info(f"=== Running Global Step: {step.name} ===") 
+ self.__context.logger.info(f"=== Running Global Step: {step.name} ===") if self.__should_skip_step(step.name, 'all'): - self.context.logger.info(f"Skipping {step.name} (already completed)") + self.__context.logger.info(f"Skipping {step.name} (already completed)") return try: self.__mark_step_in_progress(step.name, 'all') - step.execute(None, self.context) + step.execute(None, self.__context) self.__mark_step_completed(step.name, 'all') except Exception as e: - self.context.logger.error(f"Global step {step.name} failed: {e}") + self.__context.logger.error(f"Global step {step.name} failed: {e}") raise def __run_episode_step( self, step: PipelineStep, current_artifacts: List[Any], ) -> List[Any]: - self.context.logger.info(f"=== Running Step: {step.name} ===") + self.__context.logger.info(f"=== Running Step: {step.name} ===") next_artifacts = [] for artifact in current_artifacts: episode_id = artifact.episode_id if self.__should_skip_step(step.name, episode_id): - self.context.logger.info( + self.__context.logger.info( f"Skipping {step.name} for {episode_id} (already completed)", ) next_artifacts.append(artifact) @@ -136,7 +124,7 @@ def __run_episode_step( try: self.__mark_step_in_progress(step.name, episode_id) - result = step.execute(artifact, self.context) + result = step.execute(artifact, self.__context) self.__mark_step_completed(step.name, episode_id) if result: @@ -144,18 +132,28 @@ def __run_episode_step( else: next_artifacts.append(artifact) except Exception as e: - self.context.logger.error( + self.__context.logger.error( f"Step {step.name} failed for {artifact.episode_id}: {e}", ) raise return next_artifacts + def __mark_step_completed(self, step_name: str, episode_id: str) -> None: + if self.__context.state_manager is None: + return + self.__context.state_manager.mark_step_completed(step_name, episode_id) + + def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None: + if self.__context.state_manager is None: + return + 
self.__context.state_manager.mark_step_started(step_name, episode_id) + def __should_skip_step(self, step_name: str, episode_id: str) -> bool: - if self.context.force_rerun: + if self.__context.force_rerun: return False - if self.context.state_manager is None: + if self.__context.state_manager is None: return False - return self.context.state_manager.is_step_completed(step_name, episode_id) + return self.__context.state_manager.is_step_completed(step_name, episode_id) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index fe75eefa3..992c48116 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -31,6 +31,7 @@ ) from preprocessor.services.media.resolution import Resolution +# Phase Definitions SCRAPING = Phase("SCRAPING", color="blue") PROCESSING = Phase("PROCESSING", color="green") INDEXING = Phase("INDEXING", color="yellow") @@ -38,8 +39,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals - series_config: SeriesConfig = SeriesConfig.load(series_name) + series_config = SeriesConfig.load(series_name) + output_dir = get_base_output_dir(series_name) + # ========================================================= + # SCRAPING PHASE + # ========================================================= episodes_metadata = StepBuilder( id="scrape_episodes", phase=SCRAPING, @@ -49,7 +54,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t needs=[], config=EpisodeScraperConfig( urls=series_config.scraping.episodes.urls, - output_file=str(get_base_output_dir(series_name) / f"{series_name}_episodes.json"), + output_file=str(output_dir / f"{series_name}_episodes.json"), headless=True, merge_sources=True, scraper_method="crawl4ai", @@ -66,7 +71,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t needs=[], config=CharacterScraperConfig( urls=series_config.scraping.characters.urls, - 
output_file=str(get_base_output_dir(series_name) / f"{series_name}_characters.json"), + output_file=str(output_dir / f"{series_name}_characters.json"), headless=True, scraper_method="crawl4ai", parser_mode=series_config.scraping.characters.parser_mode, @@ -81,13 +86,16 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=["character_faces/{character}/*.jpg"], needs=[characters_metadata], config=CharacterReferenceConfig( - characters_file=str(get_base_output_dir(series_name) / f"{series_name}_characters.json"), - output_dir=str(get_base_output_dir(series_name) / "character_faces"), + characters_file=str(output_dir / f"{series_name}_characters.json"), + output_dir=str(output_dir / "character_faces"), search_engine=series_config.scraping.character_references.search_engine, images_per_character=series_config.scraping.character_references.images_per_character, ), ) + # ========================================================= + # PROCESSING PHASE: VIDEO + # ========================================================= resolution_analysis = StepBuilder( id="resolution_analysis", phase=PROCESSING, @@ -96,11 +104,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[], needs=[], config=TranscodeConfig( - video_bitrate_mbps=series_config.processing.transcode.video_bitrate_mbps, - minrate_mbps=series_config.processing.transcode.minrate_mbps, - maxrate_mbps=series_config.processing.transcode.maxrate_mbps, - bufsize_mbps=series_config.processing.transcode.bufsize_mbps, - gop_size=series_config.processing.transcode.gop_size, + bitrate_reference_mb=series_config.processing.transcode.bitrate_reference_mb, + bitrate_reference_seconds=series_config.processing.transcode.bitrate_reference_seconds, + keyframe_interval_seconds=series_config.processing.transcode.keyframe_interval_seconds, force_deinterlace=series_config.processing.transcode.force_deinterlace, 
resolution=Resolution.from_string(series_config.processing.transcode.resolution), ), @@ -110,15 +116,13 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t id="transcode", phase=PROCESSING, module="preprocessor.steps.video.transcoding_step:VideoTranscoderStep", - description=f"Conversion to {series_config.processing.transcode.codec} {series_config.processing.transcode.resolution} with adaptive bitrate", + description=f"Conversion to h264_nvenc {series_config.processing.transcode.resolution} with adaptive bitrate", produces=["transcoded_videos/{season}/{episode}.mp4"], needs=[resolution_analysis], config=TranscodeConfig( - video_bitrate_mbps=series_config.processing.transcode.video_bitrate_mbps, - minrate_mbps=series_config.processing.transcode.minrate_mbps, - maxrate_mbps=series_config.processing.transcode.maxrate_mbps, - bufsize_mbps=series_config.processing.transcode.bufsize_mbps, - gop_size=series_config.processing.transcode.gop_size, + bitrate_reference_mb=series_config.processing.transcode.bitrate_reference_mb, + bitrate_reference_seconds=series_config.processing.transcode.bitrate_reference_seconds, + keyframe_interval_seconds=series_config.processing.transcode.keyframe_interval_seconds, force_deinterlace=series_config.processing.transcode.force_deinterlace, ), ) @@ -143,9 +147,14 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t description="Exports frames (PNG) at scene boundaries", produces=["frames/{season}/{episode}/*.png"], needs=[scene_data], - config=FrameExportConfig(frames_per_scene=series_config.processing.frame_export.frames_per_scene), + config=FrameExportConfig( + frames_per_scene=series_config.processing.frame_export.frames_per_scene, + ), ) + # ========================================================= + # PROCESSING PHASE: TEXT & AUDIO + # ========================================================= transcription_data = StepBuilder( id="transcribe", phase=PROCESSING, @@ -198,6 +207,9 
@@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) + # ========================================================= + # PROCESSING PHASE: VISION + # ========================================================= image_hashes = StepBuilder( id="image_hashing", phase=PROCESSING, @@ -262,6 +274,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t config=ObjectDetectionConfig(), ) + # ========================================================= + # INDEXING PHASE + # ========================================================= elastic_documents = StepBuilder( id="generate_elastic_docs", phase=INDEXING, @@ -304,6 +319,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) + # ========================================================= + # VALIDATION PHASE + # ========================================================= validation = StepBuilder( id="validate", phase=VALIDATION, @@ -314,6 +332,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t config=ValidationConfig(), ) + # ========================================================= + # PIPELINE REGISTRATION + # ========================================================= pipeline = PipelineDefinition(name=f"{series_name}_processing") pipeline.register(episodes_metadata) @@ -353,6 +374,6 @@ def visualize(series_name: str = "ranczo") -> None: print(pipeline.to_ascii_art()) -def __get_step_configs(series_name: str) -> Dict[str, object]: +def _get_step_configs(series_name: str) -> Dict[str, object]: pipeline = build_pipeline(series_name) return {step_id: step.config for step_id, step in pipeline.get_all_steps().items()} diff --git a/preprocessor/app/step_builder.py b/preprocessor/app/step_builder.py index a95a890d2..426a1c7f0 100644 --- a/preprocessor/app/step_builder.py +++ b/preprocessor/app/step_builder.py @@ -4,16 +4,12 @@ ) import importlib from typing import ( - TYPE_CHECKING, Any, List, ) -if 
TYPE_CHECKING: - pass - -@dataclass +@dataclass(frozen=True) class Phase: name: str color: str @@ -50,24 +46,30 @@ def load_class(self) -> type: f"Class '{class_name}' not found in module '{module_path}' for step '{self.id}': {e}", ) from e - def __eq__(self, other: object) -> bool: - if not isinstance(other, StepBuilder): - return False - return self.id == other.id - - def __hash__(self) -> int: - return hash(self.id) - def __post_init__(self) -> None: + self.__validate_id() + self.__validate_module_path() + + def __validate_id(self) -> None: if not self.id.replace("_", "").replace("-", "").isalnum(): raise ValueError( f"Invalid step_id: '{self.id}'. Use only alphanumeric and underscores.", ) + + def __validate_module_path(self) -> None: if not self.module or ":" not in self.module: raise ValueError( f"Invalid module format for '{self.id}'. Expected 'package.module:ClassName'", ) + def __eq__(self, other: object) -> bool: + if not isinstance(other, StepBuilder): + return False + return self.id == other.id + + def __hash__(self) -> int: + return hash(self.id) + def __repr__(self) -> str: deps = f", needs={self.dependency_ids}" if self.needs else "" return f"StepBuilder(id='{self.id}'{deps})" diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 527ef1df8..190825e23 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -92,8 +92,8 @@ def __step_command(series: str, force_rerun: bool, _step_id: str = step_id) -> N try: step = pipeline.get_step(_step_id) - deps = step.dependency_ids + if deps: setup.logger.info(f"Dependencies: {', '.join(deps)}") for dep_id in deps: @@ -141,12 +141,7 @@ def __analyze_resolution(series: str) -> None: setup.logger.finalize() -def _execute_search_command(config: SearchConfig) -> None: # pylint: disable=too-many-statements - """Execute search with config object. - - Args: - config: Complete search configuration. 
- """ +def __execute_search_command(config: SearchConfig) -> None: # pylint: disable=too-many-statements series_config = SeriesConfig.load(config.series) index_base = series_config.indexing.elasticsearch.index_name @@ -156,7 +151,7 @@ def _execute_search_command(config: SearchConfig) -> None: # pylint: disable=to if hash_value is None: sys.exit(1) - async def __run() -> None: + async def __run_async_search() -> None: es_client = AsyncElasticsearch(hosts=[config.host], verify_certs=False) try: @@ -210,7 +205,7 @@ async def __run() -> None: embedding_svc.cleanup() await es_client.close() - asyncio.run(__run()) + asyncio.run(__run_async_search()) @cli.command(name="search") @@ -221,8 +216,14 @@ async def __run() -> None: @click.option("--image", type=click.Path(exists=True, path_type=Path), help="Semantic search by video embeddings") @click.option("--hash", "phash", type=str, help="Search by perceptual hash (provide hash string or image path)") @click.option("--character", type=str, help="Search by character") -@click.option("--emotion", type=str, help="Search by emotion (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)") -@click.option("--object", "object_query", type=str, help="Search by detected objects (e.g., 'dog', 'person:5+', 'chair:2-4')") +@click.option( + "--emotion", type=str, + help="Search by emotion (neutral, happiness, surprise, sadness, anger, disgust, fear, contempt)", +) +@click.option( + "--object", "object_query", type=str, + help="Search by detected objects (e.g., 'dog', 'person:5+', 'chair:2-4')", +) @click.option("--episode-name", type=str, help="Fuzzy search by episode names") @click.option("--episode-name-semantic", type=str, help="Semantic search by episode names") @click.option("--list-characters", "list_chars_flag", is_flag=True, help="List all characters") @@ -234,27 +235,26 @@ async def __run() -> None: @click.option("--json-output", is_flag=True, help="Output in JSON format") @click.option("--host", type=str, 
default="http://localhost:9200", help="Elasticsearch host") def search( # pylint: disable=too-many-arguments,too-many-locals - series: str, - text: str, - text_semantic: str, - text_to_video: str, - image: Path, - phash: str, - character: str, - emotion: str, - object_query: str, - episode_name: str, - episode_name_semantic: str, - list_chars_flag: bool, - list_objects_flag: bool, - season: int, - episode: int, - limit: int, - stats: bool, - json_output: bool, - host: str, + series: str, + text: str, + text_semantic: str, + text_to_video: str, + image: Path, + phash: str, + character: str, + emotion: str, + object_query: str, + episode_name: str, + episode_name_semantic: str, + list_chars_flag: bool, + list_objects_flag: bool, + season: int, + episode: int, + limit: int, + stats: bool, + json_output: bool, + host: str, ) -> None: - """Search command entry point - Click requires all parameters.""" config = SearchConfig( series=series, query=SearchQueryParams( @@ -279,7 +279,7 @@ def search( # pylint: disable=too-many-arguments,too-many-locals click.echo("Provide at least one search option. 
Use --help", err=True) sys.exit(1) - _execute_search_command(config) + __execute_search_command(config) _CLI_TEMPLATE_SERIES = "ranczo" @@ -289,6 +289,5 @@ def search( # pylint: disable=too-many-arguments,too-many-locals command_func = __create_step_command(_step_id, _step.description) cli.add_command(command_func) - if __name__ == "__main__": cli() diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index a8decd8c3..e3f09d5d9 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -9,7 +9,7 @@ from preprocessor.services.io.path_service import PathService -@dataclass +@dataclass(frozen=True) class PipelineSetup: context: ExecutionContext logger: ErrorHandlingLogger @@ -18,7 +18,6 @@ class PipelineSetup: class PipelineContextFactory: - @staticmethod def build( series: str, @@ -62,6 +61,7 @@ def __create_episode_manager( if not episodes_json.exists(): episodes_json = None return EpisodeManager(episodes_json, series, logger) + @staticmethod def __create_logger( command_name: str, diff --git a/preprocessor/cli/search_handler.py b/preprocessor/cli/search_handler.py index 9f6fb4b54..14e82c8a4 100644 --- a/preprocessor/cli/search_handler.py +++ b/preprocessor/cli/search_handler.py @@ -19,60 +19,51 @@ class SearchFilters: - def __init__( - self, - season: Optional[int] = None, - episode: Optional[int] = None, - character: Optional[str] = None, - limit: int = 20, + self, + season: Optional[int] = None, + episode: Optional[int] = None, + character: Optional[str] = None, + limit: int = 20, ) -> None: - self.season = season - self.episode = episode - self.character = character - self.limit = limit + self.__season = season + self.__episode = episode + self.__character = character + self.__limit = limit + @property + def season(self) -> Optional[int]: + return self.__season -class SearchCommandHandler: + @property + def episode(self) -> Optional[int]: + return self.__episode - def __init__( - self, - es_client: AsyncElasticsearch, - 
embedding_service: EmbeddingService, - queries: ElasticsearchQueries, - json_output: bool, - ) -> None: - self._es = es_client - self._embedding = embedding_service - self._queries = queries - self._json_output = json_output - - async def _execute_search( - self, - search_func: Callable[..., Awaitable[Dict[str, Any]]], - result_type: str, - result_key: str = "hits", - ) -> str: - """Generic search executor - reduces duplication. + @property + def character(self) -> Optional[str]: + return self.__character - Args: - search_func: Async function that executes the search query. - result_type: Type of result for console formatting. - result_key: Key to extract from result for JSON output (default: "hits"). + @property + def limit(self) -> int: + return self.__limit - Returns: - Formatted search results (JSON or console output). - """ - result = await search_func() - if self._json_output: - return json.dumps(result.get(result_key, result), indent=2) - - return self._format_console_output(result, result_type) +class SearchCommandHandler: + def __init__( + self, + es_client: AsyncElasticsearch, + embedding_service: EmbeddingService, + queries: ElasticsearchQueries, + json_output: bool, + ) -> None: + self.__es = es_client + self.__embedding = embedding_service # pylint: disable=unused-private-member + self.__queries = queries + self.__json_output = json_output async def handle_stats(self) -> str: - result = await self._queries.get_stats(self._es) - if self._json_output: + result = await self.__queries.get_stats(self.__es) + if self.__json_output: return json.dumps(result, indent=2) output = ["\nStatystyki:"] @@ -83,8 +74,8 @@ async def handle_stats(self) -> str: return "\n".join(output) async def handle_list_characters(self) -> str: - chars = await self._queries.list_characters(self._es) - if self._json_output: + chars = await self.__queries.list_characters(self.__es) + if self.__json_output: return json.dumps(chars, indent=2) output = [f"\nZnaleziono {len(chars)} 
postaci:"] @@ -93,8 +84,8 @@ async def handle_list_characters(self) -> str: return "\n".join(output) async def handle_list_objects(self) -> str: - objects = await self._queries.list_objects(self._es) - if self._json_output: + objects = await self.__queries.list_objects(self.__es) + if self.__json_output: return json.dumps(objects, indent=2) output = [f"\nZnaleziono {len(objects)} klas obiektow:"] @@ -103,79 +94,79 @@ async def handle_list_objects(self) -> str: return "\n".join(output) async def handle_text_search(self, query: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_text_query( - self._es, query, filters.season, filters.episode, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_text_query( + self.__es, query, filters.season, filters.episode, filters.limit, ), result_type="text", ) async def handle_text_semantic_search(self, query: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_text_semantic( - self._es, query, filters.season, filters.episode, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_text_semantic( + self.__es, query, filters.season, filters.episode, filters.limit, ), result_type="text_semantic", ) async def handle_text_to_video_search(self, query: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_text_to_video( - self._es, query, filters.season, filters.episode, filters.character, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_text_to_video( + self.__es, query, filters.season, filters.episode, filters.character, filters.limit, ), result_type="video", ) async def handle_image_search(self, image_path: Path, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: 
self._queries.search_video_semantic( - self._es, str(image_path), filters.season, filters.episode, filters.character, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_video_semantic( + self.__es, str(image_path), filters.season, filters.episode, filters.character, filters.limit, ), result_type="video", ) async def handle_emotion_search(self, emotion: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_by_emotion( - self._es, emotion, filters.season, filters.episode, filters.character, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_by_emotion( + self.__es, emotion, filters.season, filters.episode, filters.character, filters.limit, ), result_type="video", ) async def handle_character_search(self, character: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_by_character( - self._es, character, filters.season, filters.episode, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_by_character( + self.__es, character, filters.season, filters.episode, filters.limit, ), result_type="video", ) async def handle_object_search(self, object_query: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_by_object( - self._es, object_query, filters.season, filters.episode, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_by_object( + self.__es, object_query, filters.season, filters.episode, filters.limit, ), result_type="video", ) async def handle_hash_search(self, hash_value: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_perceptual_hash(self._es, hash_value, filters.limit), + return await self.__execute_search( + search_func=lambda: 
self.__queries.search_perceptual_hash(self.__es, hash_value, filters.limit), result_type="video", ) async def handle_episode_name_search(self, episode_name: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_episode_name( - self._es, episode_name, filters.season, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_episode_name( + self.__es, episode_name, filters.season, filters.limit, ), result_type="episode_name", ) async def handle_episode_name_semantic_search(self, episode_name: str, filters: SearchFilters) -> str: - return await self._execute_search( - search_func=lambda: self._queries.search_episode_name_semantic( - self._es, episode_name, filters.season, filters.limit, + return await self.__execute_search( + search_func=lambda: self.__queries.search_episode_name_semantic( + self.__es, episode_name, filters.season, filters.limit, ), result_type="episode_name", ) @@ -187,17 +178,33 @@ def compute_perceptual_hash(phash_input: str) -> Optional[str]: click.echo(f"Computing perceptual hash from image: {phash_input}", err=True) hash_svc = HashService() hash_value = hash_svc.get_perceptual_hash(str(phash_path)) + if hash_value: click.echo(f"Computed hash: {hash_value}", err=True) else: click.echo("Failed to compute hash from image", err=True) return None + hash_svc.cleanup() return hash_value + return phash_input + async def __execute_search( + self, + search_func: Callable[..., Awaitable[Dict[str, Any]]], + result_type: str, + result_key: str = "hits", + ) -> str: + result = await search_func() + + if self.__json_output: + return json.dumps(result.get(result_key, result), indent=2) + + return self.__format_console_output(result, result_type) + @staticmethod - def _format_console_output(result: Dict[str, Any], result_type: str) -> str: + def __format_console_output(result: Dict[str, Any], result_type: str) -> str: class __StringBuffer: def __init__(self) -> 
None: self.buffer: List[str] = [] @@ -209,7 +216,6 @@ def getvalue(self) -> str: return ''.join(self.buffer) buffer = __StringBuffer() - original_echo = click.echo def __buffer_echo(message: Optional[str] = None, **_kwargs: Any) -> None: diff --git a/preprocessor/cli/search_params.py b/preprocessor/cli/search_params.py index 00b85905d..f9930197d 100644 --- a/preprocessor/cli/search_params.py +++ b/preprocessor/cli/search_params.py @@ -5,9 +5,8 @@ from preprocessor.cli.search_handler import SearchFilters -@dataclass +@dataclass(frozen=True) class SearchQueryParams: - text: Optional[str] = None text_semantic: Optional[str] = None text_to_video: Optional[str] = None @@ -34,9 +33,8 @@ def has_search_criteria(self) -> bool: ]) -@dataclass +@dataclass(frozen=True) class SearchActionParams: - list_chars_flag: bool = False list_objects_flag: bool = False stats: bool = False @@ -49,9 +47,8 @@ def has_action(self) -> bool: ]) -@dataclass +@dataclass(frozen=True) class SearchConfig: - series: str query: SearchQueryParams filters: SearchFilters diff --git a/preprocessor/cli/skip_list_builder.py b/preprocessor/cli/skip_list_builder.py index 826d4fbee..e53016e45 100644 --- a/preprocessor/cli/skip_list_builder.py +++ b/preprocessor/cli/skip_list_builder.py @@ -10,12 +10,16 @@ class SkipListBuilder: @staticmethod def build( - cli_skip: Tuple[str, ...], - series_config: SeriesConfig, - logger: ErrorHandlingLogger, + cli_skip: Tuple[str, ...], + series_config: SeriesConfig, + logger: ErrorHandlingLogger, ) -> List[str]: skip_list = list(cli_skip) + if series_config.pipeline_mode == "selective" and series_config.skip_steps: - logger.info(f"Selective mode: auto-skipping {', '.join(series_config.skip_steps)}") + logger.info( + f"Selective mode: auto-skipping {', '.join(series_config.skip_steps)}", + ) skip_list.extend(series_config.skip_steps) + return list(set(skip_list)) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 1dd6c62fa..e0ded6ed1 100644 --- 
a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -3,12 +3,8 @@ field, ) import os -from pathlib import Path from typing import ( - Any, ClassVar, - Dict, - List, Optional, Tuple, ) @@ -19,7 +15,7 @@ from preprocessor.services.media.resolution import Resolution -@dataclass +@dataclass(frozen=True) class ElasticDocumentSubdirs: episode_names: str = 'episode_names' full_episode_embeddings: str = 'full_episode_embeddings' @@ -30,13 +26,15 @@ class ElasticDocumentSubdirs: text_statistics: str = 'text_statistics' video_frames: str = 'video_frames' -@dataclass + +@dataclass(frozen=True) class TranscriptionSubdirs: clean: str = 'clean' raw: str = 'raw' sound_events: str = 'sound_events' -@dataclass + +@dataclass(frozen=True) class OutputSubdirs: # pylint: disable=too-many-instance-attributes archives: str = 'archives' character_detections: str = 'character_detections' @@ -55,15 +53,17 @@ class OutputSubdirs: # pylint: disable=too-many-instance-attributes validation_reports: str = 'validation_reports' video: str = 'transcoded_videos' -@dataclass + +@dataclass(frozen=True) class BaseAPISettings: - _api_key: Optional[SecretStr] = None + _api_key: Optional[SecretStr] = field(default=None, repr=False) @property def api_key(self) -> Optional[str]: return self._api_key.get_secret_value() if self._api_key else None -@dataclass + +@dataclass(frozen=True) class TranscodeSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'transcoded_videos' @@ -73,29 +73,34 @@ class TranscodeSettings(OutputDirMixin): target_duration_seconds: float = 100.0 target_file_size_mb: float = 50.0 -@dataclass + +@dataclass(frozen=True) class SceneDetectionSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'scene_timestamps' min_scene_len: int = 10 threshold: float = 0.5 -@dataclass + +@dataclass(frozen=True) class SceneChangesSettings: frames_per_scene: int = 1 -@dataclass + +@dataclass(frozen=True) class KeyframeExtractionSettings: scene_changes: SceneChangesSettings = 
field(default_factory=SceneChangesSettings) strategy: str = 'scene_changes' -@dataclass + +@dataclass(frozen=True) class FrameExportSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'exported_frames' resolution: Resolution = Resolution.R1080P -@dataclass + +@dataclass(frozen=True) class TranscriptionSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'transcriptions' @@ -103,7 +108,8 @@ class TranscriptionSettings(OutputDirMixin): language: str = 'Polish' model: str = 'large-v3-turbo' -@dataclass + +@dataclass(frozen=True) class WhisperSettings: model: str = 'large-v3-turbo' @@ -111,13 +117,15 @@ class WhisperSettings: def _from_env(cls) -> 'WhisperSettings': return cls(model=os.getenv('WHISPER_MODEL', 'large-v3-turbo')) -@dataclass + +@dataclass(frozen=True) class TextChunkingSettings: segments_per_embedding: int = 5 text_chunk_overlap: int = 3 text_sentences_per_chunk: int = 8 -@dataclass + +@dataclass(frozen=True) class ElevenLabsSettings(BaseAPISettings): diarize: bool = True language_code: str = 'pol' @@ -132,7 +140,8 @@ def _from_env(cls) -> 'ElevenLabsSettings': api_key = SecretStr(os.getenv('ELEVEN_API_KEY', '')) return cls(_api_key=api_key) -@dataclass + +@dataclass(frozen=True) class EmbeddingModelSettings: embedding_dim: int = 4096 enable_chunked_prefill: bool = True @@ -145,7 +154,8 @@ class EmbeddingModelSettings: model_revision: str = 'main' tensor_parallel_size: int = 1 -@dataclass + +@dataclass(frozen=True) class EmbeddingSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'embeddings' @@ -155,12 +165,14 @@ class EmbeddingSettings(OutputDirMixin): progress_sub_batch_size: int = 100 text_batch_size: int = 64 -@dataclass + +@dataclass(frozen=True) class FaceRecognitionSettings: detection_size: Tuple[int, int] = (1280, 1280) model_name: str = 'buffalo_l' -@dataclass + +@dataclass(frozen=True) class FaceClusteringSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'face_clusters' @@ -168,7 +180,8 @@ class 
FaceClusteringSettings(OutputDirMixin): min_samples: int = 3 save_noise: bool = True -@dataclass + +@dataclass(frozen=True) class EmotionDetectionSettings: model_name: str = 'enet_b2_8' @@ -177,7 +190,8 @@ def _from_env(cls) -> 'EmotionDetectionSettings': model_name = os.getenv('EMOTION_MODEL_NAME', 'enet_b2_8') return cls(model_name=model_name) -@dataclass + +@dataclass(frozen=True) class CharacterSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'characters' @@ -187,18 +201,21 @@ class CharacterSettings(OutputDirMixin): reference_images_per_character: int = 3 reference_matching_threshold: float = 0.5 -@dataclass + +@dataclass(frozen=True) class ObjectDetectionSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'object_detections' conf_threshold: float = 0.3 model_name: str = 'ustc-community/dfine-xlarge-obj2coco' -@dataclass + +@dataclass(frozen=True) class ImageHashSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'image_hashes' -@dataclass + +@dataclass(frozen=True) class ImageScraperSettings(BaseAPISettings): max_results_to_scrape: int = 50 min_image_height: int = 600 @@ -220,23 +237,29 @@ def _from_env(cls) -> 'ImageScraperSettings': api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) return cls(_api_key=api_key) -@dataclass + +@dataclass(frozen=True) class ScraperSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'scraped_pages' -@dataclass + +@dataclass(frozen=True) class ElasticsearchSettings: host: str = '' - password: str = '' + password: str = field(default='', repr=False) user: str = '' @classmethod def _from_env(cls) -> 'ElasticsearchSettings': - return cls(host=os.getenv('ES_HOST', ''), user=os.getenv('ES_USER', ''), password=os.getenv('ES_PASS', '')) + return cls( + host=os.getenv('ES_HOST', ''), + user=os.getenv('ES_USER', ''), + password=os.getenv('ES_PASS', ''), + ) -@dataclass -class GeminiSettings(BaseAPISettings): +@dataclass(frozen=True) +class GeminiSettings(BaseAPISettings): @classmethod def _from_env(cls) -> 
'GeminiSettings': api_key = None @@ -244,7 +267,8 @@ def _from_env(cls) -> 'GeminiSettings': api_key = SecretStr(os.getenv('GEMINI_API_KEY', '')) return cls(_api_key=api_key) -@dataclass + +@dataclass(frozen=True) class Settings: # pylint: disable=too-many-instance-attributes character: CharacterSettings elasticsearch: ElasticsearchSettings @@ -293,65 +317,3 @@ def _from_env(cls) -> 'Settings': transcode=TranscodeSettings(), transcription=TranscriptionSettings(), ) - -@dataclass -class TranscodeConfig: - codec: str - gop_size: float - resolution: Resolution - transcoded_videos: Path - videos: Path - audio_bitrate_kbps: int = 128 - bufsize_mbps: Optional[float] = None - episodes_info_json: Optional[Path] = None - maxrate_mbps: Optional[float] = None - minrate_mbps: Optional[float] = None - video_bitrate_mbps: Optional[float] = None - - def to_dict(self) -> Dict[str, Any]: - return { - 'videos': self.videos, - 'transcoded_videos': self.transcoded_videos, - 'resolution': self.resolution, - 'codec': self.codec, - 'video_bitrate_mbps': self.video_bitrate_mbps, - 'minrate_mbps': self.minrate_mbps, - 'maxrate_mbps': self.maxrate_mbps, - 'bufsize_mbps': self.bufsize_mbps, - 'audio_bitrate_kbps': self.audio_bitrate_kbps, - 'gop_size': self.gop_size, - 'episodes_info_json': self.episodes_info_json, - } - -@dataclass -class TranscriptionConfig: - device: str - episodes_info_json: Path - language: str - model: str - name: str - transcription_jsons: Path - videos: Path - extra_json_keys_to_remove: List[str] = field(default_factory=list) - - def to_dict(self) -> Dict[str, Any]: - return { - 'videos': self.videos, - 'episodes_info_json': self.episodes_info_json, - 'transcription_jsons': self.transcription_jsons, - 'model': self.model, - 'language': self.language, - 'device': self.device, - 'extra_json_keys_to_remove': self.extra_json_keys_to_remove, - 'name': self.name, - } - -@dataclass -class IndexConfig: - name: str - transcription_jsons: Path - append: bool = False - dry_run: 
bool = False - - def to_dict(self) -> Dict[str, Any]: - return {'name': self.name, 'transcription_jsons': str(self.transcription_jsons), 'dry_run': self.dry_run, 'append': self.append} diff --git a/preprocessor/config/constants.py b/preprocessor/config/constants.py index 1dd71f4a0..7f25767b8 100644 --- a/preprocessor/config/constants.py +++ b/preprocessor/config/constants.py @@ -40,17 +40,20 @@ 'scenes_suffix': '_scenes.json', } + class EpisodesDataKeys: EPISODES = 'episodes' SEASONS = 'seasons' SEASON_NUMBER = 'season' + class EpisodeMetadataKeys: EPISODE_NUMBER = 'episode_number' PREMIERE_DATE = 'premiere_date' TITLE = 'title' VIEWERSHIP = 'viewership' + class FfprobeKeys: FORMAT = 'format' STREAMS = 'streams' diff --git a/preprocessor/config/enums.py b/preprocessor/config/enums.py index 00f3e5343..cebade622 100644 --- a/preprocessor/config/enums.py +++ b/preprocessor/config/enums.py @@ -4,6 +4,7 @@ class KeyframeStrategy(str, Enum): SCENE_CHANGES = 'scene_changes' + class FrameType(str, Enum): SCENE_END = 'scene_end' SCENE_SINGLE = 'scene_single' @@ -13,18 +14,22 @@ class FrameType(str, Enum): def scene_mid(index: int) -> str: return f'scene_mid_{index}' + class ScraperMethod(str, Enum): CLIPBOARD = 'clipboard' CRAWL4AI = 'crawl4ai' + class ParserMode(str, Enum): NORMAL = 'normal' PREMIUM = 'premium' + class TranscriptionFormat(str, Enum): ELEVENLABS = '11labs' ELEVENLABS_SEGMENTED = '11labs_segmented' + class Device(str, Enum): CPU = 'cpu' CUDA = 'cuda' diff --git a/preprocessor/config/mixins.py b/preprocessor/config/mixins.py index 80f210050..9134b99e4 100644 --- a/preprocessor/config/mixins.py +++ b/preprocessor/config/mixins.py @@ -5,11 +5,12 @@ class OutputDirMixin: - OUTPUT_SUBDIR: ClassVar[str] @classmethod def get_output_dir(cls, series_name: str) -> Path: if not hasattr(cls, 'OUTPUT_SUBDIR'): - raise NotImplementedError(f"{cls.__name__} must define OUTPUT_SUBDIR class variable") + raise NotImplementedError( + f"{cls.__name__} must define OUTPUT_SUBDIR 
class variable", + ) return get_base_output_dir(series_name) / cls.OUTPUT_SUBDIR diff --git a/preprocessor/config/output_paths.py b/preprocessor/config/output_paths.py index 1012cdb30..73b34c183 100644 --- a/preprocessor/config/output_paths.py +++ b/preprocessor/config/output_paths.py @@ -3,13 +3,17 @@ from preprocessor.services.core.environment import Environment -BASE_OUTPUT_DIR = Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data') -def get_base_output_dir(series_name: Optional[str]=None) -> Path: - base = Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data') +def get_base_output_dir(series_name: Optional[str] = None) -> Path: + if Environment.is_docker(): + base = Path('/app/output_data') + else: + base = Path('preprocessor/output_data') + if series_name: return base / series_name.lower() return base -def get_output_path(relative_path: str, series_name: Optional[str]=None) -> Path: + +def get_output_path(relative_path: str, series_name: Optional[str] = None) -> Path: return get_base_output_dir(series_name) / relative_path diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index f63e49a5a..822293fc3 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -55,14 +55,11 @@ class TranscriptionProcessingConfig: @dataclass class TranscodeProcessingConfig: - bufsize_mbps: float - codec: str + bitrate_reference_mb: float + bitrate_reference_seconds: float force_deinterlace: bool - gop_size: float - maxrate_mbps: float - minrate_mbps: float + keyframe_interval_seconds: float resolution: str - video_bitrate_mbps: float @dataclass @@ -152,14 +149,11 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': device=data['processing']['transcription']['device'], ), transcode=TranscodeProcessingConfig( - codec=data['processing']['transcode']['codec'], - resolution=data['processing']['transcode']['resolution'], - 
video_bitrate_mbps=data['processing']['transcode']['video_bitrate_mbps'], - minrate_mbps=data['processing']['transcode']['minrate_mbps'], - maxrate_mbps=data['processing']['transcode']['maxrate_mbps'], - bufsize_mbps=data['processing']['transcode']['bufsize_mbps'], - gop_size=data['processing']['transcode']['gop_size'], + bitrate_reference_mb=data['processing']['transcode']['bitrate_reference_mb'], + bitrate_reference_seconds=data['processing']['transcode']['bitrate_reference_seconds'], force_deinterlace=data['processing']['transcode']['force_deinterlace'], + keyframe_interval_seconds=data['processing']['transcode']['keyframe_interval_seconds'], + resolution=data['processing']['transcode']['resolution'], ), scene_detection=SceneDetectionProcessingConfig( threshold=data['processing']['scene_detection']['threshold'], diff --git a/preprocessor/config/settings_factory.py b/preprocessor/config/settings_factory.py index 55a5a003b..beb52eb5a 100644 --- a/preprocessor/config/settings_factory.py +++ b/preprocessor/config/settings_factory.py @@ -4,15 +4,14 @@ class SettingsFactory: + __instance: Optional[Settings] = None - _instance: Optional[Settings] = None + @classmethod + def get_settings(cls) -> Settings: + if cls.__instance is None: + cls.__instance = Settings._from_env() + return cls.__instance - @staticmethod - def get_settings() -> Settings: - if SettingsFactory._instance is None: - SettingsFactory._instance = Settings._from_env() - return SettingsFactory._instance - - @staticmethod - def reset(new_settings: Optional[Settings] = None) -> None: - SettingsFactory._instance = new_settings + @classmethod + def reset(cls, new_settings: Optional[Settings] = None) -> None: + cls.__instance = new_settings diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 1e8b407ac..850d6e8e2 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -6,53 +6,70 @@ from pydantic import ( BaseModel, + ConfigDict, 
Field, - model_validator, ) -from typing_extensions import Self from preprocessor.config.enums import KeyframeStrategy from preprocessor.services.media.resolution import Resolution class TranscodeConfig(BaseModel): - audio_bitrate_kbps: int = 128 - bufsize_mbps: float = Field(gt=0) - codec: str = Field(default='h264_nvenc') + model_config = ConfigDict(arbitrary_types_allowed=True) + + bitrate_reference_mb: float = Field(gt=0) + bitrate_reference_seconds: float = Field(gt=0) force_deinterlace: bool = False - gop_size: float = Field(gt=0) - maxrate_mbps: float = Field(gt=0) - minrate_mbps: float = Field(gt=0) - preset: str = 'p7' + keyframe_interval_seconds: float = Field(gt=0) resolution: Resolution = Field(default=Resolution.R720P) - video_bitrate_mbps: float = Field(gt=0) - class Config: - arbitrary_types_allowed = True + @property + def audio_bitrate_kbps(self) -> int: + return 128 + + @property + def codec(self) -> str: + return 'h264_nvenc' + + @property + def preset(self) -> str: + return 'p7' + + @property + def video_bitrate_mbps(self) -> float: + total = (self.bitrate_reference_mb * 8) / self.bitrate_reference_seconds + audio = self.audio_bitrate_kbps / 1000.0 + return round(total - audio, 2) + + def calculate_minrate_mbps(self, percent: float = 0.6) -> float: + return round(self.video_bitrate_mbps * percent, 2) + + def calculate_maxrate_mbps(self, percent: float = 1.4) -> float: + return round(self.video_bitrate_mbps * percent, 2) + + def calculate_bufsize_mbps(self, multiplier: float = 2.0) -> float: + return round(self.video_bitrate_mbps * multiplier, 2) - @model_validator(mode='after') - def __maxrate_must_be_greater_than_bitrate(self) -> Self: # pylint: disable=unused-private-member - if self.maxrate_mbps < self.video_bitrate_mbps: - raise ValueError('maxrate must be >= video_bitrate') - return self class SceneDetectionConfig(BaseModel): min_scene_len: int = Field(default=10, ge=1) threshold: float = Field(default=0.5, ge=0, le=1) + class 
FrameExportConfig(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + frames_per_scene: int = Field(default=3, ge=1) keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES resolution: Resolution = Field(default=Resolution.R720P) - class Config: - arbitrary_types_allowed = True class TranscriptionConfig(BaseModel): language: str = 'pl' model: str = 'large-v3' output_formats: List[str] = ['json', 'srt', 'txt'] + class WhisperTranscriptionConfig(BaseModel): beam_size: int = Field(default=10, ge=1) device: str = 'cuda' @@ -60,9 +77,11 @@ class WhisperTranscriptionConfig(BaseModel): model: str = 'large-v3-turbo' temperature: float = Field(default=0.0, ge=0.0, le=1.0) + class TextAnalysisConfig(BaseModel): language: str = 'pl' + class TextEmbeddingConfig(BaseModel): batch_size: int = Field(default=8, ge=1) device: str = 'cuda' @@ -70,48 +89,61 @@ class TextEmbeddingConfig(BaseModel): text_chunk_overlap: int = Field(default=1, ge=0) text_sentences_per_chunk: int = Field(default=5, ge=1) + class VideoEmbeddingConfig(BaseModel): batch_size: int = Field(default=8, ge=1) device: str = 'cuda' model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + class SoundSeparationConfig(BaseModel): pass + class DocumentGenerationConfig(BaseModel): generate_segments: bool = True + class ImageHashConfig(BaseModel): batch_size: int = Field(default=32, ge=1) + class TranscriptionImportConfig(BaseModel): format_type: str = '11labs_segmented' source_dir: str + class ElasticsearchConfig(BaseModel): append: bool = False dry_run: bool = False host: str = 'localhost:9200' index_name: str + class AudioExtractionConfig(BaseModel): pass + class CharacterDetectionConfig(BaseModel): threshold: float = Field(default=0.7, ge=0.0, le=1.0) + class EmotionDetectionConfig(BaseModel): pass + class FaceClusteringConfig(BaseModel): pass + class ObjectDetectionConfig(BaseModel): pass + class ArchiveConfig(BaseModel): pass + class ValidationConfig(BaseModel): anomaly_threshold: float = 
20.0 episodes_info_json: Optional[Path] = None diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index 59e488258..d5490e817 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -20,49 +20,49 @@ ) -def __get_default_step_configs(series_name: str) -> Dict[str, object]: - return { - 'transcode': TranscodeConfig( - video_bitrate_mbps=2.5, - minrate_mbps=1.5, - maxrate_mbps=3.5, - bufsize_mbps=5.0, - gop_size=2.0, - ), - 'transcribe': WhisperTranscriptionConfig( - model='large-v3-turbo', - language='pl', - device='cuda', - beam_size=5, - temperature=0.0, - ), - 'separate_sounds': SoundSeparationConfig(), - 'analyze_text': TextAnalysisConfig(language='pl'), - 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=10), - 'export_frames': FrameExportConfig(frames_per_scene=3), - 'text_embeddings': TextEmbeddingConfig( - model_name='Qwen/Qwen3-VL-Embedding-8B', - batch_size=8, - device='cuda', - text_sentences_per_chunk=5, - text_chunk_overlap=1, - ), - 'image_hashing': ImageHashConfig(batch_size=32), - 'video_embeddings': VideoEmbeddingConfig( - model_name='Qwen/Qwen3-VL-Embedding-8B', - batch_size=8, - device='cuda', - ), - 'character_detection': CharacterDetectionConfig(threshold=0.7), - 'emotion_detection': EmotionDetectionConfig(), - 'face_clustering': FaceClusteringConfig(), - 'object_detection': ObjectDetectionConfig(), - 'generate_elastic_documents': DocumentGenerationConfig(generate_segments=True), - 'generate_archives': ArchiveConfig(), - 'index': ElasticsearchConfig( - index_name=f'{series_name}_clips', - host='localhost:9200', - dry_run=False, - append=False, - ), - } +class DefaultConfigFactory: + @staticmethod + def get_configs(series_name: str) -> Dict[str, object]: + return { + 'transcode': TranscodeConfig( + bitrate_reference_mb=50.0, + bitrate_reference_seconds=100.0, + keyframe_interval_seconds=0.5, + ), + 'transcribe': WhisperTranscriptionConfig( + 
model='large-v3-turbo', + language='pl', + device='cuda', + beam_size=5, + temperature=0.0, + ), + 'separate_sounds': SoundSeparationConfig(), + 'analyze_text': TextAnalysisConfig(language='pl'), + 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=10), + 'export_frames': FrameExportConfig(frames_per_scene=3), + 'text_embeddings': TextEmbeddingConfig( + model_name='Qwen/Qwen3-VL-Embedding-8B', + batch_size=8, + device='cuda', + text_sentences_per_chunk=5, + text_chunk_overlap=1, + ), + 'image_hashing': ImageHashConfig(batch_size=32), + 'video_embeddings': VideoEmbeddingConfig( + model_name='Qwen/Qwen3-VL-Embedding-8B', + batch_size=8, + device='cuda', + ), + 'character_detection': CharacterDetectionConfig(threshold=0.7), + 'emotion_detection': EmotionDetectionConfig(), + 'face_clustering': FaceClusteringConfig(), + 'object_detection': ObjectDetectionConfig(), + 'generate_elastic_documents': DocumentGenerationConfig(generate_segments=True), + 'generate_archives': ArchiveConfig(), + 'index': ElasticsearchConfig( + index_name=f'{series_name}_clips', + host='localhost:9200', + dry_run=False, + append=False, + ), + } diff --git a/preprocessor/config/types/detection.py b/preprocessor/config/types/detection.py index 22e003f5e..a45d19291 100644 --- a/preprocessor/config/types/detection.py +++ b/preprocessor/config/types/detection.py @@ -11,12 +11,14 @@ class CharacterDetectionInFrame(TypedDict): embedding: NotRequired[List[float]] name: str + class ObjectDetectionInFrame(TypedDict): bbox: List[int] class_id: int class_name: str confidence: float + class Detection(TypedDict): bbox: List[int] class_id: NotRequired[int] diff --git a/preprocessor/config/types/episode.py b/preprocessor/config/types/episode.py index 446751b89..2d3d087ee 100644 --- a/preprocessor/config/types/episode.py +++ b/preprocessor/config/types/episode.py @@ -11,6 +11,7 @@ class EpisodeInfo(TypedDict): title: str viewership: Union[str, int, float] + class EpisodeMetadata(TypedDict): 
episode_number: int premiere_date: str @@ -19,6 +20,9 @@ class EpisodeMetadata(TypedDict): title: str viewership: Union[str, int, float] + class SeasonInfo(TypedDict): pass + + SeasonInfoDict = Dict[str, int] diff --git a/preprocessor/config/types/keys.py b/preprocessor/config/types/keys.py index 80479e7a7..5e2b69b3f 100644 --- a/preprocessor/config/types/keys.py +++ b/preprocessor/config/types/keys.py @@ -8,6 +8,7 @@ class SegmentKeys: TEXT = 'text' VIDEO_PATH = 'video_path' + class EpisodeMetadataKeys: EPISODE_INFO = 'episode_info' EPISODE_METADATA = 'episode_metadata' @@ -18,6 +19,7 @@ class EpisodeMetadataKeys: TITLE = 'title' VIEWERSHIP = 'viewership' + class ElasticsearchKeys: AGGREGATIONS = 'aggregations' BUCKETS = 'buckets' @@ -27,17 +29,20 @@ class ElasticsearchKeys: SOURCE = '_source' TOTAL = 'total' + class ElasticsearchAggregationKeys: SEASONS = 'seasons' UNIQUE_EPISODES = 'unique_episodes' VALUE = 'value' + class TranscriptionContextKeys: CONTEXT = 'context' OVERALL_END_TIME = 'overall_end_time' OVERALL_START_TIME = 'overall_start_time' TARGET = 'target' + class ElasticsearchQueryKeys: AGGS = 'aggs' ASC = 'asc' @@ -64,15 +69,18 @@ class ElasticsearchQueryKeys: TERMS = 'terms' TOP_HITS = 'top_hits' + class EpisodesDataKeys: EPISODES = 'episodes' SEASONS = 'seasons' SEASON_NUMBER = 'season_number' + class FfprobeKeys: FORMAT = 'format' STREAMS = 'streams' + class FfprobeStreamKeys: BIT_RATE = 'bit_rate' CODEC_NAME = 'codec_name' @@ -81,10 +89,12 @@ class FfprobeStreamKeys: R_FRAME_RATE = 'r_frame_rate' WIDTH = 'width' + class FfprobeFormatKeys: DURATION = 'duration' SIZE = 'size' + class DetectionKeys: CHARACTERS = 'characters' DETECTIONS = 'detections' @@ -93,22 +103,26 @@ class DetectionKeys: FRAME_NAME = 'frame_name' FRAME_NUMBER = 'frame_number' + class CharacterDetectionKeys: BBOX = 'bbox' CONFIDENCE = 'confidence' EMOTION = 'emotion' NAME = 'name' + class EmotionKeys: CONFIDENCE = 'confidence' LABEL = 'label' + class ObjectDetectionKeys: BBOX = 
'bbox' CLASS_ID = 'class_id' CLASS_NAME = 'class_name' CONFIDENCE = 'confidence' + class SceneKeys: END = 'end' SCENES = 'scenes' @@ -119,10 +133,12 @@ class SceneKeys: SCENE_START_TIME = 'scene_start_time' START = 'start' + class SceneTimeKeys: FRAME = 'frame' SECONDS = 'seconds' + class ElasticDocKeys: CHARACTER_APPEARANCES = 'character_appearances' DETECTED_OBJECTS = 'detected_objects' @@ -130,6 +146,7 @@ class ElasticDocKeys: PERCEPTUAL_HASH_INT = 'perceptual_hash_int' SCENE_INFO = 'scene_info' + class EmbeddingKeys: EMBEDDING = 'embedding' EPISODE_ID = 'episode_id' @@ -142,6 +159,7 @@ class EmbeddingKeys: TITLE = 'title' TITLE_EMBEDDING = 'title_embedding' + class ValidationMetadataKeys: CODEC = 'codec' DURATION = 'duration' @@ -152,6 +170,7 @@ class ValidationMetadataKeys: SIZE_MB = 'size_mb' WIDTH = 'width' + class WordKeys: END = 'end' START = 'start' @@ -160,10 +179,12 @@ class WordKeys: WORD = 'word' WORDS = 'words' + class WordTypeValues: AUDIO_EVENT = 'audio_event' SPACING = 'spacing' + class GoogleSearchKeys: API_KEY = 'api_key' ENGINE = 'engine' @@ -172,6 +193,7 @@ class GoogleSearchKeys: IMAGES_RESULTS = 'images_results' Q = 'q' + class ImageResultKeys: IMAGE = 'image' ORIGINAL = 'original' diff --git a/preprocessor/config/types/scene.py b/preprocessor/config/types/scene.py index 5b8d17999..ad98498d1 100644 --- a/preprocessor/config/types/scene.py +++ b/preprocessor/config/types/scene.py @@ -13,15 +13,18 @@ class SceneDict(TypedDict): start_frame: int start_time: float + class SceneTimestampPoint(TypedDict): frame: int seconds: float + class SceneTimestamp(TypedDict): end: SceneTimestampPoint scene_number: int start: SceneTimestampPoint + class SceneTimestampsData(TypedDict): fps: NotRequired[float] scenes: List[SceneTimestamp] diff --git a/preprocessor/config/types/search.py b/preprocessor/config/types/search.py index 2b930892e..390a30364 100644 --- a/preprocessor/config/types/search.py +++ b/preprocessor/config/types/search.py @@ -7,8 +7,6 @@ 
Union, ) -from .transcription import ElasticsearchSegment - class SearchSegment(TypedDict): end_time: float @@ -17,31 +15,45 @@ class SearchSegment(TypedDict): start_time: float title: str + +class ElasticsearchSegment(TypedDict): + end_time: float + episode_number: int + season: int + start_time: float + title: str + + class ElasticsearchHit(TypedDict): _score: float _source: ElasticsearchSegment + class ElasticsearchHits(TypedDict): hits: List[ElasticsearchHit] max_score: float total: Dict[str, Any] + class ElasticsearchResponse(TypedDict): aggregations: NotRequired[Dict[str, Any]] hits: ElasticsearchHits timed_out: bool took: int + class EpisodeBucket(TypedDict): doc_count: int episode_metadata: Dict[str, Any] key: int + class SeasonBucket(TypedDict): doc_count: int key: int unique_episodes: Dict[str, int] + class ElasticsearchAggregations(TypedDict): buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] seasons: Dict[str, Union[List[SeasonBucket], int]] diff --git a/preprocessor/config/types/transcription.py b/preprocessor/config/types/transcription.py index 04f52fbc3..c3c68463e 100644 --- a/preprocessor/config/types/transcription.py +++ b/preprocessor/config/types/transcription.py @@ -1,10 +1,18 @@ from typing import ( + Any, List, NotRequired, TypedDict, + Union, ) -from .episode import EpisodeMetadata +from preprocessor.config.types.episode import EpisodeMetadata + + +class ClipSegment(TypedDict): + end_time: float + start_time: float + video_path: Union[str, Any] class BaseSegment(TypedDict): @@ -13,6 +21,7 @@ class BaseSegment(TypedDict): start: float text: str + class SegmentWithTimes(TypedDict): end_time: float episode_metadata: EpisodeMetadata @@ -21,9 +30,11 @@ class SegmentWithTimes(TypedDict): text: str video_path: NotRequired[str] + class SegmentWithScore(SegmentWithTimes): _score: float + class ElasticsearchSegment(TypedDict): _score: NotRequired[float] end: NotRequired[float] @@ -37,6 +48,7 @@ class ElasticsearchSegment(TypedDict): text: 
str video_path: NotRequired[str] + class TranscriptionContext(TypedDict): context: List[BaseSegment] overall_end_time: float diff --git a/preprocessor/config/types/video.py b/preprocessor/config/types/video.py index 12cd0059f..5dbb0af4f 100644 --- a/preprocessor/config/types/video.py +++ b/preprocessor/config/types/video.py @@ -10,6 +10,7 @@ class HashResult(TypedDict): hash: str timestamp: float + class VideoMetadata(TypedDict): bitrate: NotRequired[int] codec: NotRequired[str] diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index 3400b24ec..c159d9735 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -19,21 +19,25 @@ class Artifact: pass + @dataclass(frozen=True) class EpisodeArtifact(Artifact): episode_id: str episode_info: 'EpisodeInfo' + @dataclass(frozen=True) class SourceVideo(EpisodeArtifact): path: Path + @dataclass(frozen=True) class TranscodedVideo(EpisodeArtifact): codec: str path: Path resolution: str + @dataclass(frozen=True) class SceneCollection(EpisodeArtifact): min_scene_len: int @@ -42,12 +46,14 @@ class SceneCollection(EpisodeArtifact): threshold: float video_path: Path + @dataclass(frozen=True) class FrameCollection(EpisodeArtifact): directory: Path frame_count: int metadata_path: Path + @dataclass(frozen=True) class TranscriptionData(EpisodeArtifact): format: str @@ -55,6 +61,7 @@ class TranscriptionData(EpisodeArtifact): model: str path: Path + @dataclass(frozen=True) class EmbeddingCollection(EpisodeArtifact): embedding_count: int @@ -62,63 +69,76 @@ class EmbeddingCollection(EpisodeArtifact): model_name: str path: Path + @dataclass(frozen=True) class DetectionResults(EpisodeArtifact): detection_count: int detection_type: str path: Path + @dataclass(frozen=True) class ElasticDocuments(EpisodeArtifact): document_count: int path: Path + @dataclass(frozen=True) class TextAnalysisResults(EpisodeArtifact): path: Path statistics: Dict[str, Any] metadata: Optional[Dict[str, Any]] = 
field(default=None) + @dataclass(frozen=True) class AudioArtifact(EpisodeArtifact): format: str path: Path + @dataclass(frozen=True) class IndexingResult(Artifact): document_count: int index_name: str success: bool + @dataclass(frozen=True) class ImageHashCollection(EpisodeArtifact): hash_count: int path: Path + @dataclass(frozen=True) class EmotionData(EpisodeArtifact): path: Path + @dataclass(frozen=True) class ClusterData(EpisodeArtifact): path: Path + @dataclass(frozen=True) class ObjectDetectionData(EpisodeArtifact): path: Path + @dataclass(frozen=True) class ArchiveArtifact(EpisodeArtifact): path: Path + @dataclass(frozen=True) class ValidationResult(Artifact): season: str validation_report_dir: Path + @dataclass(frozen=True) class ResolutionAnalysisResult(Artifact): total_files: int upscaling_percentage: float + ProcessedEpisode = ElasticDocuments diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 33579b635..2828cf3c1 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -21,18 +21,11 @@ class PipelineStep(ABC, Generic[InputT, OutputT, ConfigT]): def __init__(self, config: ConfigT) -> None: - self._config: ConfigT = config - - def cleanup(self) -> None: - pass + self.__config: ConfigT = config @property def config(self) -> ConfigT: - return self._config - - @abstractmethod - def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: - pass + return self.__config @property @abstractmethod @@ -43,6 +36,13 @@ def name(self) -> str: def is_global(self) -> bool: return False + @abstractmethod + def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: + pass + + def cleanup(self) -> None: + pass + def _check_cache_validity( self, output_path: Path, @@ -50,7 +50,7 @@ def _check_cache_validity( episode_id: str, cache_description: str, ) -> bool: - if output_path.exists() and (not context.force_rerun): + if output_path.exists() and not context.force_rerun: if 
context.is_step_completed(self.name, episode_id): context.logger.info(f'Skipping {episode_id} ({cache_description})') return True diff --git a/preprocessor/core/context.py b/preprocessor/core/context.py index 95dceb05f..de44c53db 100644 --- a/preprocessor/core/context.py +++ b/preprocessor/core/context.py @@ -13,79 +13,75 @@ from preprocessor.core.state_manager import StateManager from preprocessor.services.episodes.episode_manager import EpisodeInfo -class ExecutionContext: +class ExecutionContext: def __init__( - self, - series_name: str, - base_output_dir: Path, - logger: ErrorHandlingLogger, - state_manager: Optional['StateManager'] = None, - force_rerun: bool = False, - settings: Optional[Settings] = None, + self, + series_name: str, + base_output_dir: Path, + logger: ErrorHandlingLogger, + state_manager: Optional['StateManager'] = None, + force_rerun: bool = False, + settings_instance: Optional[Settings] = None, ) -> None: - self._series_name: str = series_name - self._base_output_dir: Path = base_output_dir / series_name - self._state_manager: Optional['StateManager'] = state_manager - self._force_rerun: bool = force_rerun - self._logger: ErrorHandlingLogger = logger - self._settings: Settings = settings or SettingsFactory.get_settings() + self.__series_name: str = series_name + self.__base_output_dir: Path = base_output_dir / series_name + self.__state_manager: Optional['StateManager'] = state_manager + self.__force_rerun: bool = force_rerun + self.__logger: ErrorHandlingLogger = logger + self.__settings: Settings = settings_instance or SettingsFactory.get_settings() @property def force_rerun(self) -> bool: - return self._force_rerun + return self.__force_rerun + + @property + def logger(self) -> ErrorHandlingLogger: + return self.__logger + + @property + def series_name(self) -> str: + return self.__series_name + + @property + def settings(self) -> Settings: + """Get active Settings instance for this context.""" + return self.__settings + + @property + def 
state_manager(self) -> Optional['StateManager']: + return self.__state_manager def get_output_path( - self, episode_info: 'EpisodeInfo', subdir: str, filename: str, + self, episode_info: 'EpisodeInfo', subdir: str, filename: str, ) -> Path: season_code: str = episode_info.season_code() episode_code: str = episode_info.episode_num() - path: Path = ( - self._base_output_dir / subdir / season_code / episode_code / filename - ) + + path = self.__base_output_dir / subdir / season_code / episode_code / filename path.parent.mkdir(parents=True, exist_ok=True) return path def get_season_output_path( - self, episode_info: 'EpisodeInfo', subdir: str, filename: str, + self, episode_info: 'EpisodeInfo', subdir: str, filename: str, ) -> Path: season_code: str = episode_info.season_code() - path: Path = self._base_output_dir / subdir / season_code / filename + + path = self.__base_output_dir / subdir / season_code / filename path.parent.mkdir(parents=True, exist_ok=True) return path def is_step_completed(self, step_name: str, episode_id: str) -> bool: - if not self._state_manager: + if not self.__state_manager: return False - return self._state_manager.is_step_completed(step_name, episode_id) - - @property - def logger(self) -> ErrorHandlingLogger: - return self._logger + return self.__state_manager.is_step_completed(step_name, episode_id) def mark_step_completed(self, step_name: str, episode_id: str) -> None: - if self._state_manager: - self._state_manager.mark_step_completed(step_name, episode_id) + if self.__state_manager: + self.__state_manager.mark_step_completed(step_name, episode_id) def mark_step_started( - self, step_name: str, episode_id: str, temp_files: Optional[List[str]] = None, + self, step_name: str, episode_id: str, temp_files: Optional[List[str]] = None, ) -> None: - if self._state_manager: - self._state_manager.mark_step_started(step_name, episode_id, temp_files) - - @property - def series_name(self) -> str: - return self._series_name - - @property - def 
settings(self) -> Settings: - """Get settings instance. - - Returns: - The active Settings instance for this context. - """ - return self._settings - - @property - def state_manager(self) -> Optional['StateManager']: - return self._state_manager + if self.__state_manager: + self.__state_manager.mark_step_started(step_name, episode_id, temp_files) diff --git a/preprocessor/core/processing_metadata.py b/preprocessor/core/processing_metadata.py index 06e7a6484..502bd1bdc 100644 --- a/preprocessor/core/processing_metadata.py +++ b/preprocessor/core/processing_metadata.py @@ -44,67 +44,97 @@ def to_dict(self) -> Dict[str, Any]: 'extra_info': self.extra_info, } -class ProcessingMetadata: +class ProcessingMetadata: def __init__(self, series_name: str, params: Dict[str, Any]) -> None: - self.series_name = series_name - self.params = self.__sanitize_params(params) - self.start_time = datetime.now() - self.end_time: Optional[datetime] = None - self.total_duration_seconds: Optional[float] = None - self.steps: List[StepMetadata] = [] - self.final_status = 'running' + self.__series_name = series_name + self.__params = self.__sanitize_params(params) + self.__start_time = datetime.now() + self.__end_time: Optional[datetime] = None + self.__total_duration_seconds: Optional[float] = None + self.__steps: List[StepMetadata] = [] + self.__final_status = 'running' + + @property + def final_status(self) -> str: + return self.__final_status + + @final_status.setter + def final_status(self, value: str) -> None: + self.__final_status = value + + @property + def end_time(self) -> Optional[datetime]: + return self.__end_time + + @end_time.setter + def end_time(self, value: datetime) -> None: + self.__end_time = value + + @property + def total_duration_seconds(self) -> Optional[float]: + return self.__total_duration_seconds + + @total_duration_seconds.setter + def total_duration_seconds(self, value: float) -> None: + self.__total_duration_seconds = value def add_step(self, name: str, 
step_num: str) -> StepMetadata: step = StepMetadata(name=name, step_num=step_num) - self.steps.append(step) + self.__steps.append(step) return step def to_dict(self) -> Dict[str, Any]: return { - 'series_name': self.series_name, - 'start_time': self.start_time.isoformat(), - 'end_time': self.end_time.isoformat() if self.end_time else None, - 'final_status': self.final_status, - 'parameters': self.params, - 'steps': [step.to_dict() for step in self.steps], + 'series_name': self.__series_name, + 'start_time': self.__start_time.isoformat(), + 'end_time': self.__end_time.isoformat() if self.__end_time else None, + 'final_status': self.__final_status, + 'parameters': self.__params, + 'steps': [step.to_dict() for step in self.__steps], 'statistics': self.__get_statistics(), } def __get_statistics(self) -> Dict[str, Any]: - completed_steps = [s for s in self.steps if s.status == 'success'] - failed_steps = [s for s in self.steps if s.status == 'failed'] - skipped_steps = [s for s in self.steps if s.status == 'skipped'] + completed_steps = [s for s in self.__steps if s.status == 'success'] + failed_steps = [s for s in self.__steps if s.status == 'failed'] + skipped_steps = [s for s in self.__steps if s.status == 'skipped'] + step_durations = [ - s.duration_seconds for s in self.steps if s.duration_seconds is not None + s.duration_seconds for s in self.__steps if s.duration_seconds is not None ] + + avg_duration = ( + round(sum(step_durations) / len(step_durations), 2) + if step_durations else None + ) + return { - 'total_steps': len(self.steps), + 'total_steps': len(self.__steps), 'completed_steps': len(completed_steps), 'failed_steps': len(failed_steps), 'skipped_steps': len(skipped_steps), 'total_duration_seconds': ( - round(self.total_duration_seconds, 2) - if self.total_duration_seconds - else None - ), - 'average_step_duration_seconds': ( - round(sum(step_durations) / len(step_durations), 2) - if step_durations - else None + round(self.__total_duration_seconds, 2) + 
if self.__total_duration_seconds else None ), + 'average_step_duration_seconds': avg_duration, } @staticmethod def __sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: - sanitized = {} + sanitized: Dict[str, Any] = {} + ignored_keys = {'state_manager'} + for key, value in params.items(): - if key in set('state_manager'): + if key in ignored_keys: continue + if isinstance(value, Path): sanitized[key] = str(value) elif isinstance(value, (str, int, float, bool, list, dict, type(None))): sanitized[key] = value else: sanitized[key] = str(value) + return sanitized diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 5bf62b80b..9c2023f05 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -16,19 +16,21 @@ from preprocessor.services.ui.console import console -@dataclass +@dataclass(frozen=True) class StepCheckpoint: completed_at: str episode: str step: str -@dataclass + +@dataclass(frozen=True) class InProgressStep: episode: str started_at: str step: str temp_files: List[str] = field(default_factory=list) + @dataclass class ProcessingState: last_checkpoint: str @@ -47,7 +49,7 @@ def to_dict(self) -> Dict[str, Any]: } @classmethod - def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': + def from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': completed_steps = [ StepCheckpoint(**step) for step in data.get('completed_steps', []) ] @@ -55,6 +57,7 @@ def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': in_progress = ( InProgressStep(**in_progress_data) if in_progress_data else None ) + return cls( series_name=data['series_name'], started_at=data['started_at'], @@ -63,12 +66,14 @@ def _from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': in_progress=in_progress, ) + class StateManager: - STATE_FILE_TEMPLATE: str = '.preprocessing_state_{series}.json' + __STATE_FILE_TEMPLATE: str = '.preprocessing_state_{series}.json' def __init__(self, series_name: str, 
working_dir: Path = Path('.')) -> None: - self.__series_name: str = series_name - state_filename: str = self.STATE_FILE_TEMPLATE.format(series=series_name) + self.__series_name = series_name + + state_filename = self.__STATE_FILE_TEMPLATE.format(series=series_name) self.__state_file: Path = working_dir / state_filename self.__state: Optional[ProcessingState] = None @@ -80,49 +85,37 @@ def cleanup(self) -> None: def is_step_completed(self, step: str, episode: str) -> bool: if self.__state is None: return False + return any( - (s.step == step and s.episode == episode) + s.step == step and s.episode == episode for s in self.__state.completed_steps ) def load_or_create_state(self) -> ProcessingState: if self.__state_file.exists(): - console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') - with open(self.__state_file, 'r', encoding='utf-8') as f: - data = json.load(f) - self.__state = ProcessingState._from_dict(data) - console.print(f'[green]Loaded state for series: {self.__state.series_name}[/green]') - console.print(f'[green]Completed steps: {len(self.__state.completed_steps)}[/green]') - return self.__state - else: - console.print('[blue]Creating new processing state...[/blue]') - now = datetime.now().isoformat() - self.__state = ProcessingState( - series_name=self.__series_name, - started_at=now, - last_checkpoint=now, - ) - self.__save_state() - return self.__state + return self.__load_existing_state() + return self.__create_new_state() def mark_step_completed(self, step: str, episode: str) -> None: - if self.__state is None: - raise RuntimeError('State not initialized') + self.__ensure_state_initialized() + checkpoint = StepCheckpoint( step=step, episode=episode, completed_at=datetime.now().isoformat(), ) + self.__state.completed_steps.append(checkpoint) self.__state.in_progress = None self.__save_state() + console.print(f'[green]Completed: {step} for {episode}[/green]') def mark_step_started( - self, step: str, episode: str, temp_files: 
Optional[List[str]] = None, + self, step: str, episode: str, temp_files: Optional[List[str]] = None, ) -> None: - if self.__state is None: - raise RuntimeError('State not initialized') + self.__ensure_state_initialized() + self.__state.in_progress = InProgressStep( step=step, episode=episode, @@ -130,11 +123,41 @@ def mark_step_started( temp_files=temp_files or [], ) self.__save_state() + console.print(f'[cyan]Started: {step} for {episode}[/cyan]') + def __load_existing_state(self) -> ProcessingState: + console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') + + with open(self.__state_file, 'r', encoding='utf-8') as f: + data = json.load(f) + + self.__state = ProcessingState.from_dict(data) + + console.print(f'[green]Loaded state for series: {self.__state.series_name}[/green]') + console.print(f'[green]Completed steps: {len(self.__state.completed_steps)}[/green]') + return self.__state + + def __create_new_state(self) -> ProcessingState: + console.print('[blue]Creating new processing state...[/blue]') + now = datetime.now().isoformat() + + self.__state = ProcessingState( + series_name=self.__series_name, + started_at=now, + last_checkpoint=now, + ) + self.__save_state() + return self.__state + + def __ensure_state_initialized(self) -> None: + if self.__state is None: + raise RuntimeError('State not initialized. 
Call load_or_create_state() first.') + def __save_state(self) -> None: if self.__state is None: return + self.__state.last_checkpoint = datetime.now().isoformat() with open(self.__state_file, 'w', encoding='utf-8') as f: json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index fb21e6f8e..f22e0f78e 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -18,14 +18,11 @@ "threshold": 0.5 }, "transcode": { - "bufsize_mbps": 5.0, - "codec": "h264_nvenc", + "bitrate_reference_mb": 50.0, + "bitrate_reference_seconds": 100.0, "force_deinterlace": false, - "gop_size": 2.0, - "maxrate_mbps": 3.5, - "minrate_mbps": 1.5, - "resolution": "720p", - "video_bitrate_mbps": 2.5 + "keyframe_interval_seconds": 0.5, + "resolution": "720p" }, "transcription": { "device": "cuda", diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index 862452d4b..ac0dd02ec 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -28,12 +28,12 @@ class VLLMClient(BaseLLMClient): __DEFAULT_MODEL_NAME = 'Qwen/Qwen2.5-Coder-7B-Instruct' def __init__(self, model_name: Optional[str] = None) -> None: - self._model_name = model_name or self.__DEFAULT_MODEL_NAME - self._model: Optional[LLM] = None + self.__model_name = model_name or self.__DEFAULT_MODEL_NAME + self.__model: Optional[LLM] = None self.__load_model() def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str: - if self._model is None: + if self.__model is None: raise RuntimeError('Model not initialized') sampling_params = SamplingParams( @@ -43,14 +43,14 @@ def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> s max_tokens=max_tokens, repetition_penalty=1.05, ) - outputs = self._model.chat(messages=[messages], sampling_params=sampling_params) + outputs = 
self.__model.chat(messages=[messages], sampling_params=sampling_params) return outputs[0].outputs[0].text.strip() def __load_model(self) -> None: - console.print(f'[cyan]Loading LLM: {self._model_name} (vLLM, 128K context)[/cyan]') + console.print(f'[cyan]Loading LLM: {self.__model_name} (vLLM, 128K context)[/cyan]') try: - self._model = LLM( - model=self._model_name, + self.__model = LLM( + model=self.__model_name, trust_remote_code=True, max_model_len=131072, gpu_memory_utilization=0.95, @@ -71,30 +71,31 @@ class GeminiClient(BaseLLMClient): __GEMINI_MODEL_NAME = 'gemini-2.5-flash' def __init__(self) -> None: - self._client: Optional[OpenAI] = None + self.__client: Optional[OpenAI] = None self.__init_client() def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> str: - if self._client is None: + if self.__client is None: raise RuntimeError('Gemini client not initialized') - response = self._client.chat.completions.create( + response = self.__client.chat.completions.create( model=self.__GEMINI_MODEL_NAME, messages=messages, ) return response.choices[0].message.content.strip() def __init_client(self) -> None: - console.print('[cyan]Initializing Gemini 2.5 Flash via OpenAI SDK...[/cyan]') + console.print(f'[cyan]Initializing {self.__GEMINI_MODEL_NAME} via OpenAI SDK...[/cyan]') try: api_key = settings.gemini.api_key if not api_key: raise ValueError('GEMINI_API_KEY not set in environment') - self._client = OpenAI( + + self.__client = OpenAI( base_url='https://generativelanguage.googleapis.com/v1beta/openai/', api_key=api_key, ) - console.print('[green]Gemini 2.5 Flash initialized[/green]') + console.print(f'[green]{self.__GEMINI_MODEL_NAME} initialized[/green]') except Exception as e: console.print(f'[red]Failed to initialize Gemini client: {e}[/red]') raise diff --git a/preprocessor/services/ai/models.py b/preprocessor/services/ai/models.py index 6b7d06ef7..fbd102592 100644 --- a/preprocessor/services/ai/models.py +++ 
b/preprocessor/services/ai/models.py @@ -20,7 +20,7 @@ class EpisodeInfo(BaseModel): @field_validator('viewership', mode='before') @classmethod - def __convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: # pylint: disable=unused-private-member + def _convert_viewership_to_str(cls, v: Optional[int]) -> Optional[str]: if v is None: return None if isinstance(v, int): @@ -34,7 +34,7 @@ class SeasonMetadata(BaseModel): @model_validator(mode='before') @classmethod - def __convert_old_format(cls, data: Dict) -> Dict: # pylint: disable=unused-private-member + def _convert_old_format(cls, data: Dict) -> Dict: if isinstance(data, dict) and 'episodes' in data: for idx, episode in enumerate(data['episodes'], start=1): if isinstance(episode, dict) and 'episode_number' in episode and ('episode_in_season' not in episode): diff --git a/preprocessor/services/ai/provider.py b/preprocessor/services/ai/provider.py index 9adbcc68b..6c44c462e 100644 --- a/preprocessor/services/ai/provider.py +++ b/preprocessor/services/ai/provider.py @@ -31,16 +31,21 @@ class LLMProvider: + def __init__( + self, + model_name: Optional[str] = None, + parser_mode: Optional[ParserMode] = None, + ) -> None: + self.__parser_mode = parser_mode or ParserMode.NORMAL - def __init__(self, model_name: Optional[str] = None, parser_mode: Optional[ParserMode] = None) -> None: - self._parser_mode = parser_mode or ParserMode.NORMAL - - if self._parser_mode == ParserMode.PREMIUM: - self._client: BaseLLMClient = GeminiClient() + if self.__parser_mode == ParserMode.PREMIUM: + self.__client: BaseLLMClient = GeminiClient() else: - self._client: BaseLLMClient = VLLMClient(model_name=model_name) + self.__client: BaseLLMClient = VLLMClient(model_name=model_name) - def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[List[SeasonMetadata]]: + def extract_all_seasons( + self, scraped_pages: List[Dict[str, Any]], + ) -> Optional[List[SeasonMetadata]]: combined_content = 
self.__build_combined_content(scraped_pages) result = self.__process_llm_request( @@ -55,9 +60,9 @@ def extract_all_seasons(self, scraped_pages: List[Dict[str, Any]]) -> Optional[L return result.seasons if result else None def extract_characters( - self, - scraped_pages: List[Dict[str, Any]], - series_name: str, + self, + scraped_pages: List[Dict[str, Any]], + series_name: str, ) -> Optional[List[CharacterInfo]]: combined_content = self.__build_combined_content(scraped_pages) @@ -75,14 +80,6 @@ def extract_characters( @staticmethod def __build_combined_content(scraped_pages: List[Dict[str, Any]]) -> str: - """Build combined markdown from scraped pages. - - Args: - scraped_pages: List of scraped page dictionaries with 'url' and 'markdown' keys. - - Returns: - Combined content with source separators. - """ combined_parts: List[str] = [] for i, page in enumerate(scraped_pages, 1): url: str = page['url'] @@ -105,6 +102,7 @@ def __extract_json(content: str) -> Dict[str, Any]: json_str = content[start:end].strip() else: json_str = content.strip() + return json.loads(json_str) except json.JSONDecodeError as e: console.print(f'[red]JSON parse error: {e}[/red]') @@ -112,18 +110,18 @@ def __extract_json(content: str) -> Dict[str, Any]: raise def __process_llm_request( - self, - system_prompt: str, - user_prompt: str, - response_model: Type[BaseModel], - error_context: str, + self, + system_prompt: str, + user_prompt: str, + response_model: Type[BaseModel], + error_context: str, ) -> Optional[BaseModel]: try: messages = [ {'role': 'system', 'content': system_prompt}, {'role': 'user', 'content': user_prompt}, ] - content = self._client.generate(messages) + content = self.__client.generate(messages) data = self.__extract_json(content) return response_model(**data) except Exception as e: diff --git a/preprocessor/services/audio/extraction.py b/preprocessor/services/audio/extraction.py index 50f21c367..fe3348050 100644 --- a/preprocessor/services/audio/extraction.py +++ 
b/preprocessor/services/audio/extraction.py @@ -12,37 +12,48 @@ class AudioExtractionStep(PipelineStep[SourceVideo, AudioArtifact, AudioExtractionConfig]): + @property + def name(self) -> str: + return 'audio_extraction' def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioArtifact: + output_path = self.__resolve_output_path(input_data, context) + + if self.__is_cached(input_data, output_path, context): + context.logger.info(f'Skipping {input_data.episode_id} (cached audio)') + return self.__create_artifact(input_data, output_path) + + context.logger.info(f'Extracting audio for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + self.__extract_audio(input_data.path, output_path, context) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__create_artifact(input_data, output_path) + + def __resolve_output_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: episode_code = input_data.episode_info.episode_code() - output_filename: str = ( - f'{context.series_name}_{episode_code}.{self.config.format}' - ) - output_path: Path = context.get_output_path( + output_filename = f'{context.series_name}_{episode_code}.{self.config.format}' + + return context.get_output_path( input_data.episode_info, 'extracted_audio', output_filename, ) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached audio)') - return AudioArtifact( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - format=self.config.format, - ) - context.logger.info(f'Extracting audio for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - command: List[str] = [ - 'ffmpeg', '-y', '-v', 'error', - '-i', str(input_data.path), - '-vn', - '-acodec', 'pcm_s16le', - '-ar', 
str(self.config.sample_rate), - '-ac', str(self.config.channels), - str(output_path), - ] + + def __is_cached( + self, input_data: SourceVideo, output_path: Path, context: ExecutionContext, + ) -> bool: + if not output_path.exists() or context.force_rerun: + return False + + return context.is_step_completed(self.name, input_data.episode_id) + + def __extract_audio( + self, input_path: Path, output_path: Path, context: ExecutionContext, + ) -> None: + command = self.__build_ffmpeg_command(input_path, output_path) + try: subprocess.run(command, check=True) except subprocess.CalledProcessError as e: @@ -50,14 +61,22 @@ def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioAr if output_path.exists(): output_path.unlink() raise - context.mark_step_completed(self.name, input_data.episode_id) + + def __build_ffmpeg_command(self, input_path: Path, output_path: Path) -> List[str]: + return [ + 'ffmpeg', '-y', '-v', 'error', + '-i', str(input_path), + '-vn', # Disable video processing + '-acodec', 'pcm_s16le', + '-ar', str(self.config.sample_rate), + '-ac', str(self.config.channels), + str(output_path), + ] + + def __create_artifact(self, input_data: SourceVideo, output_path: Path) -> AudioArtifact: return AudioArtifact( episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path, format=self.config.format, ) - - @property - def name(self) -> str: - return 'audio_extraction' diff --git a/preprocessor/services/characters/face_detection.py b/preprocessor/services/characters/face_detection.py index 9acb35b77..ee1b59dcf 100644 --- a/preprocessor/services/characters/face_detection.py +++ b/preprocessor/services/characters/face_detection.py @@ -5,6 +5,7 @@ Dict, List, Optional, + Tuple, ) import warnings @@ -17,58 +18,130 @@ from preprocessor.config.settings_instance import settings from preprocessor.services.ui.console import console -warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', 
category=FutureWarning, module='insightface') +warnings.filterwarnings( + 'ignore', + message='.*estimate.*is deprecated.*', + category=FutureWarning, + module='insightface', +) -class FaceDetector: +class FaceDetector: @staticmethod def detect_characters_in_frame( - frame_path: Path, - face_app: FaceAnalysis, - character_vectors: Dict[str, np.ndarray], - threshold: float, + frame_path: Path, + face_app: FaceAnalysis, + character_vectors: Dict[str, np.ndarray], + threshold: float, ) -> List[Dict[str, Any]]: img = cv2.imread(str(frame_path)) if img is None: return [] + faces = face_app.get(img) if not faces: return [] + detected = [] for face in faces: - face_embedding = face.normed_embedding - bbox = face.bbox.astype(int) - best_match = None - best_similarity = threshold - for char_name, char_vector in character_vectors.items(): - similarity = np.dot(face_embedding, char_vector) - if similarity > best_similarity: - best_similarity = similarity - best_match = char_name - if best_match is not None: - detected.append({ - 'name': best_match, - 'confidence': float(best_similarity), - 'bbox': { - 'x1': int(bbox[0]), - 'y1': int(bbox[1]), - 'x2': int(bbox[2]), - 'y2': int(bbox[3]), - }, - }) + match = FaceDetector.__find_best_match( + face.normed_embedding, character_vectors, threshold, + ) + if match: + char_name, confidence = match + detected.append( + FaceDetector.__format_detection_result(char_name, confidence, face.bbox), + ) + detected.sort(key=lambda x: x['confidence'], reverse=True) return detected @staticmethod def init() -> FaceAnalysis: model_root = os.getenv('INSIGHTFACE_HOME', os.path.expanduser('~/.insightface')) + FaceDetector.__check_cuda_availability() + + providers = FaceDetector.__build_providers_config() + + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', category=UserWarning, module='onnxruntime') + warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') + + face_app = 
FaceDetector.__init_face_app(model_root, providers) + FaceDetector.__verify_active_providers(face_app) + + FaceDetector.__print_init_success(model_root) + return face_app + + @staticmethod + def load_character_references( + characters_dir: Path, + face_app: FaceAnalysis, + ) -> Dict[str, np.ndarray]: + console.print('[blue]Loading character references...[/blue]') + character_vectors: Dict[str, np.ndarray] = {} + + for char_dir in characters_dir.iterdir(): + if not char_dir.is_dir(): + continue + + char_name = char_dir.name.replace('_', ' ').title() + vector = FaceDetector.__load_or_compute_vector(char_dir, char_name, face_app) + + if vector is not None: + character_vectors[char_name] = vector + + console.print(f'[green]Loaded {len(character_vectors)} characters[/green]') + return character_vectors + + @staticmethod + def __find_best_match( + face_embedding: np.ndarray, + character_vectors: Dict[str, np.ndarray], + threshold: float, + ) -> Optional[Tuple[str, float]]: + best_match = None + best_similarity = threshold + + for char_name, char_vector in character_vectors.items(): + similarity = float(np.dot(face_embedding, char_vector)) + if similarity > best_similarity: + best_similarity = similarity + best_match = char_name + + return (best_match, best_similarity) if best_match else None + + @staticmethod + def __format_detection_result( + char_name: str, + confidence: float, + bbox: np.ndarray, + ) -> Dict[str, Any]: + bbox_int = bbox.astype(int) + return { + 'name': char_name, + 'confidence': confidence, + 'bbox': { + 'x1': int(bbox_int[0]), + 'y1': int(bbox_int[1]), + 'x2': int(bbox_int[2]), + 'y2': int(bbox_int[3]), + }, + } + + @staticmethod + def __check_cuda_availability() -> None: available_providers = ort.get_available_providers() console.print(f"[dim]Available ONNX providers: {', '.join(available_providers)}[/dim]") + if 'CUDAExecutionProvider' not in available_providers: console.print('[red]CUDAExecutionProvider not available in onnxruntime[/red]') 
console.print('[red] Check if onnxruntime-gpu is installed and CUDA libraries are accessible[/red]') raise RuntimeError('CUDA provider not available in onnxruntime') - providers = [( + + @staticmethod + def __build_providers_config() -> List[Tuple[str, Dict[str, Any]]]: + return [( 'CUDAExecutionProvider', { 'device_id': 0, @@ -78,66 +151,90 @@ def init() -> FaceAnalysis: 'do_copy_in_default_stream': True, }, )] - with warnings.catch_warnings(): - warnings.filterwarnings('ignore', category=UserWarning, module='onnxruntime') - warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') - console.print(f'[cyan]Loading {settings.face_recognition.model_name} face detection model (GPU-only)...[/cyan]') - try: - face_app = FaceAnalysis(name=settings.face_recognition.model_name, root=model_root, providers=providers) - face_app.prepare(ctx_id=0, det_size=settings.face_recognition.detection_size, det_thresh=settings.character.face_detection_threshold) - except Exception as e: - console.print('[red]Failed to initialize face detection on GPU[/red]') - console.print(f'[red] Error: {e}[/red]') - console.print('[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]') - raise RuntimeError('GPU required but face detection initialization failed') from e - actual_providers = face_app.models['detection'].session.get_providers() - if 'CUDAExecutionProvider' not in actual_providers: - console.print('[red]CUDA provider not active after initialization[/red]') - console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") - raise RuntimeError('CUDA required but not available for face detection') - console.print(f'[green]Face detection initialized ({settings.face_recognition.model_name})[/green]') + + @staticmethod + def __init_face_app( + model_root: str, + providers: List[Tuple[str, Dict[str, Any]]], + ) -> FaceAnalysis: + model_name = settings.face_recognition.model_name + console.print(f'[cyan]Loading {model_name} face detection model 
(GPU-only)...[/cyan]') + + try: + face_app = FaceAnalysis(name=model_name, root=model_root, providers=providers) + face_app.prepare( + ctx_id=0, + det_size=settings.face_recognition.detection_size, + det_thresh=settings.character.face_detection_threshold, + ) + return face_app + except Exception as e: + console.print('[red]Failed to initialize face detection on GPU[/red]') + console.print(f'[red] Error: {e}[/red]') + console.print('[red] Ensure CUDA and onnxruntime-gpu are properly configured[/red]') + raise RuntimeError('GPU required but face detection initialization failed') from e + + @staticmethod + def __verify_active_providers(face_app: FaceAnalysis) -> None: + actual_providers = face_app.models['detection'].session.get_providers() + if 'CUDAExecutionProvider' not in actual_providers: + console.print('[red]CUDA provider not active after initialization[/red]') + console.print(f"[red] Active providers: {', '.join(actual_providers)}[/red]") + raise RuntimeError('CUDA required but not available for face detection') + + @staticmethod + def __print_init_success(model_root: str) -> None: + model_name = settings.face_recognition.model_name + det_size = settings.face_recognition.detection_size + det_thresh = settings.character.face_detection_threshold + + console.print(f'[green]Face detection initialized ({model_name})[/green]') console.print('[dim] Device: GPU (CUDA)[/dim]') - console.print(f'[dim] Detection size: {settings.face_recognition.detection_size}[/dim]') - console.print(f'[dim] Face detection threshold: {settings.character.face_detection_threshold}[/dim]') + console.print(f'[dim] Detection size: {det_size}[/dim]') + console.print(f'[dim] Face detection threshold: {det_thresh}[/dim]') console.print(f'[dim] Model cache: {model_root}[/dim]') - return face_app @staticmethod - def load_character_references(characters_dir: Path, face_app: FaceAnalysis) -> Dict[str, np.ndarray]: - console.print('[blue]Loading character references...[/blue]') - character_vectors = 
{} - for char_dir in characters_dir.iterdir(): - if not char_dir.is_dir(): - continue - char_name = char_dir.name.replace('_', ' ').title() - vector_file = char_dir / 'face_vector.npy' - if vector_file.exists(): - character_vectors[char_name] = np.load(vector_file) - console.print(f'[dim]{char_name}: loaded from face_vector.npy[/dim]') - continue - images = list(char_dir.glob('*.jpg')) - if not images: - continue - embeddings = [] - for img_path in images: - emb = FaceDetector.__get_face_embedding(str(img_path), face_app) - if emb is not None: - embeddings.append(emb) - if embeddings: - mean_emb = np.mean(embeddings, axis=0) - centroid = mean_emb / norm(mean_emb) - character_vectors[char_name] = centroid - console.print(f'[green]{char_name}: {len(embeddings)} reference images[/green]') - console.print(f'[green]Loaded {len(character_vectors)} characters[/green]') - return character_vectors + def __load_or_compute_vector( + char_dir: Path, + char_name: str, + face_app: FaceAnalysis, + ) -> Optional[np.ndarray]: + vector_file = char_dir / 'face_vector.npy' + if vector_file.exists(): + console.print(f'[dim]{char_name}: loaded from face_vector.npy[/dim]') + return np.load(vector_file) + + images = list(char_dir.glob('*.jpg')) + if not images: + return None + + embeddings = [] + for img_path in images: + emb = FaceDetector.__get_face_embedding(str(img_path), face_app) + if emb is not None: + embeddings.append(emb) + + if embeddings: + mean_emb = np.mean(embeddings, axis=0) + centroid = mean_emb / norm(mean_emb) + console.print(f'[green]{char_name}: {len(embeddings)} reference images[/green]') + return centroid + + return None @staticmethod def __get_face_embedding(img_path: str, face_app: FaceAnalysis) -> Optional[np.ndarray]: img = cv2.imread(img_path) if img is None: return None + faces = face_app.get(img) if not faces: return None - faces.sort(key=lambda x: (x.bbox[2] - x.bbox[0]) * (x.bbox[3] - x.bbox[1]), reverse=True) + + faces.sort( + key=lambda x: (x.bbox[2] - 
x.bbox[0]) * (x.bbox[3] - x.bbox[1]), + reverse=True, + ) return faces[0].normed_embedding diff --git a/preprocessor/services/characters/image_search/duckduckgo_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_image_search.py index 5e9998457..c4e74f15e 100644 --- a/preprocessor/services/characters/image_search/duckduckgo_image_search.py +++ b/preprocessor/services/characters/image_search/duckduckgo_image_search.py @@ -3,18 +3,17 @@ List, ) -from ddgs import DDGS +from duckduckgo_search import DDGS from preprocessor.services.characters.image_search.image_search import BaseImageSearch class DuckDuckGoImageSearch(BaseImageSearch): - @property def name(self) -> str: return 'DuckDuckGo' def search(self, query: str) -> List[Dict[str, str]]: with DDGS() as ddgs: - results = ddgs.images(query, max_results=self.max_results) + results = ddgs.images(query, max_results=self._max_results) return list(results) diff --git a/preprocessor/services/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py index 369d5f3ec..5a9cfa013 100644 --- a/preprocessor/services/characters/image_search/google_image_search.py +++ b/preprocessor/services/characters/image_search/google_image_search.py @@ -1,4 +1,5 @@ from typing import ( + Any, Dict, List, ) @@ -9,22 +10,42 @@ class GoogleImageSearch(BaseImageSearch): - - def __init__(self, api_key: str, max_results: int=50) -> None: + def __init__(self, api_key: str, max_results: int = 50) -> None: super().__init__(max_results) + if not api_key: raise ValueError('SerpAPI key is required for Google Image Search') - self.api_key = api_key + + self.__api_key = api_key @property def name(self) -> str: return 'Google Images API' def search(self, query: str) -> List[Dict[str, str]]: - params = {'engine': 'google_images', 'q': query, 'hl': 'pl', 'gl': 'pl', 'api_key': self.api_key} - search = GoogleSearch(params) - results = search.get_dict() - images = [] - for 
img_result in results.get('images_results', [])[:self.max_results]: - images.append({'image': img_result.get('original'), 'thumbnail': img_result.get('thumbnail')}) + params = self.__build_search_params(query) + search_client = GoogleSearch(params) + raw_results = search_client.get_dict() + + return self.__extract_image_data(raw_results) + + def __build_search_params(self, query: str) -> Dict[str, str]: + return { + 'engine': 'google_images', + 'q': query, + 'hl': 'pl', + 'gl': 'pl', + 'api_key': self.__api_key, + } + + def __extract_image_data(self, raw_results: Dict[str, Any]) -> List[Dict[str, str]]: + images: List[Dict[str, str]] = [] + image_results = raw_results.get('images_results', [])[:self._max_results] + + for img_result in image_results: + images.append({ + 'image': img_result.get('original', ''), + 'thumbnail': img_result.get('thumbnail', ''), + }) + return images diff --git a/preprocessor/services/characters/image_search/image_search.py b/preprocessor/services/characters/image_search/image_search.py index d6bafe33a..00662c92e 100644 --- a/preprocessor/services/characters/image_search/image_search.py +++ b/preprocessor/services/characters/image_search/image_search.py @@ -9,9 +9,8 @@ class BaseImageSearch(ABC): - - def __init__(self, max_results: int=50) -> None: - self.max_results = max_results + def __init__(self, max_results: int = 50) -> None: + self._max_results = max_results @property @abstractmethod diff --git a/preprocessor/services/characters/models.py b/preprocessor/services/characters/models.py index 484aebebc..013b4ebcf 100644 --- a/preprocessor/services/characters/models.py +++ b/preprocessor/services/characters/models.py @@ -5,7 +5,7 @@ import numpy as np -@dataclass +@dataclass(frozen=True) class FaceData: bbox: np.ndarray face_img: np.ndarray @@ -13,7 +13,8 @@ class FaceData: source_image_idx: int source_image_path: Path -@dataclass + +@dataclass(frozen=True) class CandidateFace: avg_similarity: float faces: List[FaceData] diff --git 
a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index ca6da287b..e2a23126d 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -38,61 +38,75 @@ class CharacterReferenceDownloader(BaseProcessor): - def __init__(self, args: Dict[str, Any]) -> None: - super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=8, loglevel=logging.DEBUG) - self.characters_json: Path = self._args['characters_json'] - self.series_name: str = self._args['series_name'] - self.output_dir: Path = self._args.get('output_dir', settings.character.get_output_dir(self.series_name)) - self.images_per_character: int = self._args.get('images_per_character', settings.character.reference_images_per_character) - self.max_results: int = settings.image_scraper.max_results_to_scrape - self.min_width: int = settings.image_scraper.min_image_width - self.min_height: int = settings.image_scraper.min_image_height - self.use_gpu: bool = True - self.search_mode: str = self._args.get('search_mode', 'normal') - self.search_engine: BaseImageSearch = self.__create_search_engine() - self.face_app: Optional[FaceAnalysis] = None - self.playwright: Optional[Playwright] = None - self.browser_context: Optional[BrowserContext] = None + super().__init__( + args=args, + class_name=self.__class__.__name__, + error_exit_code=8, + loglevel=logging.DEBUG, + ) + self.__characters_json: Path = self._args['characters_json'] + self.__series_name: str = self._args['series_name'] + self.__output_dir: Path = self._args.get( + 'output_dir', settings.character.get_output_dir(self.__series_name), + ) + self.__images_per_character: int = self._args.get( + 'images_per_character', settings.character.reference_images_per_character, + ) - def cleanup(self) -> None: - if self.browser_context: - self.browser_context.close() - if self.playwright: - self.playwright.stop() + 
self.__max_results: int = settings.image_scraper.max_results_to_scrape + self.__min_width: int = settings.image_scraper.min_image_width + self.__min_height: int = settings.image_scraper.min_image_height + self.__search_mode: str = self._args.get('search_mode', 'normal') + + self.__search_engine: BaseImageSearch = self.__create_search_engine() + self.__face_app: Optional[FaceAnalysis] = None + self.__playwright: Optional[Playwright] = None + self.__browser_context: Optional[BrowserContext] = None def get_output_subdir(self) -> str: return 'character_references' + def cleanup(self) -> None: + if self.__browser_context: + self.__browser_context.close() + if self.__playwright: + self.__playwright.stop() + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'characters_json' not in args: + raise ValueError("Argument 'characters_json' is required.") + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: char_name = item.metadata['char_name'] - output_folder = self.output_dir / char_name.replace(' ', '_').lower() - expected_files = [ + output_folder = self.__output_dir / char_name.replace(' ', '_').lower() + + return [ OutputSpec(path=output_folder / f'{i:02d}.jpg', required=True) - for i in range(self.images_per_character) + for i in range(self.__images_per_character) ] - return expected_files def _get_processing_items(self) -> List[ProcessingItem]: - if not self.characters_json.exists(): - console.print(f'[red]Characters JSON not found: {self.characters_json}[/red]') + if not self.__characters_json.exists(): + console.print(f'[red]Characters JSON not found: {self.__characters_json}[/red]') return [] - with open(self.characters_json, encoding='utf-8') as f: + + with open(self.__characters_json, encoding='utf-8') as f: data = json.load(f) - characters = data.get('characters', []) + return [ ProcessingItem( episode_id=f"char_{char['name']}", - input_path=self.characters_json, + input_path=self.__characters_json, metadata={'char_name': 
char['name']}, ) - for char in characters + for char in data.get('characters', []) ] def _load_resources(self) -> bool: - self.face_app = FaceDetector.init() - self.playwright = sync_playwright().start() - self.browser_context = self.playwright.chromium.launch_persistent_context( + self.__face_app = FaceDetector.init() + self.__playwright = sync_playwright().start() + self.__browser_context = self.__playwright.chromium.launch_persistent_context( user_data_dir='/tmp/patchright_profile', headless=True, args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], @@ -103,160 +117,161 @@ def _load_resources(self) -> bool: def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: char_name = item.metadata['char_name'] output_folder = self.__prepare_output_folder(char_name) - existing_images = list(output_folder.glob('*.jpg')) - saved_count = len(existing_images) - if saved_count >= self.images_per_character: + + saved_count = len(list(output_folder.glob('*.jpg'))) + if saved_count >= self.__images_per_character: return - search_query = f'Serial {self.series_name} {char_name} postać' - self.logger.info(f'Searching [{self.search_engine.name}]: {search_query}') - for attempt in range(settings.image_scraper.retry_attempts): - try: - results = self.search_engine.search(search_query) - saved_count = self.__process_search_results(results, output_folder, saved_count) - break - except KeyboardInterrupt: # pylint: disable=try-except-raise - raise - except Exception as e: - if attempt < settings.image_scraper.retry_attempts - 1: - delay = settings.image_scraper.retry_delay * 2 ** attempt - self.logger.warning( - f'Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {e}', - ) - time.sleep(delay) - else: - self.logger.error(f'All retry attempts failed for {char_name}: {e}') - self.__log_results(char_name, saved_count) - delay = random.uniform( - settings.image_scraper.request_delay_min, - settings.image_scraper.request_delay_max, 
- ) - time.sleep(delay) - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'characters_json' not in args: - raise ValueError('characters_json is required') + search_query = f'Serial {self.__series_name} {char_name} postać' + self.logger.info(f'Searching [{self.__search_engine.name}]: {search_query}') + saved_count = self.__execute_search_with_retries(search_query, char_name, output_folder, saved_count) + self.__log_final_results(char_name, saved_count) + self.__apply_random_delay() - def __count_faces(self, img) -> int: - faces = self.face_app.get(img) - return len(faces) def __create_search_engine(self) -> BaseImageSearch: - if self.search_mode == 'premium': - serpapi_key = settings.image_scraper.serpapi_key - return GoogleImageSearch(api_key=serpapi_key, max_results=self.max_results) - return DuckDuckGoImageSearch(max_results=self.max_results) - - - def __download_image_with_browser( - self, img_url: str, page: Page, - ) -> np.ndarray | None: - try: - response = page.goto( - img_url, - timeout=settings.image_scraper.page_navigation_timeout, - wait_until='domcontentloaded', + if self.__search_mode == 'premium': + return GoogleImageSearch( + api_key=settings.image_scraper.serpapi_key, + max_results=self.__max_results, ) - if not response or response.status != 200: - return None - content_type = response.headers.get('content-type', '') - if 'image' not in content_type: - return None - img_bytes = response.body() - return self.__validate_and_decode_image(img_bytes, img_url, self.logger) - except TimeoutError: - self.logger.debug(f'Timeout downloading image {img_url}') - return None - except Exception as e: - if 'net::ERR_CONNECTION_CLOSED' in str(e) or 'Navigation' in str(e): - self.logger.debug( - f'Connection/navigation error for {img_url}: {e}', - ) - else: - self.logger.debug(f'Failed to download image {img_url}: {e}') - return None + return DuckDuckGoImageSearch(max_results=self.__max_results) def __prepare_output_folder(self, char_name: str) -> 
Path: - output_folder = self.output_dir / char_name.replace(' ', '_').lower() + output_folder = self.__output_dir / char_name.replace(' ', '_').lower() output_folder.mkdir(parents=True, exist_ok=True) return output_folder - def __log_results(self, char_name: str, saved_count: int) -> None: - if saved_count >= self.images_per_character: - self.logger.info( - f'{char_name}: {saved_count}/{self.images_per_character} images', - ) - elif saved_count > 0: - self.logger.warning( - f'{char_name}: {saved_count}/{self.images_per_character} images (incomplete)', - ) + def __execute_search_with_retries( + self, query: str, char_name: str, output_folder: Path, saved_count: int, + ) -> int: + for attempt in range(settings.image_scraper.retry_attempts): + try: + results = self.__search_engine.search(query) + return self.__download_and_process_images(results, output_folder, saved_count) + except Exception as e: + if isinstance(e, KeyboardInterrupt): + raise + self.__handle_retry_logic(e, attempt, char_name) + return saved_count + + def __handle_retry_logic(self, error: Exception, attempt: int, char_name: str) -> None: + if attempt < settings.image_scraper.retry_attempts - 1: + delay = settings.image_scraper.retry_delay * (2 ** attempt) + self.logger.warning(f'Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {error}') + time.sleep(delay) else: - self.logger.error(f'{char_name}: No suitable images found') + self.logger.error(f'All retry attempts failed for {char_name}: {error}') - def __process_search_results( - self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int, + def __download_and_process_images( + self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int, ) -> int: - sorted_results = sorted( - results, - key=lambda x: ( - 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, - 1 if x.get('image', '').lower().endswith('.png') else 2, - ), - ) - page = self.browser_context.new_page() + sorted_results = 
self.__sort_results_by_extension(results) + + page = self.__browser_context.new_page() try: for res in sorted_results: - if saved_count >= self.images_per_character: + if saved_count >= self.__images_per_character: break - img_url = res['image'] + + img_url = res.get('image', '') try: - img = self.__download_image_with_browser(img_url, page) - if img is None: - continue - if self.__validate_and_save_image( - img, img_url, output_folder, saved_count, - ): + img = self.__download_image_via_browser(img_url, page) + if img is not None and self.__validate_and_save_image(img, img_url, output_folder, saved_count): saved_count += 1 except Exception as e: - self.logger.debug(f'Error processing image: {e}') - continue + self.logger.debug(f'Error processing image {img_url}: {e}') finally: page.close() + return saved_count - @staticmethod - def __validate_and_decode_image( - img_bytes: bytes, img_url: str, logger, - ) -> np.ndarray | None: + def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np.ndarray]: + try: + response = page.goto( + img_url, + timeout=settings.image_scraper.page_navigation_timeout, + wait_until='domcontentloaded', + ) + + if not response or response.status != 200: + return None + + if 'image' not in response.headers.get('content-type', ''): + return None + + return self.__decode_image_bytes(response.body(), img_url) + + except TimeoutError: + self.logger.debug(f'Timeout downloading image {img_url}') + except Exception as e: + msg = str(e) + if 'net::ERR_CONNECTION_CLOSED' in msg or 'Navigation' in msg: + self.logger.debug(f'Connection/navigation error for {img_url}: {msg}') + else: + self.logger.debug(f'Failed to download image {img_url}: {msg}') + return None + + def __decode_image_bytes(self, img_bytes: bytes, img_url: str) -> Optional[np.ndarray]: if not img_bytes: return None + img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) + if img is None or img.size == 0: - 
logger.debug(f'Failed to decode image from {img_url}') + self.logger.debug(f'Failed to decode image from {img_url}') return None + if len(img.shape) != 3 or img.shape[2] != 3: - logger.debug(f'Image has unexpected shape {img.shape} from {img_url}') + self.logger.debug(f'Image has unexpected shape {img.shape} from {img_url}') return None + return img def __validate_and_save_image( - self, img: np.ndarray, img_url: str, output_folder: Path, saved_count: int, + self, img: np.ndarray, img_url: str, output_folder: Path, saved_count: int, ) -> bool: - if not isinstance(img, np.ndarray) or img.size == 0: - self.logger.debug(f'Invalid image array from {img_url}') - return False h, w = img.shape[:2] - if w < self.min_width or h < self.min_height: + if w < self.__min_width or h < self.__min_height: return False + try: - face_count = self.__count_faces(img) + face_count = len(self.__face_app.get(img)) except Exception as face_err: self.logger.debug(f'Face detection failed for {img_url}: {face_err}') return False + if face_count != 1: return False + filename = f'{saved_count:02d}.jpg' - path = output_folder / filename - cv2.imwrite(str(path), img) + cv2.imwrite(str(output_folder / filename), img) return True + + @staticmethod + def __sort_results_by_extension(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + return sorted( + results, + key=lambda x: ( + 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, + 1 if x.get('image', '').lower().endswith('.png') else 2, + ), + ) + + def __log_final_results(self, char_name: str, saved_count: int) -> None: + if saved_count >= self.__images_per_character: + self.logger.info(f'{char_name}: {saved_count}/{self.__images_per_character} images') + elif saved_count > 0: + self.logger.warning(f'{char_name}: {saved_count}/{self.__images_per_character} images (incomplete)') + else: + self.logger.error(f'{char_name}: No suitable images found') + + @staticmethod + def __apply_random_delay() -> None: + delay = random.uniform( 
+ settings.image_scraper.request_delay_min, + settings.image_scraper.request_delay_max, + ) + time.sleep(delay) diff --git a/preprocessor/services/core/base_processor.py b/preprocessor/services/core/base_processor.py index 60ca635fc..67eabdf49 100644 --- a/preprocessor/services/core/base_processor.py +++ b/preprocessor/services/core/base_processor.py @@ -30,6 +30,7 @@ class ProcessingItem: input_path: Path metadata: Dict[str, Any] + @dataclass class OutputSpec: path: Path @@ -38,7 +39,6 @@ class OutputSpec: @dataclass class _FilterResult: - """Result of filtering processing items.""" items_to_process: List[ProcessingItem] skipped_count: int skip_messages: List[str] @@ -52,10 +52,18 @@ class BaseProcessor(ABC): REQUIRES: List[str] = [] SUPPORTED_VIDEO_EXTENSIONS = SUPPORTED_VIDEO_EXTENSIONS - def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, loglevel: int = 10) -> None: + def __init__( + self, + args: Dict[str, Any], + class_name: str, + error_exit_code: int, + loglevel: int = 10, + ) -> None: self._validate_args(args) self._args = args - self.logger = ErrorHandlingLogger(class_name=class_name, loglevel=loglevel, error_exit_code=error_exit_code) + self.logger = ErrorHandlingLogger( + class_name=class_name, loglevel=loglevel, error_exit_code=error_exit_code, + ) self.state_manager: Optional[StateManager] = args.get('state_manager') self.series_name: str = args.get('series_name', 'unknown') self.path_manager: PathService = args.get('path_manager', PathService(self.series_name)) @@ -64,13 +72,6 @@ def __init__(self, args: Dict[str, Any], class_name: str, error_exit_code: int, def cleanup(self) -> None: pass - def _finalize(self) -> None: - pass - - @abstractmethod - def get_output_subdir(self) -> str: - pass - def work(self) -> int: try: self._execute() @@ -81,11 +82,33 @@ def work(self) -> int: return 130 except Exception as e: self.logger.error(f'{self.__class__.__name__} failed: {e}') + self.cleanup() return self.logger.finalize() + 
@abstractmethod + def get_output_subdir(self) -> str: + pass + + @abstractmethod + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + pass + + @abstractmethod + def _get_processing_items(self) -> List[ProcessingItem]: + pass + + @abstractmethod + def _process_item( + self, item: ProcessingItem, missing_outputs: List[OutputSpec], + ) -> None: + pass + + @abstractmethod + def _validate_args(self, args: Dict[str, Any]) -> None: + pass + def _execute(self) -> None: - """Main execution flow - orchestration only.""" all_items = self._get_processing_items() if not all_items: console.print('[yellow]No items to process[/yellow]') @@ -104,13 +127,16 @@ def _execute(self) -> None: self.__execute_processing(filter_result.items_to_process) self._finalize() - def __filter_skipped_items(self, all_items: List[ProcessingItem]) -> _FilterResult: - """ - Filters out items that should be skipped (cached). + def _finalize(self) -> None: + pass - Returns: - FilterResult with items to process and skip information - """ + def _get_progress_description(self) -> str: + return f'Processing {self.__class__.__name__}' + + def _load_resources(self) -> bool: + return True + + def __filter_skipped_items(self, all_items: List[ProcessingItem]) -> _FilterResult: items_to_process: List[ProcessingItem] = [] skipped_count = 0 skip_messages: List[str] = [] @@ -133,96 +159,102 @@ def __filter_skipped_items(self, all_items: List[ProcessingItem]) -> _FilterResu total_items=len(all_items), ) - @staticmethod - def __display_processing_summary(result: _FilterResult) -> None: - """Displays summary of what will be processed and what was skipped.""" - for skip_message in result.skip_messages: - console.print(skip_message) - - console.print( - f'[blue]Processing {len(result.items_to_process)} items ' - f'(of {result.total_items} total, {result.skipped_count} skipped)[/blue]', - ) - - @abstractmethod - def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - pass + def 
__should_skip_item( + self, item: ProcessingItem, + ) -> Tuple[bool, List[OutputSpec], str]: + expected_outputs = self._get_expected_outputs(item) + if not expected_outputs: + return False, [], '' - @abstractmethod - def _get_processing_items(self) -> List[ProcessingItem]: - pass + missing_outputs = self.__get_missing_outputs(expected_outputs) + step_name = self.__get_step_name() + state_completed = self.__is_step_completed_in_state(step_name, item.episode_id) + has_all_outputs = len(missing_outputs) == 0 - def _get_progress_description(self) -> str: - return f'Processing {self.__class__.__name__}' + if has_all_outputs and state_completed: + return True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]' - def _load_resources(self) -> bool: - return True + if has_all_outputs and not state_completed: + self.__sync_state_completed(step_name, item.episode_id) + return True, [], f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]' - @abstractmethod - def _process_item( - self, item: ProcessingItem, missing_outputs: List[OutputSpec], - ) -> None: - pass + if not has_all_outputs and state_completed: + console.print( + f'[yellow]Warning: State marked complete but outputs missing ' + f'for {item.episode_id}[/yellow]', + ) - @abstractmethod - def _validate_args(self, args: Dict[str, Any]) -> None: - pass + return False, missing_outputs, '' def __execute_processing(self, items: List[ProcessingItem]) -> None: if not items: console.print('[yellow]No items to process, skipping resource loading[/yellow]') return + if not self._load_resources(): return + step_name = self.__get_step_name() try: with SimpleProgress() as progress: task = progress.add_task(self._get_progress_description(), total=len(items)) for item in items: - try: - if self.state_manager: - self.state_manager.mark_step_started(step_name, item.episode_id, []) - missing_outputs = item.metadata.get('missing_outputs', []) - self._process_item(item, missing_outputs) - if 
self.state_manager: - self.state_manager.mark_step_completed(step_name, item.episode_id) - except Exception as e: - self.logger.error(f'Failed to process {item.episode_id}: {e}') - finally: - progress.advance(task) + self.__process_single_item(item, step_name, progress, task) except KeyboardInterrupt: console.print('\n[yellow]Processing interrupted[/yellow]') raise + def __process_single_item( + self, + item: ProcessingItem, + step_name: str, + progress: SimpleProgress, + task: int, + ) -> None: + try: + if self.state_manager: + self.state_manager.mark_step_started(step_name, item.episode_id, []) + + missing_outputs = item.metadata.get('missing_outputs', []) + self._process_item(item, missing_outputs) + + if self.state_manager: + self.state_manager.mark_step_completed(step_name, item.episode_id) + except Exception as e: + self.logger.error(f'Failed to process {item.episode_id}: {e}') + finally: + progress.advance(task) + + def __is_step_completed_in_state(self, step_name: str, episode_id: str) -> bool: + if not self.state_manager: + return False + return self.state_manager.is_step_completed(step_name, episode_id) + + def __sync_state_completed(self, step_name: str, episode_id: str) -> None: + if self.state_manager: + self.state_manager.mark_step_completed(step_name, episode_id) + def __get_step_name(self) -> str: class_name = self.__class__.__name__ - suffixes_to_remove = ['Processor', 'Generator', 'Detector', 'Transcoder', 'Importer', 'Indexer'] + suffixes_to_remove = [ + 'Processor', 'Generator', 'Detector', 'Transcoder', 'Importer', 'Indexer', + ] + name = class_name for suffix in suffixes_to_remove: name = name.replace(suffix, '') + return self.__to_snake_case(name) - def __should_skip_item( - self, item: ProcessingItem, - ) -> Tuple[bool, List[OutputSpec], str]: - expected_outputs = self._get_expected_outputs(item) - if not expected_outputs: - return False, [], '' - missing_outputs = self.__get_missing_outputs(expected_outputs) - step_name = 
self.__get_step_name() - state_completed = self.__is_step_completed_in_state(step_name, item.episode_id) - has_all_outputs = len(missing_outputs) == 0 - if has_all_outputs and state_completed: - return True, [], f'[yellow]Skipping (completed): {item.episode_id}[/yellow]' - if has_all_outputs and not state_completed: - self.__sync_state_completed(step_name, item.episode_id) - return True, [], f'[yellow]Skipping (files exist, state synced): {item.episode_id}[/yellow]' - if not has_all_outputs and state_completed: - console.print( - f'[yellow]Warning: State marked complete but outputs missing ' - f'for {item.episode_id}[/yellow]', - ) - return False, missing_outputs, '' + @staticmethod + def __display_processing_summary(result: _FilterResult) -> None: + for skip_message in result.skip_messages: + console.print(skip_message) + + console.print( + f'[blue]Processing {len(result.items_to_process)} items ' + f'(of {result.total_items} total, {result.skipped_count} skipped)[/blue]', + ) @staticmethod def __get_missing_outputs(expected_outputs: List[OutputSpec]) -> List[OutputSpec]: @@ -231,17 +263,7 @@ def __get_missing_outputs(expected_outputs: List[OutputSpec]) -> List[OutputSpec if not output.path.exists() or output.path.stat().st_size == 0 ] - def __is_step_completed_in_state(self, step_name: str, episode_id: str) -> bool: - return bool( - self.state_manager - and self.state_manager.is_step_completed(step_name, episode_id), - ) - - def __sync_state_completed(self, step_name: str, episode_id: str) -> None: - if self.state_manager: - self.state_manager.mark_step_completed(step_name, episode_id) - @staticmethod def __to_snake_case(name: str) -> str: - name = re.sub('(.)([A-Z][a-z]+)', '\\1_\\2', name) - return re.sub('([a-z0-9])([A-Z])', '\\1_\\2', name).lower() + name = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', name).lower() diff --git a/preprocessor/services/core/environment.py b/preprocessor/services/core/environment.py 
index c101a83f7..cf6c6af16 100644 --- a/preprocessor/services/core/environment.py +++ b/preprocessor/services/core/environment.py @@ -3,17 +3,16 @@ class Environment: - - _is_docker_cached: Optional[bool] = None + __is_docker_cached: Optional[bool] = None @staticmethod def is_docker() -> bool: - if Environment._is_docker_cached is None: - Environment._is_docker_cached = ( + if Environment.__is_docker_cached is None: + Environment.__is_docker_cached = ( os.getenv('DOCKER_CONTAINER', 'false').lower() == 'true' ) - return Environment._is_docker_cached + return Environment.__is_docker_cached @staticmethod def reset_cache() -> None: - Environment._is_docker_cached = None + Environment.__is_docker_cached = None diff --git a/preprocessor/services/core/logging.py b/preprocessor/services/core/logging.py index 54468c6db..9eedd0bb1 100644 --- a/preprocessor/services/core/logging.py +++ b/preprocessor/services/core/logging.py @@ -8,10 +8,10 @@ class LoggerNotFinalizedException(Exception): - def __init__(self) -> None: super().__init__('Logger destroyed without finalize() being called.') + class ErrorHandlingLogger: DEBUG = 10 INFO = 20 @@ -20,30 +20,39 @@ class ErrorHandlingLogger: CRITICAL = 50 def __init__(self, class_name: str, loglevel: int, error_exit_code: int) -> None: - self.__class_name: str = class_name - self.__error_exit_code: int = error_exit_code + self.__class_name = class_name + self.__error_exit_code = error_exit_code self.__errors: List[str] = [] - self.__is_finalized: bool = False - self.__setup_logger(loglevel) + self.__is_finalized = False + self.__logger: logging.Logger = self.__setup_logger(loglevel) def __del__(self) -> None: if not self.__is_finalized: - self.__logger.error(f"ErrorHandlingLogger for '{self.__class_name}' destroyed without finalize().") + self.__logger.error( + f"ErrorHandlingLogger for '{self.__class_name}' destroyed without finalize().", + ) if self.__errors: self.__logger.error('Logged errors:') for error in self.__errors: 
self.__logger.error(f'- {error}') - raise LoggerNotFinalizedException + raise LoggerNotFinalizedException() def debug(self, message: str) -> None: self.__logger.debug(message) + def info(self, message: str) -> None: + self.__logger.info(message) + + def warning(self, message: str) -> None: + self.__logger.warning(message) + def error(self, message: str) -> None: self.__logger.error(message) self.__errors.append(message) def finalize(self) -> int: self.__is_finalized = True + if self.__errors: console.print( Panel( @@ -54,6 +63,7 @@ def finalize(self) -> int: ), ) return self.__error_exit_code + console.print( Panel( f"[bold green]Processing for '{self.__class_name}' " @@ -64,13 +74,7 @@ def finalize(self) -> int: ) return 0 - def info(self, message: str) -> None: - self.__logger.info(message) - - def warning(self, message: str) -> None: - self.__logger.warning(message) - - def __setup_logger(self, level: int) -> None: + def __setup_logger(self, level: int) -> logging.Logger: logging.basicConfig( level=level, format='%(message)s', @@ -84,4 +88,4 @@ def __setup_logger(self, level: int) -> None: ], force=True, ) - self.__logger: logging.Logger = logging.getLogger(self.__class_name) + return logging.getLogger(self.__class_name) diff --git a/preprocessor/services/core/time.py b/preprocessor/services/core/time.py index 1d00d1713..8abda2cd9 100644 --- a/preprocessor/services/core/time.py +++ b/preprocessor/services/core/time.py @@ -1,9 +1,8 @@ class TimeFormatter: - @staticmethod def format_hms(seconds: float) -> str: hours = int(seconds // 3600) - minutes = int(seconds % 3600 // 60) + minutes = int((seconds % 3600) // 60) secs = int(seconds % 60) return f'{hours}:{minutes:02d}:{secs:02d}' @@ -11,10 +10,13 @@ def format_hms(seconds: float) -> str: def format_human(seconds: float) -> str: if seconds < 60: return f'{seconds:.1f}s' + minutes = int(seconds // 60) secs = int(seconds % 60) + if minutes < 60: return f'{minutes}m {secs}s' + hours = minutes // 60 minutes = minutes 
% 60 return f'{hours}h {minutes}m {secs}s' diff --git a/preprocessor/services/episodes/episode_manager.py b/preprocessor/services/episodes/episode_manager.py index d1330ac0e..6e3f58a32 100644 --- a/preprocessor/services/episodes/episode_manager.py +++ b/preprocessor/services/episodes/episode_manager.py @@ -35,42 +35,56 @@ def episode_num(self) -> str: def season_code(self) -> str: return f'S{self.season:02d}' - def __is_special(self) -> bool: # pylint: disable=unused-private-member + def is_special(self) -> bool: return self.season == 0 + class EpisodeManager: + def __init__( + self, + episodes_info_json: Optional[Path], + series_name: str, + logger: Optional[ErrorHandlingLogger] = None, + ) -> None: + self.__series_name = series_name.lower() + self.__episodes_data: Optional[Dict[str, Any]] = None + self.__path_manager = PathService(self.__series_name) + self.__logger = logger + + self.__load_episodes_data(episodes_info_json) - def __init__(self, episodes_info_json: Optional[Path], series_name: str, logger: Optional[ErrorHandlingLogger]=None) -> None: - self.series_name = series_name.lower() - self.episodes_data: Optional[Dict[str, Any]] = None - self.path_manager = PathService(self.series_name) - self._logger: Optional[ErrorHandlingLogger] = logger - if episodes_info_json and episodes_info_json.exists(): - with open(episodes_info_json, 'r', encoding='utf-8') as f: - self.episodes_data = json.load(f) + @property + def path_manager(self) -> PathService: + return self.__path_manager def get_episode_by_season_and_relative(self, season: int, relative_episode: int) -> EpisodeInfo: - if not self.episodes_data: - return self.__create_episode_info(season, relative_episode) - for season_data in self.episodes_data.get(EpisodesDataKeys.SEASONS, []): + if not self.__episodes_data: + return self.__create_fallback_episode_info(season, relative_episode) + + season_list = self.__episodes_data.get(EpisodesDataKeys.SEASONS, []) + for season_data in season_list: if 
season_data.get(EpisodesDataKeys.SEASON_NUMBER) == season: - episodes = sorted(season_data.get(EpisodesDataKeys.EPISODES, []), key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0)) - if 0 < relative_episode <= len(episodes): - ep_data = episodes[relative_episode - 1] - return self.__create_episode_info( - season=season, - relative_episode=relative_episode, - title=ep_data.get(EpisodeMetadataKeys.TITLE), - premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), - viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), - ) - if self._logger: - self._logger.warning( - f'Season {season} not found in episodes_info_json! ' - f'Processing S{season:02d}E{relative_episode:02d} with filename-only metadata. ' - f'Scrape episode info for season {season} to get title, premiere date, etc.', + return self.__extract_episode_from_season(season_data, season, relative_episode) + + self.__log_missing_season_warning(season, relative_episode) + return self.__create_fallback_episode_info(season, relative_episode) + + def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: + full_path_str = str(file_path) + match_season_episode = re.search(r'S(\d+)[/\\]?E(\d+)', full_path_str, re.IGNORECASE) + + if match_season_episode: + season = int(match_season_episode.group(1)) + episode = int(match_season_episode.group(2)) + return self.get_episode_by_season_and_relative(season, episode) + + if self.__logger: + self.__logger.error( + f'Cannot parse episode from filename: {file_path.name}. ' + 'Expected format: S##E## (e.g., S01E05, S10E13). 
' + 'Absolute episode numbers (E## without season) are not supported.', ) - return self.__create_episode_info(season, relative_episode) + return None @staticmethod def get_episode_id_for_state(episode_info: EpisodeInfo) -> str: @@ -86,35 +100,64 @@ def get_metadata(episode_info: EpisodeInfo) -> Dict[str, Any]: 'viewership': episode_info.viewership, } - def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: - full_path_str = str(file_path) - match_season_episode = re.search('S(\\d+)[/\\\\]?E(\\d+)', full_path_str, re.IGNORECASE) - if match_season_episode: - season = int(match_season_episode.group(1)) - episode = int(match_season_episode.group(2)) - return self.get_episode_by_season_and_relative(season, episode) - if self._logger: - self._logger.error( - f'Cannot parse episode from filename: {file_path.name}. ' - 'Expected format: S##E## (e.g., S01E05, S10E13). ' - 'Absolute episode numbers (E## without season) are not supported.', + def __load_episodes_data(self, json_path: Optional[Path]) -> None: + if json_path and json_path.exists(): + try: + with open(json_path, 'r', encoding='utf-8') as f: + self.__episodes_data = json.load(f) + except Exception as e: + if self.__logger: + self.__logger.error(f'Failed to load episodes data from {json_path}: {e}') + + def __extract_episode_from_season( + self, season_data: Dict[str, Any], season: int, relative_episode: int, + ) -> EpisodeInfo: + episodes = sorted( + season_data.get(EpisodesDataKeys.EPISODES, []), + key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0), + ) + + if 0 < relative_episode <= len(episodes): + ep_data = episodes[relative_episode - 1] + return self.__create_episode_info( + season=season, + relative_episode=relative_episode, + title=ep_data.get(EpisodeMetadataKeys.TITLE), + premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), + viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), ) - return None + + return self.__create_fallback_episode_info(season, relative_episode) + + 
def __log_missing_season_warning(self, season: int, relative_episode: int) -> None: + if self.__logger: + self.__logger.warning( + f'Season {season} not found in episodes_info_json! ' + f'Processing S{season:02d}E{relative_episode:02d} with filename-only metadata. ' + f'Scrape episode info for season {season} to get title, premiere date, etc.', + ) + + def __create_fallback_episode_info(self, season: int, relative_episode: int) -> EpisodeInfo: + return self.__create_episode_info( + season=season, + relative_episode=relative_episode, + title=f'S{season:02d}E{relative_episode:02d}', + ) def __create_episode_info( - self, - season: int, - relative_episode: int, - title: Optional[str]=None, - premiere_date: Optional[str]=None, - viewership: Optional[str]=None, + self, + season: int, + relative_episode: int, + title: Optional[str] = None, + premiere_date: Optional[str] = None, + viewership: Optional[str] = None, ) -> EpisodeInfo: return EpisodeInfo( absolute_episode=0, season=season, relative_episode=relative_episode, title=title or f'S{season:02d}E{relative_episode:02d}', - series_name=self.series_name, + series_name=self.__series_name, premiere_date=premiere_date, viewership=viewership, ) diff --git a/preprocessor/services/io/files.py b/preprocessor/services/io/files.py index 851480f8f..906667311 100644 --- a/preprocessor/services/io/files.py +++ b/preprocessor/services/io/files.py @@ -8,14 +8,13 @@ class FileOperations: - @staticmethod - def atomic_write_json(path: Path, data: Dict[str, Any], indent: int=2) -> None: - - def __write(temp: Path) -> None: - with open(temp, 'w', encoding='utf-8') as f: + def atomic_write_json(path: Path, data: Dict[str, Any], indent: int = 2) -> None: + def __write_temp(temp_path: Path) -> None: + with open(temp_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=indent) - FileOperations.__atomic_write(path, __write) + + FileOperations.__execute_atomic_write(path, __write_temp) @staticmethod def 
load_json(path: Path) -> Dict[str, Any]: @@ -23,8 +22,8 @@ def load_json(path: Path) -> Dict[str, Any]: return json.load(f) @staticmethod - def __atomic_write(path: Path, write_func: Callable[[Any], None]) -> None: - temp_path = path.with_suffix(path.suffix + '.tmp') + def __execute_atomic_write(path: Path, write_func: Callable[[Path], None]) -> None: + temp_path = path.with_suffix(f'{path.suffix}.tmp') try: write_func(temp_path) temp_path.replace(path) diff --git a/preprocessor/services/io/metadata.py b/preprocessor/services/io/metadata.py index fe3c074c0..c54528c8f 100644 --- a/preprocessor/services/io/metadata.py +++ b/preprocessor/services/io/metadata.py @@ -10,7 +10,6 @@ class MetadataBuilder: - @staticmethod def create_embedding_collection( episode_id: str, @@ -31,7 +30,7 @@ def create_embedding_collection( @staticmethod def create_processing_metadata( - episode_info, + episode_info: Any, processing_params: Dict[str, Any], statistics: Dict[str, Any], results_key: str, @@ -46,5 +45,8 @@ def create_processing_metadata( } @staticmethod - def __create_minimal_episode_info(episode_info) -> Dict[str, Any]: - return {'season': episode_info.season, 'episode_number': episode_info.relative_episode} + def __create_minimal_episode_info(episode_info: Any) -> Dict[str, Any]: + return { + 'season': episode_info.season, + 'episode_number': episode_info.relative_episode, + } diff --git a/preprocessor/services/io/path_service.py b/preprocessor/services/io/path_service.py index 1f6f808bd..1f0e03ef8 100644 --- a/preprocessor/services/io/path_service.py +++ b/preprocessor/services/io/path_service.py @@ -9,25 +9,31 @@ class PathService: - def __init__(self, series_name: str) -> None: - self._series_name: str = series_name.lower() + self.__series_name = series_name.lower() def build_filename( - self, episode_info: 'EpisodeInfo', extension: str = 'json', suffix: str = '', + self, + episode_info: 'EpisodeInfo', + extension: str = 'json', + suffix: str = '', ) -> str: - base: str = 
f'{self._series_name}_{episode_info.episode_code()}' - suffix_str: str = f'_{suffix}' if suffix else '' + base = f'{self.__series_name}_{episode_info.episode_code()}' + suffix_str = f'_{suffix}' if suffix else '' return f'{base}{suffix_str}.{extension}' def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: - base_output_dir: Path = get_base_output_dir(self._series_name) + base_output_dir = get_base_output_dir(self.__series_name) return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_num() @staticmethod def get_input_base() -> Path: - return Path('/input_data') if Environment.is_docker() else Path('preprocessor/input_data') + if Environment.is_docker(): + return Path('/input_data') + return Path('preprocessor/input_data') @staticmethod def get_output_base() -> Path: - return Path('/app/output_data') if Environment.is_docker() else Path('preprocessor/output_data') + if Environment.is_docker(): + return Path('/app/output_data') + return Path('preprocessor/output_data') diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index ea1d25176..ed3b61e25 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -27,9 +27,9 @@ class FFmpegWrapper: @staticmethod def detect_interlacing( - video_path: Path, - analysis_time: Optional[int] = 60, - threshold: float = 0.15, + video_path: Path, + analysis_time: Optional[int] = 60, + threshold: float = 0.15, ) -> Tuple[bool, Optional[Dict[str, Any]]]: cmd = ['ffmpeg'] @@ -77,9 +77,11 @@ def get_audio_bitrate(probe_data: Dict[str, Any]) -> Optional[int]: stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'audio') if not stream: return None + bit_rate = stream.get('bit_rate') if not bit_rate: return None + return int(int(bit_rate) / 1000) @staticmethod @@ -87,9 +89,11 @@ def get_framerate(probe_data: Dict[str, Any]) -> float: stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: 
raise ValueError('No video streams found') + r_frame_rate = stream.get('r_frame_rate') if not r_frame_rate: raise ValueError('Frame rate not found') + num, denom = [int(x) for x in r_frame_rate.split('/')] return num / denom @@ -98,9 +102,11 @@ def get_video_bitrate(probe_data: Dict[str, Any]) -> Optional[float]: stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: return None + bit_rate = stream.get('bit_rate') if not bit_rate: return None + return round(int(bit_rate) / 1000000, 2) @staticmethod @@ -108,10 +114,12 @@ def get_resolution(probe_data: Dict[str, Any]) -> Tuple[int, int]: stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: raise ValueError('No video streams found') + width = stream.get('width') height = stream.get('height') if not width or not height: raise ValueError('Resolution not found') + return int(width), int(height) @staticmethod @@ -119,9 +127,11 @@ def get_sample_aspect_ratio(probe_data: Dict[str, Any]) -> Tuple[int, int]: stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: return (1, 1) + sar = stream.get('sample_aspect_ratio', '1:1') if sar == '0:1' or not sar: return (1, 1) + try: num, denom = [int(x) for x in sar.split(':')] return (num, denom) @@ -137,17 +147,15 @@ def get_field_order(probe_data: Dict[str, Any]) -> str: @staticmethod def probe_video(video_path: Path) -> Dict[str, Any]: - cmd = ['ffprobe', '-v', 'error', '-show_streams', '-show_format', '-of', 'json', str(video_path)] + cmd = [ + 'ffprobe', '-v', 'error', '-show_streams', '-show_format', + '-of', 'json', str(video_path), + ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) return json.loads(result.stdout) @staticmethod def transcode(params: TranscodeParams) -> None: - """Transcode video with parameter object. - - Args: - params: Transcoding parameters. 
- """ width, height = params.get_resolution_tuple() vf_filter = FFmpegWrapper.__build_video_filter( width, height, params.deinterlace, params.is_upscaling, @@ -172,19 +180,23 @@ def transcode(params: TranscodeParams) -> None: ) if params.log_command: - print('ffmpeg \\') - for i, arg in enumerate(command[1:], 1): - if i == len(command) - 1: - print(f' {arg}') - else: - print(f' {arg} \\') - print() + FFmpegWrapper.__log_ffmpeg_command(command) subprocess.run(command, check=True, capture_output=False) + @staticmethod + def __log_ffmpeg_command(command: List[str]) -> None: + print('ffmpeg \\') + for i, arg in enumerate(command[1:], 1): + if i == len(command) - 1: + print(f' {arg}') + else: + print(f' {arg} \\') + print() + @staticmethod def __build_audio_and_output_params( - audio_bitrate: str, vf_filter: str, output_path: Path, + audio_bitrate: str, vf_filter: str, output_path: Path, ) -> List[str]: return [ '-c:a', 'aac', @@ -199,7 +211,7 @@ def __build_audio_and_output_params( @staticmethod def __build_base_command( - input_path: Path, codec: str, preset: str, target_fps: Optional[float], + input_path: Path, codec: str, preset: str, target_fps: Optional[float], ) -> List[str]: command = [ 'ffmpeg', '-v', 'error', '-stats', '-hide_banner', '-y', @@ -216,18 +228,20 @@ def __build_base_command( '-color_range', 'tv', '-video_track_timescale', '90000', ] + if target_fps: command.extend(['-r', str(target_fps)]) + return command @staticmethod def __build_encoding_params( - video_bitrate: str, - minrate: str, - maxrate: str, - bufsize: str, - gop_size: int, - is_upscaling: bool = False, + video_bitrate: str, + minrate: str, + maxrate: str, + bufsize: str, + gop_size: int, + is_upscaling: bool = False, ) -> List[str]: params = [ '-rc', 'vbr_hq', @@ -266,7 +280,7 @@ def __build_encoding_params( @staticmethod def __build_video_filter( - width: int, height: int, deinterlace: bool = False, is_upscaling: bool = False, + width: int, height: int, deinterlace: bool = False, 
is_upscaling: bool = False, ) -> str: filters = [] diff --git a/preprocessor/services/media/resolution.py b/preprocessor/services/media/resolution.py index cfa0ef219..2f92eb0f9 100644 --- a/preprocessor/services/media/resolution.py +++ b/preprocessor/services/media/resolution.py @@ -7,16 +7,17 @@ T = TypeVar('T', bound='Resolution') + class Resolution(Enum): - R1080P = (1920, 1080) - R1440P = (2560, 1440) R144P = (256, 144) - R2160P = (3840, 2160) R240P = (426, 240) R360P = (640, 360) - R4320P = (7680, 4320) R480P = (854, 480) R720P = (1280, 720) + R1080P = (1920, 1080) + R1440P = (2560, 1440) + R2160P = (3840, 2160) + R4320P = (7680, 4320) def __init__(self, width: int, height: int) -> None: self.width = width @@ -27,17 +28,11 @@ def __str__(self) -> str: @classmethod def from_string(cls: Type[T], init: str) -> T: - init = init.strip() - if not init[0].isalpha(): - init = 'R' + init.upper() - else: - init = init.upper() - return cls[init] - - @classmethod - def __from_str(cls: Type[T], init: str) -> T: # pylint: disable=unused-private-member - return cls.from_string(init) + clean_init = init.strip().upper() + if not clean_init[0].isalpha(): + clean_init = f'R{clean_init}' + return cls[clean_init] @classmethod - def __get_all_choices(cls) -> List[str]: # pylint: disable=unused-private-member + def get_all_choices(cls) -> List[str]: return [str(r) for r in cls] diff --git a/preprocessor/services/media/scene_detection.py b/preprocessor/services/media/scene_detection.py index bfd70e3ca..dc34753e2 100644 --- a/preprocessor/services/media/scene_detection.py +++ b/preprocessor/services/media/scene_detection.py @@ -14,72 +14,77 @@ class TransNetWrapper: - def __init__(self) -> None: - self.model: Optional[TransNetV2] = None + self.__model: Optional[TransNetV2] = None + + def load_model(self) -> None: + if not torch.cuda.is_available(): + raise RuntimeError('CUDA not available for TransNetV2.') + self.__model = TransNetV2().cuda() def cleanup(self) -> None: - if self.model 
is not None: - del self.model - self.model = None + if self.__model is not None: + del self.__model + self.__model = None + gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() def detect_scenes( - self, - video_path: Path, - threshold: float = 0.5, - min_scene_len: int = 10, + self, + video_path: Path, + threshold: float = 0.5, + min_scene_len: int = 10, ) -> List[Dict[str, Any]]: - if self.model is None: + if self.__model is None: raise RuntimeError('Model not loaded. Call load_model() first.') + video_info = self.__get_video_info(video_path) if not video_info: raise RuntimeError(f'Failed to get video info for {video_path}') + try: - _, single_frame_predictions, _ = self.model.predict_video(str(video_path)) + _, single_frame_predictions, _ = self.__model.predict_video(str(video_path)) scene_changes = np.where(single_frame_predictions > threshold)[0] + return self.__build_scenes_from_predictions( scene_changes, video_info, min_scene_len, ) - except (RuntimeError, ValueError, OSError) as e: + except Exception as e: raise RuntimeError(f'TransNetV2 detection failed: {e}') from e - def load_model(self) -> None: - if not torch.cuda.is_available(): - raise RuntimeError('CUDA not available') - self.model = TransNetV2().cuda() - def __build_scenes_from_predictions( - self, - scene_changes: np.ndarray, - video_info: Dict[str, Any], - min_scene_len: int, + self, + scene_changes: np.ndarray, + video_info: Dict[str, Any], + min_scene_len: int, ) -> List[Dict[str, Any]]: - scenes = [] + scenes: List[Dict[str, Any]] = [] fps = video_info['fps'] prev_frame = 0 + for frame_num in scene_changes: if frame_num - prev_frame < min_scene_len: continue - scene = self.__create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps) - scenes.append(scene) + + scenes.append(self.__create_scene_dict(len(scenes) + 1, prev_frame, frame_num, fps)) prev_frame = frame_num + total_frames = video_info['total_frames'] if total_frames - prev_frame > min_scene_len: - scene = 
self.__create_scene_dict(len(scenes) + 1, prev_frame, total_frames, fps) - scenes.append(scene) + scenes.append(self.__create_scene_dict(len(scenes) + 1, prev_frame, total_frames, fps)) + return scenes def __create_scene_dict( - self, - scene_number: int, - start_frame: int, - end_frame: int, - fps: float, + self, + scene_number: int, + start_frame: int, + end_frame: int, + fps: float, ) -> Dict[str, Any]: return { 'scene_number': scene_number, @@ -103,7 +108,7 @@ def __frame_to_timecode(frame: int, fps: float) -> str: hours = int(seconds // 3600) minutes = int(seconds % 3600 // 60) secs = int(seconds % 60) - frames = int(seconds % 1 * fps) + frames = int((seconds % 1) * fps) return f'{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}' @staticmethod @@ -113,6 +118,11 @@ def __get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: fps = vr.get_avg_fps() total_frames = len(vr) duration = total_frames / fps if fps > 0 else 0 - return {'fps': fps, 'duration': duration, 'total_frames': total_frames} - except (RuntimeError, ValueError, OSError): + + return { + 'fps': fps, + 'duration': duration, + 'total_frames': total_frames, + } + except Exception: return None diff --git a/preprocessor/services/media/transcode_params.py b/preprocessor/services/media/transcode_params.py index 32e5b082e..3c4d1b412 100644 --- a/preprocessor/services/media/transcode_params.py +++ b/preprocessor/services/media/transcode_params.py @@ -6,9 +6,8 @@ ) -@dataclass +@dataclass(frozen=True) class TranscodeParams: - input_path: Path output_path: Path codec: str @@ -30,4 +29,6 @@ def get_resolution_tuple(self) -> Tuple[int, int]: width, height = [int(x) for x in self.resolution.split(':')] return width, height except (ValueError, AttributeError) as e: - raise ValueError(f"Invalid resolution format: {self.resolution}") from e + raise ValueError( + f"Invalid resolution format: '{self.resolution}'. 
Expected format 'WIDTH:HEIGHT'.", + ) from e diff --git a/preprocessor/services/scraping/base_scraper.py b/preprocessor/services/scraping/base_scraper.py index 01c4a4fce..bd74aa88f 100644 --- a/preprocessor/services/scraping/base_scraper.py +++ b/preprocessor/services/scraping/base_scraper.py @@ -22,28 +22,39 @@ class BaseScraper(BaseProcessor): + def __init__(self, args: Dict[str, Any], error_exit_code: int = 7) -> None: + super().__init__( + args=args, + class_name=self.__class__.__name__, + error_exit_code=error_exit_code, + loglevel=logging.DEBUG, + ) + self.__urls: List[str] = self._args['urls'] + self.__output_file: Path = self._args['output_file'] + self.__headless: bool = self._args.get('headless', True) + self.__scraper_method = ScraperMethod(self._args.get('scraper_method', 'crawl4ai')) + self.__parser_mode = ParserMode(self._args.get('parser_mode', 'normal')) + self.__llm: Optional[LLMProvider] = None - def __init__(self, args: Dict[str, Any], error_exit_code: int=7) -> None: - super().__init__(args=args, class_name=self.__class__.__name__, error_exit_code=error_exit_code, loglevel=logging.DEBUG) - self.urls: List[str] = self._args['urls'] - self.output_file: Path = self._args['output_file'] - self.headless: bool = self._args.get('headless', True) - scraper_method_str = self._args.get('scraper_method', 'crawl4ai') - self.scraper_method = ScraperMethod(scraper_method_str) - parser_mode_str = self._args.get('parser_mode', 'normal') - self.parser_mode = ParserMode(parser_mode_str) - self.llm: Optional[LLMProvider] = None + @property + def output_file(self) -> Path: + return self.__output_file - def get_output_subdir(self) -> str: - return "" + @property + def llm(self) -> LLMProvider: + if self.__llm is None: + raise RuntimeError("LLMProvider not initialized. 
Call _execute first.") + return self.__llm def _execute(self) -> None: - self.llm = LLMProvider(parser_mode=self.parser_mode) - console.print(f'[blue]Scraping {len(self.urls)} URLs...[/blue]') + self.__llm = LLMProvider(parser_mode=self.__parser_mode) + console.print(f'[blue]Scraping {len(self.__urls)} URLs...[/blue]') + scraped_pages = self.__scrape_all_urls() if not scraped_pages: console.print('[yellow]No pages scraped[/yellow]') return + console.print(f'[blue]Scraped {len(scraped_pages)} pages, processing with LLM...[/blue]') try: self._process_scraped_pages(scraped_pages) @@ -55,40 +66,35 @@ def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: pass def _save_result(self, result: Dict[str, Any]) -> None: - self.output_file.parent.mkdir(parents=True, exist_ok=True) - with open(self.output_file, 'w', encoding='utf-8') as f: + self.__output_file.parent.mkdir(parents=True, exist_ok=True) + with open(self.__output_file, 'w', encoding='utf-8') as f: json.dump(result, f, indent=2, ensure_ascii=False) - def _validate_args(self, args: Dict[str, Any]) -> None: - if 'urls' not in args or not args['urls']: - raise ValueError('At least one URL is required') - if 'output_file' not in args: - raise ValueError('output_file is required') - def __scrape_all_urls(self) -> List[Dict[str, Any]]: - scraped_pages = [] - try: - for i, url in enumerate(self.urls, 1): - console.print(f'[cyan]Fetching page {i}/{len(self.urls)}[/cyan]') - try: - page_text = self.__scrape_url(url) - if page_text: - scraped_pages.append({'url': url, 'markdown': page_text}) - console.print(f'[green][/green] {url}: {len(page_text)} chars') - else: - self.logger.error(f'Failed to scrape {url}') - except Exception as e: - self.logger.error(f'Error scraping {url}: {e}') - except KeyboardInterrupt: - console.print('\n[yellow]Scraping interrupted[/yellow]') - raise - return scraped_pages + results = [] + for i, url in enumerate(self.__urls, 1): + console.print(f'[cyan]Fetching page 
{i}/{len(self.__urls)}[/cyan]') + try: + content = self.__run_scraper(url) + if content: + results.append({'url': url, 'markdown': content}) + console.print(f'[green]Success[/green] {url}: {len(content)} chars') + else: + self.logger.error(f'Failed to scrape {url}') + except Exception as e: + self.logger.error(f'Error scraping {url}: {e}') + return results + + def __run_scraper(self, url: str) -> Optional[str]: + if self.__scraper_method == ScraperMethod.CLIPBOARD: + return ScraperClipboard.scrape(url, headless=self.__headless, logger=self.logger) + + if self.__scraper_method == ScraperMethod.CRAWL4AI: + return ScraperCrawl4AI.scrape( + url, + save_markdown=True, + output_dir=settings.scraper.get_output_dir(self.series_name), + logger=self.logger, + ) - def __scrape_url(self, url: str) -> Optional[str]: - console.print(f'[cyan]Scraping method: {self.scraper_method.value}[/cyan]') - if self.scraper_method == ScraperMethod.CLIPBOARD: - return ScraperClipboard.scrape(url, headless=self.headless, logger=self.logger) - if self.scraper_method == ScraperMethod.CRAWL4AI: - return ScraperCrawl4AI.scrape(url, save_markdown=True, output_dir=settings.scraper.get_output_dir(self.series_name), logger=self.logger) - self.logger.error(f'Unknown scraper method: {self.scraper_method}') return None diff --git a/preprocessor/services/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py index 52fe473bd..679e1042d 100644 --- a/preprocessor/services/scraping/base_scraper_step.py +++ b/preprocessor/services/scraping/base_scraper_step.py @@ -21,30 +21,26 @@ class BaseScraperStep(PipelineStep[SourceVideo, SourceVideo, ConfigT], ABC): + @property + def is_global(self) -> bool: + return True - def execute( - self, input_data: SourceVideo, context: ExecutionContext, - ) -> Optional[SourceVideo]: - output_path = Path(self.config.output_file) # type: ignore[attr-defined] + def execute(self, input_data: SourceVideo, context: ExecutionContext) -> 
Optional[SourceVideo]: + output_path = Path(self.config.output_file) if output_path.exists() and not context.force_rerun: - context.logger.info(f"{self._get_metadata_type_name()} metadata already exists: {output_path}") + context.logger.info(f"{self._get_metadata_type_name()} metadata already exists.") return input_data - urls = self.config.urls # type: ignore[attr-defined] - context.logger.info(f"Scraping {self._get_metadata_type_name().lower()} from {len(urls)} URLs") - - scraper_class = self._get_scraper_class() - scraper_args = self._build_scraper_args(output_path, context) - scraper = scraper_class(scraper_args) + context.logger.info(f"Scraping {self._get_metadata_type_name().lower()} from {len(self.config.urls)} URLs") + scraper = self._get_scraper_class()(self._build_scraper_args(output_path, context)) exit_code = scraper.work() if exit_code != 0: - raise RuntimeError(f"{self._get_metadata_type_name()} scraper failed with exit code {exit_code}") + raise RuntimeError(f"{self._get_metadata_type_name()} scraper failed with code {exit_code}") context.logger.info(f"{self._get_metadata_type_name()} metadata saved to: {output_path}") - return input_data @abstractmethod @@ -55,17 +51,12 @@ def _get_scraper_class(self) -> Type: def _get_metadata_type_name(self) -> str: pass - @property - def is_global(self) -> bool: - return True - def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> Dict[str, Any]: - base_args: Dict[str, Any] = { - "urls": self.config.urls, # type: ignore[attr-defined] + return { + "urls": self.config.urls, "output_file": output_path, - "headless": self.config.headless, # type: ignore[attr-defined] - "scraper_method": self.config.scraper_method, # type: ignore[attr-defined] - "parser_mode": self.config.parser_mode, # type: ignore[attr-defined] + "headless": self.config.headless, + "scraper_method": self.config.scraper_method, + "parser_mode": self.config.parser_mode, "series_name": context.series_name, } - return base_args 
diff --git a/preprocessor/services/scraping/character_scraper.py b/preprocessor/services/scraping/character_scraper.py index 505085ed6..6d89ed38c 100644 --- a/preprocessor/services/scraping/character_scraper.py +++ b/preprocessor/services/scraping/character_scraper.py @@ -9,17 +9,21 @@ class CharacterScraper(BaseScraper): - def __init__(self, args: Dict[str, Any]) -> None: super().__init__(args) - self.series_name: str = self._args.get('series_name', '') + self.__series_name: str = self._args.get('series_name', '') def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: - characters = self.llm.extract_characters(scraped_pages, self.series_name) + characters = self.llm.extract_characters(scraped_pages, self.__series_name) + if not characters: self.logger.error('LLM failed to extract any character data') return - result = {'sources': [item['url'] for item in scraped_pages], 'characters': [char.model_dump() for char in characters]} - self._save_result(result) - console.print(f'[green]Extracted {len(characters)} characters[/green]') - console.print(f'[green]Saved to: {self.output_file}[/green]') + + payload = { + 'sources': [p['url'] for p in scraped_pages], + 'characters': [c.model_dump() for c in characters], + } + + self._save_result(payload) + console.print(f'[green]Extracted {len(characters)} characters. 
Saved to: {self.output_file}[/green]') diff --git a/preprocessor/services/scraping/clipboard.py b/preprocessor/services/scraping/clipboard.py index 2762be672..ab0c4ae15 100644 --- a/preprocessor/services/scraping/clipboard.py +++ b/preprocessor/services/scraping/clipboard.py @@ -1,7 +1,4 @@ -from typing import ( - List, - Optional, -) +from typing import Optional from patchright.sync_api import sync_playwright @@ -9,22 +6,25 @@ class ScraperClipboard: - _BROWSER_ARGS: List[str] = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] + __BROWSER_ARGS = ['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'] @staticmethod - def scrape(url: str, headless: bool=True, logger: Optional[ErrorHandlingLogger]=None) -> Optional[str]: + def scrape(url: str, headless: bool = True, logger: Optional[ErrorHandlingLogger] = None) -> Optional[str]: try: with sync_playwright() as p: - browser = p.chromium.launch(headless=headless, args=ScraperClipboard._BROWSER_ARGS) + browser = p.chromium.launch(headless=headless, args=ScraperClipboard.__BROWSER_ARGS) context = browser.new_context() page = context.new_page() + page.goto(url, wait_until='networkidle', timeout=30000) + page.keyboard.press('Control+A') page.keyboard.press('Control+C') - clipboard_text = page.evaluate('navigator.clipboard.readText()') + + content = page.evaluate('navigator.clipboard.readText()') browser.close() - return clipboard_text + return content except Exception as e: if logger: - logger.error(f'Clipboard scraping failed: {e}') + logger.error(f'Clipboard scraping failed for {url}: {e}') return None diff --git a/preprocessor/services/scraping/crawl4ai.py b/preprocessor/services/scraping/crawl4ai.py index 99b04f97e..4f1162e3b 100644 --- a/preprocessor/services/scraping/crawl4ai.py +++ b/preprocessor/services/scraping/crawl4ai.py @@ -14,41 +14,52 @@ class ScraperCrawl4AI: - @staticmethod - def scrape(url: str, save_markdown: bool=False, output_dir: Optional[Path]=None, logger: 
Optional[ErrorHandlingLogger]=None) -> Optional[str]: + def scrape( + url: str, + save_markdown: bool = False, + output_dir: Optional[Path] = None, + logger: Optional[ErrorHandlingLogger] = None, + ) -> Optional[str]: return asyncio.run(ScraperCrawl4AI.__scrape_async(url, save_markdown, output_dir, logger)) @staticmethod - def __sanitize_url_to_filename(url: str) -> str: - return sanitize_filename(url.replace('://', '_').replace('/', '_')) - - @staticmethod - def __save_markdown(content: str, url: str, output_dir: Path, logger: Optional[ErrorHandlingLogger]=None) -> None: - output_dir.mkdir(parents=True, exist_ok=True) - filename = ScraperCrawl4AI.__sanitize_url_to_filename(url) - md_file = output_dir / f'{filename}.md' - with open(md_file, 'w', encoding='utf-8') as f: - f.write(content) - if logger: - logger.info(f'Saved markdown to: {md_file}') - - @staticmethod - async def __scrape_async(url: str, save_markdown: bool=False, output_dir: Optional[Path]=None, logger: Optional[ErrorHandlingLogger]=None) -> Optional[str]: + async def __scrape_async( + url: str, + save_markdown: bool, + output_dir: Optional[Path], + logger: Optional[ErrorHandlingLogger], + ) -> Optional[str]: try: - ua = ua_generator.generate() - browser_config = BrowserConfig(headless=True, enable_stealth=True, viewport_width=1920, viewport_height=1080, user_agent=str(ua)) + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + viewport_width=1920, + viewport_height=1080, + user_agent=str(ua_generator.generate()), + ) run_config = CrawlerRunConfig(wait_until='networkidle', page_timeout=60000, delay_before_return_html=2.0) + async with AsyncWebCrawler(config=browser_config) as crawler: result = await crawler.arun(url=url, config=run_config) if result.success: if save_markdown and output_dir: - ScraperCrawl4AI.__save_markdown(result.markdown, url, output_dir, logger) + ScraperCrawl4AI.__persist_markdown(result.markdown, url, output_dir, logger) return result.markdown + if logger: - 
logger.error(f'Crawl4AI failed: {result.error_message}') - return None + logger.error(f'Crawl4AI failed for {url}: {result.error_message}') except Exception as e: if logger: - logger.error(f'Crawl4AI error: {e}') - return None + logger.error(f'Crawl4AI exception: {e}') + return None + + @staticmethod + def __persist_markdown(content: str, url: str, output_dir: Path, logger: Optional[ErrorHandlingLogger]) -> None: + output_dir.mkdir(parents=True, exist_ok=True) + safe_name = sanitize_filename(url.replace('://', '_').replace('/', '_')) + path = output_dir / f'{safe_name}.md' + + path.write_text(content, encoding='utf-8') + if logger: + logger.info(f'Saved markdown: {path}') diff --git a/preprocessor/services/scraping/episode_scraper.py b/preprocessor/services/scraping/episode_scraper.py index 166652bbc..d56de3902 100644 --- a/preprocessor/services/scraping/episode_scraper.py +++ b/preprocessor/services/scraping/episode_scraper.py @@ -12,25 +12,46 @@ class EpisodeScraper(BaseScraper): - def __init__(self, args: Dict[str, Any]) -> None: super().__init__(args) - self.merge_sources: bool = self._args.get('merge_sources', True) - self.expected_episodes_count: Optional[int] = self._args.get('expected_episodes_count') - self.videos_dir: Optional[Path] = self._args.get('videos_dir') + self.__merge_sources: bool = self._args.get('merge_sources', True) # pylint: disable=unused-private-member + self.__expected_episodes_count: Optional[int] = self._args.get('expected_episodes_count') + self.__videos_dir: Optional[Path] = self._args.get('videos_dir') def _process_scraped_pages(self, scraped_pages: List[Dict[str, Any]]) -> None: all_seasons = self.llm.extract_all_seasons(scraped_pages) if not all_seasons: self.logger.error('LLM failed to extract any season data') return - result = {'sources': [item['url'] for item in scraped_pages], 'seasons': [season.model_dump() for season in all_seasons]} + + result = { + 'sources': [item['url'] for item in scraped_pages], + 'seasons': 
[season.model_dump() for season in all_seasons], + } self._save_result(result) - total_episodes = sum((len(season.episodes) for season in all_seasons)) + + total_episodes = sum(len(season.episodes) for season in all_seasons) console.print(f'[green]Extracted {len(all_seasons)} seasons, {total_episodes} episodes[/green]') console.print(f'[green]Saved to: {self.output_file}[/green]') + self.__validate_and_report_coverage(total_episodes) + def __validate_and_report_coverage(self, scraped_count: int) -> None: + expected_count = self.__get_expected_episodes_count() + if expected_count is None: + self.__print_no_validation_warning(scraped_count) + return + + status, message = self.__evaluate_coverage_status(scraped_count, expected_count) + self.__print_coverage_report(scraped_count, expected_count, status, message) + + def __get_expected_episodes_count(self) -> Optional[int]: + if self.__expected_episodes_count is not None: + return self.__expected_episodes_count + if self.__videos_dir and self.__videos_dir.exists(): + return self.__count_video_files(self.__videos_dir) + return None + def __count_video_files(self, directory: Path) -> int: count = 0 for ext in self.SUPPORTED_VIDEO_EXTENSIONS: @@ -38,31 +59,24 @@ def __count_video_files(self, directory: Path) -> int: return count @staticmethod - def __get_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: + def __evaluate_coverage_status(scraped: int, expected: int) -> Tuple[str, str]: if scraped < expected: return 'missing', f'Missing {expected - scraped} episodes' if scraped > expected: return 'extra', f'Scraped {scraped - expected} more episodes than video files' return 'perfect', 'Perfect coverage' - def __get_expected_episodes_count(self) -> Optional[int]: - if self.expected_episodes_count is not None: - return self.expected_episodes_count - if self.videos_dir and self.videos_dir.exists(): - return self.__count_video_files(self.videos_dir) - return None - @staticmethod def __print_coverage_report(scraped: 
int, expected: int, status: str, message: str) -> None: - coverage_pct = scraped / expected * 100 if expected > 0 else 0 + coverage_pct = (scraped / expected * 100) if expected > 0 else 0 console.print('\n[yellow]Episode coverage validation:[/yellow]') console.print(f' [cyan]Scraped episodes: {scraped}[/cyan]') console.print(f' [cyan]Video files found: {expected}[/cyan]') console.print(f' [cyan]Coverage: {coverage_pct:.1f}%[/cyan]') + if status == 'missing': console.print(f'\n[red]WARNING: {message}![/red]') console.print(' [yellow]Consider adding more URLs to --scrape-urls[/yellow]') - console.print(' [dim]Not all video files will have metadata available[/dim]\n') elif status == 'extra': console.print(f'\n[yellow]Note: {message}[/yellow]') console.print(' [dim]This is OK if you plan to add more videos later[/dim]\n') @@ -74,13 +88,4 @@ def __print_no_validation_warning(scraped_count: int) -> None: console.print('\n[yellow]Coverage validation:[/yellow]') console.print(f' [cyan]Scraped episodes: {scraped_count}[/cyan]') console.print(' [yellow]No video directory provided - unable to validate coverage[/yellow]') - console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]') - console.print(' [dim]You can add more --scrape-urls if needed[/dim]\n') - - def __validate_and_report_coverage(self, scraped_episodes_count: int) -> None: - expected_count = self.__get_expected_episodes_count() - if expected_count is None: - self.__print_no_validation_warning(scraped_episodes_count) - return - status, message = self.__get_coverage_status(scraped_episodes_count, expected_count) - self.__print_coverage_report(scraped_episodes_count, expected_count, status, message) + console.print(' [dim]Make sure the scraped episodes cover all your video files[/dim]\n') diff --git a/preprocessor/services/scraping/grid_visualizer.py b/preprocessor/services/scraping/grid_visualizer.py index fed26d590..8f73ccec5 100644 --- a/preprocessor/services/scraping/grid_visualizer.py 
+++ b/preprocessor/services/scraping/grid_visualizer.py @@ -16,7 +16,7 @@ from preprocessor.config.settings_instance import settings -@dataclass +@dataclass(frozen=True) class GridDimensions: face_size: int = 280 faces_per_char: int = 3 @@ -36,236 +36,168 @@ def row_height(self) -> int: return self.face_size + self.padding * 2 def total_height(self, num_chars: int) -> int: - return self.header_height + num_chars * self.row_height + self.footer_height + return self.header_height + (num_chars * self.row_height) + self.footer_height def total_width(self) -> int: return ( - self.label_col_width - + self.stats_col_width - + self.faces_per_char * self.face_col_width - + self.padding * 2 + self.label_col_width + + self.stats_col_width + + (self.faces_per_char * self.face_col_width) + + (self.padding * 2) ) class CharacterGridVisualizer: def __init__( - self, - dimensions: Optional[GridDimensions] = None, - similarity_threshold: float = 0.5, + self, + dimensions: Optional[GridDimensions] = None, + similarity_threshold: float = 0.5, ) -> None: - self._dims = dimensions or GridDimensions() - self._similarity_threshold = similarity_threshold + self.__dims = dimensions or GridDimensions() + self.__similarity_threshold = similarity_threshold def generate_grid( - self, - processed_chars_dir: Path, - output_path: Path, + self, + processed_chars_dir: Path, + output_path: Path, ) -> Dict[str, Any]: - processed_chars = sorted([d for d in processed_chars_dir.iterdir() if d.is_dir()]) + processed_chars = self.__get_processed_characters(processed_chars_dir) if not processed_chars: - return { - 'width': 0, - 'height': 0, - 'num_chars': 0, - 'avg_similarity': 0.0, - } - - canvas = self.__create_canvas(processed_chars) + return self.__empty_result() + metadata_all = self.__load_all_metadata(processed_chars) avg_similarity = self.__calculate_avg_similarity(metadata_all) - canvas = self.__render_header(canvas, len(processed_chars), avg_similarity) - canvas = 
self.__render_table_headers(canvas) - canvas = self.__render_character_rows(canvas, processed_chars) - canvas = self.__render_footer(canvas) + canvas = self.__create_canvas(len(processed_chars)) + self.__render_header(canvas, len(processed_chars), avg_similarity) + self.__render_table_headers(canvas) + self.__render_character_rows(canvas, processed_chars) + self.__render_footer(canvas) - cv2.imwrite( - str(output_path), - canvas, - [cv2.IMWRITE_PNG_COMPRESSION, 6], - ) + self.__save_grid_image(canvas, output_path) return { - 'width': self._dims.total_width(), - 'height': self._dims.total_height(len(processed_chars)), + 'width': self.__dims.total_width(), + 'height': self.__dims.total_height(len(processed_chars)), 'num_chars': len(processed_chars), 'avg_similarity': avg_similarity, } - def __create_canvas(self, processed_chars: List[Path]) -> np.ndarray: - grid_width = self._dims.total_width() - grid_height = self._dims.total_height(len(processed_chars)) + def __empty_result(self) -> Dict[str, Any]: + return { + 'width': 0, + 'height': 0, + 'num_chars': 0, + 'avg_similarity': 0.0, + } + + def __get_processed_characters(self, dir_path: Path) -> List[Path]: + return sorted([d for d in dir_path.iterdir() if d.is_dir()]) + + def __create_canvas(self, num_chars: int) -> np.ndarray: + grid_width = self.__dims.total_width() + grid_height = self.__dims.total_height(num_chars) bg_color = (250, 252, 255) return np.full((grid_height, grid_width, 3), bg_color, dtype=np.uint8) - def __render_header( - self, - canvas: np.ndarray, - total_chars: int, - avg_similarity: float, - ) -> np.ndarray: + def __render_header(self, canvas: np.ndarray, total_chars: int, avg_similarity: float) -> None: header_bg_color = (45, 55, 72) cv2.rectangle( canvas, (0, 0), - (self._dims.total_width(), self._dims.header_height), + (self.__dims.total_width(), self.__dims.header_height), header_bg_color, -1, ) - title_text = 'FACIAL REFERENCE VALIDATION REPORT' + title_pos = (self.__dims.padding * 3, 50) 
cv2.putText( - canvas, - title_text, - (self._dims.padding * 3, 50), - cv2.FONT_HERSHEY_DUPLEX, - 1.1, - (255, 255, 255), - 2, - cv2.LINE_AA, + canvas, 'FACIAL REFERENCE VALIDATION REPORT', title_pos, + cv2.FONT_HERSHEY_DUPLEX, 1.1, (255, 255, 255), 2, cv2.LINE_AA, ) - subtitle = 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis' + subtitle_pos = (self.__dims.padding * 3, 85) cv2.putText( - canvas, - subtitle, - (self._dims.padding * 3, 85), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (200, 210, 220), - 1, - cv2.LINE_AA, + canvas, 'InsightFace Buffalo-L Model | Face Vector Extraction & Similarity Analysis', + subtitle_pos, cv2.FONT_HERSHEY_SIMPLEX, 0.55, (200, 210, 220), 1, cv2.LINE_AA, ) stats_y = 115 stats_items = [ f'Total Subjects: {total_chars}', f'Avg Similarity: {avg_similarity:.4f}', - f'Threshold: {self._similarity_threshold:.2f}', + f'Threshold: {self.__similarity_threshold:.2f}', ] + for idx, stat in enumerate(stats_items): - x_pos = self._dims.padding * 3 + idx * 280 + x_pos = self.__dims.padding * 3 + idx * 280 cv2.putText( - canvas, - stat, - (x_pos, stats_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.5, - (180, 200, 220), - 1, - cv2.LINE_AA, + canvas, stat, (x_pos, stats_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, (180, 200, 220), 1, cv2.LINE_AA, ) - return canvas - - def __render_table_headers(self, canvas: np.ndarray) -> np.ndarray: - table_header_y = self._dims.header_height + 1 + def __render_table_headers(self, canvas: np.ndarray) -> None: + table_header_y = self.__dims.header_height + 1 cv2.line( - canvas, - (0, table_header_y), - (self._dims.total_width(), table_header_y), - (180, 190, 200), - 2, + canvas, (0, table_header_y), (self.__dims.total_width(), table_header_y), + (180, 190, 200), 2, ) + base_stats_x = self.__dims.label_col_width + base_face_x = base_stats_x + self.__dims.stats_col_width + half_face_col = self.__dims.face_col_width // 2 + col_headers = [ - ('CHARACTER NAME', self._dims.label_col_width // 2, 0), - ('STATISTICS', 
self._dims.label_col_width + self._dims.stats_col_width // 2, 0), - ( - 'REFERENCE IMAGE 1', - self._dims.label_col_width + self._dims.stats_col_width + self._dims.face_col_width // 2, - 0, - ), - ( - 'REFERENCE IMAGE 2', - self._dims.label_col_width + self._dims.stats_col_width + self._dims.face_col_width * 3 // 2, - 0, - ), - ( - 'REFERENCE IMAGE 3', - self._dims.label_col_width + self._dims.stats_col_width + self._dims.face_col_width * 5 // 2, - 0, - ), + ('CHARACTER NAME', self.__dims.label_col_width // 2), + ('STATISTICS', base_stats_x + self.__dims.stats_col_width // 2), + ('REFERENCE IMAGE 1', base_face_x + half_face_col), + ('REFERENCE IMAGE 2', base_face_x + self.__dims.face_col_width + half_face_col), + ('REFERENCE IMAGE 3', base_face_x + (self.__dims.face_col_width * 2) + half_face_col), ] - for text, x_center, _ in col_headers: + for text, x_center in col_headers: text_size = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 0.42, 1)[0] text_x = x_center - text_size[0] // 2 cv2.putText( - canvas, - text, - (text_x, table_header_y + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.42, - (60, 70, 85), - 1, - cv2.LINE_AA, + canvas, text, (text_x, table_header_y + 25), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, (60, 70, 85), 1, cv2.LINE_AA, ) - cv2.line( - canvas, - (0, table_header_y + self._dims.header_row_height), - (self._dims.total_width(), table_header_y + self._dims.header_row_height), - (200, 210, 220), - 1, - ) - - return canvas + line_y = table_header_y + self.__dims.header_row_height + cv2.line(canvas, (0, line_y), (self.__dims.total_width(), line_y), (200, 210, 220), 1) - def __render_character_rows( - self, - canvas: np.ndarray, - processed_chars: List[Path], - ) -> np.ndarray: - y_offset = self._dims.header_height + self._dims.header_row_height + self._dims.padding + def __render_character_rows(self, canvas: np.ndarray, processed_chars: List[Path]) -> None: + y_offset = self.__dims.header_height + self.__dims.header_row_height + self.__dims.padding bg_color = (250, 
252, 255) for idx, char_dir in enumerate(processed_chars): - self.__render_character_row(canvas, char_dir, idx, y_offset, bg_color) - y_offset += self._dims.row_height - - return canvas - - def __render_character_row( - self, - canvas: np.ndarray, - char_dir: Path, - row_idx: int, - y_offset: int, - bg_color: Tuple[int, int, int], + self.__render_single_row(canvas, char_dir, idx, y_offset, bg_color) + y_offset += self.__dims.row_height + + def __render_single_row( + self, canvas: np.ndarray, char_dir: Path, row_idx: int, y_offset: int, bg_color: Tuple[int, int, int], ) -> None: - char_name = char_dir.name.replace('_', ' ').title() row_bg = (245, 248, 252) if row_idx % 2 == 0 else bg_color cv2.rectangle( canvas, - (0, y_offset - self._dims.padding), - (self._dims.total_width(), y_offset + self._dims.face_size + self._dims.padding), - row_bg, - -1, + (0, y_offset - self.__dims.padding), + (self.__dims.total_width(), y_offset + self.__dims.face_size + self.__dims.padding), + row_bg, -1, ) + char_name = char_dir.name.replace('_', ' ').title() cv2.putText( - canvas, - char_name, - (self._dims.padding * 2, y_offset + self._dims.face_size // 2), - cv2.FONT_HERSHEY_SIMPLEX, - 0.55, - (30, 40, 50), - 1, - cv2.LINE_AA, + canvas, char_name, + (self.__dims.padding * 2, y_offset + self.__dims.face_size // 2), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, (30, 40, 50), 1, cv2.LINE_AA, ) self.__render_character_stats(canvas, char_dir, y_offset) self.__render_character_faces(canvas, char_dir, y_offset) - def __render_character_stats( - self, - canvas: np.ndarray, - char_dir: Path, - y_offset: int, - ) -> None: + def __render_character_stats(self, canvas: np.ndarray, char_dir: Path, y_offset: int) -> None: metadata_file = char_dir / 'metadata.json' if not metadata_file.exists(): return @@ -277,125 +209,96 @@ def __render_character_stats( method = metadata.get('detection_stats', {}).get('selection_method', 'unknown') faces_detected = metadata.get('detection_stats', 
{}).get('total_faces_detected', []) - stats_x = self._dims.label_col_width + self._dims.padding - stats_y_base = y_offset + self._dims.face_size // 2 - 30 + stats_x = self.__dims.label_col_width + self.__dims.padding + stats_y_base = y_offset + self.__dims.face_size // 2 - 30 - sim_color = (0, 150, 0) if similarity >= self._similarity_threshold else (180, 100, 0) + sim_color = (0, 150, 0) if similarity >= self.__similarity_threshold else (180, 100, 0) cv2.putText( - canvas, - f'Similarity: {similarity:.4f}', - (stats_x, stats_y_base), - cv2.FONT_HERSHEY_SIMPLEX, - 0.45, - sim_color, - 1, - cv2.LINE_AA, + canvas, f'Similarity: {similarity:.4f}', (stats_x, stats_y_base), + cv2.FONT_HERSHEY_SIMPLEX, 0.45, sim_color, 1, cv2.LINE_AA, ) method_color = (50, 120, 200) if method == 'automatic' else (180, 100, 50) cv2.putText( - canvas, - f'Method: {method}', - (stats_x, stats_y_base + 25), - cv2.FONT_HERSHEY_SIMPLEX, - 0.42, - method_color, - 1, - cv2.LINE_AA, + canvas, f'Method: {method}', (stats_x, stats_y_base + 25), + cv2.FONT_HERSHEY_SIMPLEX, 0.42, method_color, 1, cv2.LINE_AA, ) faces_str = str(faces_detected) if len(str(faces_detected)) < 20 else f'[{len(faces_detected)} imgs]' cv2.putText( - canvas, - f'Detected: {faces_str}', - (stats_x, stats_y_base + 50), - cv2.FONT_HERSHEY_SIMPLEX, - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, + canvas, f'Detected: {faces_str}', (stats_x, stats_y_base + 50), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, cv2.LINE_AA, ) - def __render_character_faces( - self, - canvas: np.ndarray, - char_dir: Path, - y_offset: int, - ) -> None: + def __render_character_faces(self, canvas: np.ndarray, char_dir: Path, y_offset: int) -> None: face_files = sorted(char_dir.glob('face_*.jpg')) - for face_idx, face_file in enumerate(face_files[:self._dims.faces_per_char]): + + for face_idx, face_file in enumerate(face_files[:self.__dims.faces_per_char]): face_img = cv2.imread(str(face_file)) if face_img is None: continue - face_resized = 
self._safe_resize(face_img, (self._dims.face_size, self._dims.face_size)) + face_resized = CharacterGridVisualizer._safe_resize( + face_img, + (self.__dims.face_size, self.__dims.face_size), + ) if face_resized is None: continue x = ( - self._dims.label_col_width - + self._dims.stats_col_width - + face_idx * self._dims.face_col_width - + self._dims.padding + self.__dims.label_col_width + + self.__dims.stats_col_width + + face_idx * self.__dims.face_col_width + + self.__dims.padding ) - canvas[y_offset:y_offset + self._dims.face_size, x:x + self._dims.face_size] = face_resized + + canvas[y_offset:y_offset + self.__dims.face_size, x:x + self.__dims.face_size] = face_resized cv2.rectangle( canvas, (x - 1, y_offset - 1), - (x + self._dims.face_size + 1, y_offset + self._dims.face_size + 1), - (180, 190, 200), - 1, + (x + self.__dims.face_size + 1, y_offset + self.__dims.face_size + 1), + (180, 190, 200), 1, ) - def __render_footer(self, canvas: np.ndarray) -> np.ndarray: + def __render_footer(self, canvas: np.ndarray) -> None: grid_height = canvas.shape[0] - footer_y = grid_height - self._dims.footer_height + 20 + footer_y = grid_height - self.__dims.footer_height + 20 + cv2.line( - canvas, - (0, footer_y - 20), - (self._dims.total_width(), footer_y - 20), - (200, 210, 220), - 1, + canvas, (0, footer_y - 20), (self.__dims.total_width(), footer_y - 20), + (200, 210, 220), 1, ) + norm_size = settings.character.normalized_face_size footer_text = ( f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')} | " f"Model: {settings.face_recognition.model_name} | " - f"Normalized Size: {settings.character.normalized_face_size[0]}x" - f"{settings.character.normalized_face_size[1]}px" + f"Normalized Size: {norm_size[0]}x{norm_size[1]}px" ) cv2.putText( - canvas, - footer_text, - (self._dims.padding * 3, footer_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.4, - (120, 130, 140), - 1, - cv2.LINE_AA, + canvas, footer_text, (self.__dims.padding * 3, footer_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.4, 
(120, 130, 140), 1, cv2.LINE_AA, ) + # Legend legend_y = footer_y + 30 legend_items = [ ('Automatic: Face found on all references', (50, 120, 200)), ('Manual: User-selected reference', (180, 100, 50)), ] + for idx, (text, color) in enumerate(legend_items): - x_pos = self._dims.padding * 3 + idx * 380 + x_pos = self.__dims.padding * 3 + idx * 380 cv2.circle(canvas, (x_pos, legend_y - 3), 5, color, -1) cv2.putText( - canvas, - text, - (x_pos + 15, legend_y), - cv2.FONT_HERSHEY_SIMPLEX, - 0.38, - (100, 110, 120), - 1, - cv2.LINE_AA, + canvas, text, (x_pos + 15, legend_y), + cv2.FONT_HERSHEY_SIMPLEX, 0.38, (100, 110, 120), 1, cv2.LINE_AA, ) - return canvas + @staticmethod + def __save_grid_image(canvas: np.ndarray, output_path: Path) -> None: + cv2.imwrite(str(output_path), canvas, [cv2.IMWRITE_PNG_COMPRESSION, 6]) @staticmethod def __load_all_metadata(processed_chars: List[Path]) -> List[Dict[str, Any]]: diff --git a/preprocessor/services/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py index af6e8f246..2757264f6 100644 --- a/preprocessor/services/scraping/reference_processor.py +++ b/preprocessor/services/scraping/reference_processor.py @@ -29,457 +29,210 @@ from preprocessor.services.scraping.grid_visualizer import CharacterGridVisualizer from preprocessor.services.ui.console import console -warnings.filterwarnings('ignore', message='.*estimate.*is deprecated.*', category=FutureWarning, module='insightface') +warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') -class CharacterReferenceProcessor(BaseProcessor): +class CharacterReferenceProcessor(BaseProcessor): def __init__(self, args: Dict[str, Any]) -> None: - super().__init__(args=args, class_name='CharacterReferenceProcessor', error_exit_code=20, loglevel=logging.INFO) - self.characters_dir = args['characters_dir'] - self.output_dir = args['output_dir'] - self.similarity_threshold = args['similarity_threshold'] - self.interactive = 
args['interactive'] - self.face_app: Optional[FaceAnalysis] = None - self._visualizer = CharacterGridVisualizer(similarity_threshold=self.similarity_threshold) - - def generate_validation_grid(self) -> None: - output_path = self.output_dir / 'validation_grid.png' - if output_path.exists(): - console.print(f'[dim]Skipping validation grid (already exists): {output_path}[/dim]') - return - - console.print('\n[blue]Generating validation grid...[/blue]') - - if not self.output_dir.exists(): - console.print('[yellow]No processed references found, skipping validation grid[/yellow]') - return - - processed_chars = sorted([d for d in self.output_dir.iterdir() if d.is_dir()]) - if not processed_chars: - console.print('[yellow]No processed characters found, skipping validation grid[/yellow]') - return - - stats = self._visualizer.generate_grid( - processed_chars_dir=self.output_dir, - output_path=output_path, + super().__init__( + args=args, + class_name='CharacterReferenceProcessor', + error_exit_code=20, + loglevel=logging.INFO, ) + self.__characters_dir: Path = args['characters_dir'] + self.__output_dir: Path = args['output_dir'] + self.__similarity_threshold: float = args['similarity_threshold'] + self.__interactive: bool = args['interactive'] - console.print(f'[green]Validation grid saved to: {output_path}[/green]') - console.print(f'[green] Grid size: {stats["width"]}x{stats["height"]}px[/green]') - console.print(f'[green] Characters: {stats["num_chars"]}[/green]') - console.print(f'[green] Average similarity: {stats["avg_similarity"]:.4f}[/green]') + self.__face_app: Optional[FaceAnalysis] = None + self.__visualizer = CharacterGridVisualizer(similarity_threshold=self.__similarity_threshold) def get_output_subdir(self) -> str: return 'character_references' def _execute(self) -> None: super()._execute() - self.generate_validation_grid() + self.__generate_validation_grid() def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: - char_output_dir = 
self.output_dir / item.episode_id + char_output_dir = self.__output_dir / item.episode_id return [ OutputSpec(path=char_output_dir / 'metadata.json', required=True), OutputSpec(path=char_output_dir / 'face_vector.npy', required=True), ] def _get_processing_items(self) -> List[ProcessingItem]: - items = [] - if not self.characters_dir.exists(): - console.print(f'[red]Characters directory not found: {self.characters_dir}[/red]') - return items - for char_dir in sorted(self.characters_dir.iterdir()): - if not char_dir.is_dir(): - continue - items.append(ProcessingItem(episode_id=char_dir.name, input_path=char_dir, metadata={'char_name': char_dir.name})) - return items + if not self.__characters_dir.exists(): + console.print(f'[red]Characters directory not found: {self.__characters_dir}[/red]') + return [] + + return [ + ProcessingItem( + episode_id=char_dir.name, + input_path=char_dir, + metadata={'char_name': char_dir.name}, + ) + for char_dir in sorted(self.__characters_dir.iterdir()) if char_dir.is_dir() + ] def _get_progress_description(self) -> str: return 'Processing character references' def _load_resources(self) -> bool: - self.face_app = FaceDetector.init() + self.__face_app = FaceDetector.init() return True + def _validate_args(self, args: Dict[str, Any]) -> None: + required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] + for key in required: + if key not in args: + raise ValueError(f'Missing required argument: {key}') + def _process_item(self, item: ProcessingItem, _missing_outputs: List[OutputSpec]) -> None: char_dir = item.input_path char_name = item.metadata['char_name'] console.print(f'[blue]Processing character: {char_name}[/blue]') - reference_images = sorted(char_dir.glob('*.jpg')) - if len(reference_images) < 2: - console.print(f'[yellow]Skipping {char_name}: need at least 2 reference images, found {len(reference_images)}[/yellow]') + + ref_images = sorted(char_dir.glob('*.jpg')) + if len(ref_images) < 2: + 
console.print(f'[yellow]Skipping {char_name}: need >=2 images, found {len(ref_images)}[/yellow]') return - all_faces = self.__detect_faces_in_references(reference_images) + + all_faces = self.__detect_faces_in_references(ref_images) if not all_faces or not all_faces[0]: - console.print(f'[yellow]Skipping {char_name}: no faces detected in reference images[/yellow]') + console.print(f'[yellow]Skipping {char_name}: no faces detected[/yellow]') return - selected_faces = self.__find_common_face(all_faces, char_name, reference_images) + + selected_faces = self.__find_common_face(all_faces, char_name, ref_images) if not selected_faces: console.print(f'[yellow]Skipping {char_name}: could not identify common face[/yellow]') return - self.__save_processed_references(char_name, selected_faces, reference_images) + + self.__save_processed_references(char_name, selected_faces, ref_images) console.print(f'[green]Processed {char_name}[/green]') - def _validate_args(self, args: Dict[str, Any]) -> None: - required = ['characters_dir', 'output_dir', 'similarity_threshold', 'interactive'] - for key in required: - if key not in args: - raise ValueError(f'Missing required argument: {key}') + def __generate_validation_grid(self) -> None: + output_path = self.__output_dir / 'validation_grid.png' + if output_path.exists(): + console.print(f'[dim]Skipping validation grid (exists): {output_path}[/dim]') + return - def __ask_user_to_select_candidate( - self, - candidates: List[CandidateFace], - char_name: str, - ) -> Optional[List[FaceData]]: - console.print(f'[yellow]Character: {char_name}[/yellow]') - console.print(f'[yellow]Found {len(candidates)} possible matches across all reference images.[/yellow]') - for idx, candidate in enumerate(candidates, 1): - console.print(f'Candidate {idx}: avg similarity = {candidate.avg_similarity:.2f}') - grid_path = self.__create_selection_grid(candidates, 'candidates', char_name) - console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') - while 
True: - prompt = f'Select the correct character (1-{len(candidates)}) or skip (s): ' - user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin - if user_input == 's': - return None - try: - selection = int(user_input) - if 1 <= selection <= len(candidates): - return candidates[selection - 1].faces - console.print(f"[red]Invalid selection. Please enter 1-{len(candidates)} or 's'[/red]") - except ValueError: - console.print("[red]Invalid input. Please enter a number or 's'[/red]") - - def __ask_user_to_select_initial_face( - self, - first_image_faces: List[FaceData], - all_faces: List[List[FaceData]], - char_name: str, - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - console.print(f'[yellow]Character: {char_name}[/yellow]') - console.print('[yellow]No common face found across all reference images.[/yellow]') - console.print( - '[yellow]Manual selection mode: Please select the correct face ' - 'from the first image.[/yellow]', - ) - console.print( - f'[yellow]Found {len(first_image_faces)} faces in ' - 'first reference image.[/yellow]', + processed_chars = sorted([d for d in self.__output_dir.iterdir() if d.is_dir()]) + if not processed_chars: + return + + stats = self.__visualizer.generate_grid( + processed_chars_dir=self.__output_dir, + output_path=output_path, ) - grid_path = self.__create_selection_grid(first_image_faces, 'manual', char_name) - console.print(f'[blue]Grid image saved to: {grid_path}[/blue]') - while True: - prompt = f'Select the correct face (1-{len(first_image_faces)}) or skip (s): ' - user_input = input(prompt).strip().lower() # pylint: disable=bad-builtin - if user_input == 's': - return None - try: - selection = int(user_input) - if 1 <= selection <= len(first_image_faces): - selected_face = first_image_faces[selection - 1] - return self.__find_matching_faces_for_reference( - selected_face.face_vector, - all_faces[1:], - [selected_face], - reference_images, - ) - console.print( - f"[red]Invalid selection. 
Please enter 1-{len(first_image_faces)} or 's'[/red]", - ) - except ValueError: - console.print("[red]Invalid input. Please enter a number or 's'[/red]") - - def __create_selection_grid(self, data, mode: str, char_name: str) -> Path: - if mode == 'candidates': - grid = self.__create_candidates_grid(data) - else: - grid = self.__create_manual_selection_grid(data) - - selection_grids_dir = self.output_dir.parent / 'character_selection_grids' - selection_grids_dir.mkdir(parents=True, exist_ok=True) - output_path = selection_grids_dir / f"{char_name.replace(' ', '_').lower()}_selection.jpg" - cv2.imwrite(str(output_path), grid) - return output_path - - @staticmethod - def __create_candidates_grid(candidates: List[CandidateFace]) -> np.ndarray: - num_refs = len(candidates[0].faces) - num_candidates = len(candidates) - face_size = 150 - padding = 10 - label_height = 30 - grid_width = num_refs * (face_size + padding) + padding - grid_height = num_candidates * (face_size + label_height + padding) + padding + label_height - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - - for col_idx in range(num_refs): - label = f'Ref {col_idx + 1}' - x = padding + col_idx * (face_size + padding) - cv2.putText(grid, label, (x + 10, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0), 1) - - for cand_idx, candidate in enumerate(candidates): - y_base = label_height + padding + cand_idx * (face_size + label_height + padding) - for face_idx, face_data in enumerate(candidate.faces): - x = padding + face_idx * (face_size + padding) - face_resized = CharacterGridVisualizer._safe_resize(face_data.face_img, (face_size, face_size)) - if face_resized is not None: - grid[y_base:y_base + face_size, x:x + face_size] = face_resized - - label = f'Candidate {cand_idx + 1}' - cv2.putText(grid, label, (5, y_base + face_size // 2), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (0, 0, 255), 1) - - return grid - - @staticmethod - def __create_manual_selection_grid(faces_data: List[FaceData]) -> np.ndarray: - 
num_faces = len(faces_data) - cols = min(3, num_faces) - rows = (num_faces + cols - 1) // cols - face_size = 150 - padding = 10 - grid_width = cols * (face_size + padding) + padding - grid_height = rows * (face_size + padding) + padding - grid = np.ones((grid_height, grid_width, 3), dtype=np.uint8) * 255 - - for idx, face_data in enumerate(faces_data): - row = idx // cols - col = idx % cols - x = padding + col * (face_size + padding) - y = padding + row * (face_size + padding) - face_resized = CharacterGridVisualizer._safe_resize(face_data.face_img, (face_size, face_size)) - if face_resized is not None: - grid[y:y + face_size, x:x + face_size] = face_resized - - label = str(idx + 1) - cv2.putText(grid, label, (x + 5, y + 20), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 255), 2) - - return grid + + console.print(f'\n[green]Validation grid saved to: {output_path}[/green]') + console.print(f'[green] Size: {stats["width"]}x{stats["height"]}px | Chars: {stats["num_chars"]}[/green]') def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[FaceData]]: all_faces = [] for idx, img_path in enumerate(image_paths): img = cv2.imread(str(img_path)) if img is None: - console.print(f'[yellow]Warning: Could not read {img_path}[/yellow]') all_faces.append([]) continue - console.print(f'[dim] {img_path.name}: detecting faces (image size: {img.shape[1]}x{img.shape[0]})...[/dim]') - faces = self.face_app.get(img) - console.print(f'[dim] Found {len(faces)} face(s)[/dim]') - faces_data = [] - for face in faces: - bbox = face.bbox.astype(int) - x1, y1, x2, y2 = bbox - face_img = img[y1:y2, x1:x2] - faces_data.append( - FaceData( - bbox=bbox, - face_vector=face.normed_embedding, - source_image_path=img_path, - source_image_idx=idx, - face_img=face_img, - ), - ) + + faces = self.__face_app.get(img) + faces_data = [ + FaceData( + bbox=(bbox := face.bbox.astype(int)), + face_vector=face.normed_embedding, + source_image_path=img_path, + source_image_idx=idx, + 
face_img=img[bbox[1]:bbox[3], bbox[0]:bbox[2]], + ) for face in faces + ] all_faces.append(faces_data) return all_faces def __find_common_face( - self, - all_faces: List[List[FaceData]], - char_name: str, - reference_images: List[Path], + self, + all_faces: List[List[FaceData]], + char_name: str, # pylint: disable=unused-argument + ref_images: List[Path], # pylint: disable=unused-argument ) -> Optional[List[FaceData]]: - first_image_faces = all_faces[0] - remaining_images = all_faces[1:] + first_faces = all_faces[0] + candidates = self.__find_face_candidates(first_faces, all_faces[1:], all_faces) - candidates = self.__find_all_face_candidates(first_image_faces, remaining_images, all_faces) - return self.__select_final_candidate( - candidates, first_image_faces, all_faces, char_name, reference_images, - ) + if len(candidates) == 1: + return candidates[0].faces - def __find_all_face_candidates( - self, - first_image_faces: List[FaceData], - remaining_images: List[List[FaceData]], - all_faces: List[List[FaceData]], - ) -> List[CandidateFace]: - candidates: List[CandidateFace] = [] - for first_face in first_image_faces: - matched_faces = [first_face] - similarities: List[float] = [] + if len(candidates) > 1 and not self.__interactive: + candidates.sort(key=lambda c: c.avg_similarity, reverse=True) + return candidates[0].faces - for other_image_faces in remaining_images: - if not other_image_faces: - break + return None - best_match, best_similarity = self.__find_best_matching_face( - first_face, other_image_faces, - ) + def __find_face_candidates( + self, first_faces: List[FaceData], remaining: List[List[FaceData]], all_faces: List[List[FaceData]], + ) -> List[CandidateFace]: + candidates = [] + for first_face in first_faces: + matched = [first_face] + sims = [] + for other_faces in remaining: + best_match, best_sim = self.__get_best_match(first_face, other_faces) if best_match: - matched_faces.append(best_match) - similarities.append(best_similarity) - 
self.__warn_if_low_similarity(best_similarity) + matched.append(best_match) + sims.append(best_sim) else: break - if len(matched_faces) == len(all_faces): - avg_similarity = float(np.mean(similarities)) if similarities else 1.0 - candidates.append(CandidateFace(faces=matched_faces, avg_similarity=avg_similarity)) + if len(matched) == len(all_faces): + candidates.append(CandidateFace(faces=matched, avg_similarity=float(np.mean(sims)))) return candidates - def __find_best_matching_face( - self, - reference_face: FaceData, - candidate_faces: List[FaceData], - ) -> Tuple[Optional[FaceData], float]: - best_match: Optional[FaceData] = None - best_similarity: float = -1.0 - - for candidate_face in candidate_faces: - similarity: float = float( - np.dot(reference_face.face_vector, candidate_face.face_vector), - ) - if similarity > best_similarity: - best_similarity = similarity - best_match = candidate_face - - return best_match, best_similarity - - def __warn_if_low_similarity(self, similarity: float) -> None: - if similarity < self.similarity_threshold: - console.print( - f'[yellow]Warning: Low similarity {similarity:.2f} < ' - f'{self.similarity_threshold:.2f}[/yellow]', - ) - - def __select_final_candidate( - self, - candidates: List[CandidateFace], - first_image_faces: List[FaceData], - all_faces: List[List[FaceData]], - char_name: str, - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - if len(candidates) == 0: - if self.interactive: - return self.__ask_user_to_select_initial_face( - first_image_faces, all_faces, char_name, reference_images, - ) - return None - - if len(candidates) == 1: - return candidates[0].faces - - if self.interactive: - return self.__ask_user_to_select_candidate(candidates, char_name) - - candidates.sort(key=lambda c: c.avg_similarity, reverse=True) - return candidates[0].faces - - def __find_matching_faces_for_reference( - self, - reference_vector: np.ndarray, - remaining_images: List[List[FaceData]], - matched_faces: 
List[FaceData], - reference_images: List[Path], - ) -> Optional[List[FaceData]]: - for img_idx, other_image_faces in enumerate(remaining_images, 1): - if not other_image_faces: - img_path = reference_images[img_idx] - console.print(f'[red]No faces found in image {img_idx + 1}: {img_path}[/red]') - return None - best_match = None - best_sim: float = -1.0 - for other_face in other_image_faces: - similarity: float = float(np.dot(reference_vector, other_face.face_vector)) - if similarity > best_sim: - best_sim = similarity - best_match = other_face - if best_match: - matched_faces.append(best_match) - if best_sim < self.similarity_threshold: - img_path = reference_images[img_idx] - console.print( - f'[yellow]Warning: Low similarity in image {img_idx + 1}: ' - f'{img_path} (similarity: {best_sim:.2f} < ' - f'threshold: {self.similarity_threshold:.2f})[/yellow]', - ) - else: - console.print( - f'[red]No faces detected in image {img_idx + 1}: ' - f'{reference_images[img_idx]}[/red]', - ) - return None - return matched_faces - - + def __get_best_match(self, ref_face: FaceData, candidates: List[FaceData]) -> Tuple[Optional[FaceData], float]: + best_match, best_sim = None, -1.0 + for cand in candidates: + sim = float(np.dot(ref_face.face_vector, cand.face_vector)) + if sim > best_sim: + best_sim = sim + best_match = cand + return best_match, best_sim def __save_processed_references( - self, - char_name: str, - selected_faces: List[FaceData], - reference_images: List[Path], + self, char_name: str, selected_faces: List[FaceData], ref_images: List[Path], ) -> None: - char_output_dir = self.output_dir / char_name - char_output_dir.mkdir(parents=True, exist_ok=True) + char_out = self.__output_dir / char_name + char_out.mkdir(parents=True, exist_ok=True) face_vectors = [] for idx, face_data in enumerate(selected_faces): - face_normalized = CharacterGridVisualizer._safe_resize( + norm_face = CharacterGridVisualizer._safe_resize( face_data.face_img, 
settings.character.normalized_face_size, ) - if face_normalized is None: - self.logger.warning(f'Skipping face {idx} for {char_name}: failed to resize (invalid dimensions)') - continue - face_output_path = char_output_dir / f'face_{idx:02d}.jpg' - cv2.imwrite(str(face_output_path), face_normalized) - face_vectors.append(face_data.face_vector) + if norm_face is not None: + cv2.imwrite(str(char_out / f'face_{idx:02d}.jpg'), norm_face) + face_vectors.append(face_data.face_vector) mean_vector = np.mean(face_vectors, axis=0) - np.save(char_output_dir / 'face_vector.npy', mean_vector) + np.save(char_out / 'face_vector.npy', mean_vector) - metadata = self.__create_reference_metadata( - char_name, selected_faces, reference_images, mean_vector, - ) - with open(char_output_dir / 'metadata.json', 'w', encoding='utf-8') as f: + metadata = self.__create_metadata(char_name, selected_faces, ref_images, mean_vector) + with open(char_out / 'metadata.json', 'w', encoding='utf-8') as f: json.dump(metadata, f, indent=2, ensure_ascii=False) - def __create_reference_metadata( - self, - char_name: str, - selected_faces: List[FaceData], - reference_images: List[Path], - mean_vector: np.ndarray, - ) -> Dict[str, Any]: - total_faces_detected = [ - len(faces_list) for faces_list in self.__detect_faces_in_references(reference_images) - ] - - similarities = [] - if len(selected_faces) > 1: - for i in range(len(selected_faces) - 1): - similarity = np.dot(selected_faces[i].face_vector, selected_faces[i + 1].face_vector) - similarities.append(similarity) - + def __create_metadata(self, name: str, faces: List[FaceData], refs: List[Path], mean_vec: np.ndarray) -> Dict[ + str, Any, + ]: return { - 'character_name': char_name.replace('_', ' ').title(), - 'source_images': [str(img) for img in reference_images], + 'character_name': name.replace('_', ' ').title(), + 'source_images': [str(img) for img in refs], 'processed_at': datetime.now().isoformat(), 'processing_params': { - 'similarity_threshold': 
self.similarity_threshold, + 'similarity_threshold': self.__similarity_threshold, 'face_model': settings.face_recognition.model_name, - 'normalized_face_size': list(settings.character.normalized_face_size), - }, - 'detection_stats': { - 'total_faces_detected': total_faces_detected, - 'candidates_found': 1, - 'selection_method': 'automatic' if len(selected_faces) == len(reference_images) else 'manual', }, - 'selected_face_indices': [face.source_image_idx for face in selected_faces], - 'average_similarity': float(np.mean(similarities)) if similarities else 1.0, - 'face_vector_dim': int(mean_vector.shape[0]), + 'selected_face_indices': [f.source_image_idx for f in faces], + 'face_vector_dim': int(mean_vec.shape[0]), } diff --git a/preprocessor/services/search/clients/elasticsearch_queries.py b/preprocessor/services/search/clients/elasticsearch_queries.py index 3c55cd36c..0ed25794f 100644 --- a/preprocessor/services/search/clients/elasticsearch_queries.py +++ b/preprocessor/services/search/clients/elasticsearch_queries.py @@ -12,10 +12,9 @@ class ElasticsearchQueries: - def __init__(self, embedding_service: EmbeddingService, index_base: str) -> None: - self._embedding_service = embedding_service - self._index_base = index_base + self.__embedding_service = embedding_service + self.__index_base = index_base async def get_stats(self, es_client: AsyncElasticsearch) -> Dict[str, int]: return { @@ -26,90 +25,29 @@ async def get_stats(self, es_client: AsyncElasticsearch) -> Dict[str, int]: } async def list_characters(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: - result = await es_client.search( - index=self.__video_frames_index, - size=0, - aggs={ - 'characters_nested': { - 'nested': {'path': 'character_appearances'}, - 'aggs': { - 'character_names': { - 'terms': {'field': 'character_appearances.name', 'size': 1000}, - }, - }, - }, - }, - ) - buckets = result['aggregations']['characters_nested']['character_names']['buckets'] - return [(b['key'], 
b['doc_count']) for b in buckets] + return await self.__list_nested_terms(es_client, self.__video_frames_index, 'character_appearances', 'name') async def list_objects(self, es_client: AsyncElasticsearch) -> List[Tuple[str, int]]: - result = await es_client.search( - index=self.__video_frames_index, - size=0, - aggs={ - 'objects_nested': { - 'nested': {'path': 'detected_objects'}, - 'aggs': { - 'object_classes': { - 'terms': {'field': 'detected_objects.class', 'size': 1000}, - }, - }, - }, - }, - ) - buckets = result['aggregations']['objects_nested']['object_classes']['buckets'] - return [(b['key'], b['doc_count']) for b in buckets] - - async def search_by_character( - self, - es_client: AsyncElasticsearch, - character: str, - season: Optional[int]=None, - episode: Optional[int]=None, - limit: int=20, - ) -> Dict[str, Any]: - must_clauses = [{ - 'nested': { - 'path': 'character_appearances', - 'query': {'term': {'character_appearances.name': character}}, - }, - }] - must_clauses.extend(self.__build_episode_filters(season, episode)) - return await es_client.search( - index=self.__video_frames_index, - query={'bool': {'must': must_clauses}}, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'video_path', - 'episode_metadata', 'character_appearances', 'scene_info', - ], - ) + return await self.__list_nested_terms(es_client, self.__video_frames_index, 'detected_objects', 'class') async def search_by_emotion( - self, - es_client: AsyncElasticsearch, - emotion: str, - season: Optional[int]=None, - episode: Optional[int]=None, - character: Optional[str]=None, - limit: int=20, + self, + es_client: AsyncElasticsearch, + emotion: str, + season: Optional[int] = None, + episode: Optional[int] = None, + character: Optional[str] = None, + limit: int = 20, ) -> Dict[str, Any]: nested_must = [{'term': {'character_appearances.emotion.label': emotion}}] if character: nested_must.append({'term': {'character_appearances.name': character}}) + must_clauses = 
[{'nested': {'path': 'character_appearances', 'query': {'bool': {'must': nested_must}}}}] must_clauses.extend(self.__build_episode_filters(season, episode)) - nested_filter: Dict[str, Any] = {'term': {'character_appearances.emotion.label': emotion}} - if character: - nested_filter = { - 'bool': { - 'must': [ - {'term': {'character_appearances.emotion.label': emotion}}, - {'term': {'character_appearances.name': character}}, - ], - }, - } + + nested_filter = self.__build_nested_filter(emotion, character) + return await es_client.search( index=self.__video_frames_index, query={'bool': {'must': must_clauses}}, @@ -127,294 +65,44 @@ async def search_by_emotion( ], ) - async def search_by_object( - self, - es_client: AsyncElasticsearch, - object_query: str, - season: Optional[int] = None, - episode: Optional[int] = None, - limit: int = 20, - ) -> Dict[str, Any]: - filter_clauses = self.__build_episode_filters(season, episode) - object_class, count_filter = self.__parse_object_query(object_query) - must_clauses = [self.__build_object_nested_query(object_class, count_filter)] - query_body = {'bool': {'must': must_clauses, 'filter': filter_clauses}} - - return await es_client.search( - index=self.__video_frames_index, - query=query_body, - sort=[self.__build_object_sort(object_class)], - track_scores=True, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'detected_objects', 'character_appearances', - 'video_path', 'episode_metadata', 'scene_info', - ], - ) - - @staticmethod - def __parse_object_query(object_query: str) -> Tuple[str, Optional[str]]: - if ':' not in object_query: - return object_query.strip(), None - object_class, count_filter = object_query.split(':', 1) - return object_class.strip(), count_filter - - @staticmethod - def __build_object_nested_query(object_class: str, count_filter: Optional[str]) -> Dict[str, Any]: - if count_filter is None: - return { - 'nested': { - 'path': 'detected_objects', - 'query': {'term': 
{'detected_objects.class': object_class}}, - }, - } - - if count_filter.endswith('+'): - min_count = int(count_filter[:-1]) - return { - 'nested': { - 'path': 'detected_objects', - 'query': { - 'bool': { - 'must': [ - {'term': {'detected_objects.class': object_class}}, - {'range': {'detected_objects.count': {'gte': min_count}}}, - ], - }, - }, - }, - } - - if '-' in count_filter: - min_count, max_count = count_filter.split('-') - return { - 'nested': { - 'path': 'detected_objects', - 'query': { - 'bool': { - 'must': [ - {'term': {'detected_objects.class': object_class}}, - {'range': {'detected_objects.count': {'gte': int(min_count), 'lte': int(max_count)}}}, - ], - }, - }, - }, - } - - exact_count = int(count_filter) - return { - 'nested': { - 'path': 'detected_objects', - 'query': { - 'bool': { - 'must': [ - {'term': {'detected_objects.class': object_class}}, - {'term': {'detected_objects.count': exact_count}}, - ], - }, - }, - }, - } - - @staticmethod - def __build_object_sort(object_class: str) -> Dict[str, Any]: - return { - 'detected_objects.count': { - 'order': 'desc', - 'nested': { - 'path': 'detected_objects', - 'filter': {'term': {'detected_objects.class': object_class}}, - }, - }, - } - - async def search_episode_name( - self, - es_client: AsyncElasticsearch, - query: str, - season: Optional[int]=None, - limit: int=20, - ) -> Dict[str, Any]: - must_clauses = [ - {'multi_match': {'query': query, 'fields': ['title^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, - ] - if season is not None: - must_clauses.append({'term': {'episode_metadata.season': season}}) - query_body = {'bool': {'must': must_clauses}} - return await es_client.search( - index=self.__episode_names_index, - query=query_body, - size=limit, - _source=['episode_id', 'title', 'video_path', 'episode_metadata'], - ) - - async def search_episode_name_semantic( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int]=None, - limit: int=10, - ) -> Dict[str, Any]: - 
embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = [] - if season is not None: - filter_clauses.append({'term': {'episode_metadata.season': season}}) - knn_query: Dict[str, Any] = { - 'field': 'title_embedding', - 'query_vector': embedding, - 'k': limit, - 'num_candidates': limit * 10, - } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( - index=self.__episode_names_index, - knn=knn_query, - size=limit, - _source=['episode_id', 'title', 'video_path', 'episode_metadata'], - ) - - async def search_perceptual_hash( - self, - es_client: AsyncElasticsearch, - phash: str, - limit: int=10, - ) -> Dict[str, Any]: - return await es_client.search( - index=self.__video_frames_index, - query={'term': {'perceptual_hash': phash}}, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'video_path', - 'episode_metadata', 'perceptual_hash', 'scene_info', - ], - ) - - async def search_text_query( - self, - es_client: AsyncElasticsearch, - query: str, - season: Optional[int]=None, - episode: Optional[int]=None, - limit: int=20, - ) -> Dict[str, Any]: - must_clauses = [ - {'multi_match': {'query': query, 'fields': ['text^2', 'episode_metadata.title'], 'fuzziness': 'AUTO'}}, - ] - must_clauses.extend(self.__build_episode_filters(season, episode)) - query_body = {'bool': {'must': must_clauses}} - return await es_client.search( - index=self.__segments_index, - query=query_body, - size=limit, - _source=[ - 'episode_id', 'segment_id', 'text', 'start_time', 'end_time', 'speaker', - 'video_path', 'episode_metadata', 'scene_info', - ], - ) - - async def search_text_semantic( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int]=None, - episode: Optional[int]=None, - limit: int=10, + async def search_video_semantic( + self, + es_client: AsyncElasticsearch, + image_path: str, + season: Optional[int] = None, + episode: Optional[int] = None, + character: Optional[str] = None, + limit: 
int = 10, ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = self.__build_episode_filters(season, episode) - knn_query: Dict[str, Any] = { - 'field': 'text_embedding', - 'query_vector': embedding, - 'k': limit, - 'num_candidates': limit * 10, - } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( - index=self.__text_embeddings_index, - knn=knn_query, - size=limit, - _source=[ - 'episode_id', 'embedding_id', 'text', 'segment_range', - 'video_path', 'episode_metadata', 'scene_info', - ], + embedding = self.__embedding_service.get_image_embedding(image_path) + return await self.__execute_knn_query( + es_client, self.__video_frames_index, 'video_embedding', embedding, + limit, season, episode, character, ) - async def search_text_to_video( - self, - es_client: AsyncElasticsearch, - text: str, - season: Optional[int]=None, - episode: Optional[int]=None, - character: Optional[str]=None, - limit: int=10, + async def __execute_knn_query( + self, es_client: AsyncElasticsearch, index: str, field: str, vector: List[float], + limit: int, season: Optional[int], episode: Optional[int], character: Optional[str] = None, ) -> Dict[str, Any]: - embedding = self._embedding_service.get_text_embedding(text) - filter_clauses = self.__build_episode_filters(season, episode) + filters = self.__build_episode_filters(season, episode) if character: - filter_clauses.append({ + filters.append({ 'nested': { 'path': 'character_appearances', 'query': {'term': {'character_appearances.name': character}}, }, }) - knn_query: Dict[str, Any] = { - 'field': 'video_embedding', - 'query_vector': embedding, - 'k': limit, - 'num_candidates': limit * 10, - } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( - index=self.__video_frames_index, - knn=knn_query, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', - 
'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', - ], - ) - async def search_video_semantic( - self, - es_client: AsyncElasticsearch, - image_path: str, - season: Optional[int]=None, - episode: Optional[int]=None, - character: Optional[str]=None, - limit: int=10, - ) -> Dict[str, Any]: - embedding = self._embedding_service.get_image_embedding(image_path) - filter_clauses = self.__build_episode_filters(season, episode) - if character: - filter_clauses.append({ - 'nested': { - 'path': 'character_appearances', - 'query': {'term': {'character_appearances.name': character}}, - }, - }) - knn_query: Dict[str, Any] = { - 'field': 'video_embedding', - 'query_vector': embedding, + knn = { + 'field': field, + 'query_vector': vector, 'k': limit, 'num_candidates': limit * 10, + 'filter': filters if filters else None, } - if filter_clauses: - knn_query['filter'] = filter_clauses - return await es_client.search( - index=self.__video_frames_index, - knn=knn_query, - size=limit, - _source=[ - 'episode_id', 'frame_number', 'timestamp', 'frame_type', 'scene_number', - 'perceptual_hash', 'video_path', 'episode_metadata', 'character_appearances', 'scene_info', - ], - ) + return await es_client.search(index=index, knn=knn, size=limit) - @staticmethod - def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: + def __build_episode_filters(self, season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: filters = [] if season is not None: filters.append({'term': {'episode_metadata.season': season}}) @@ -422,18 +110,49 @@ def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> Li filters.append({'term': {'episode_metadata.episode_number': episode}}) return filters + @staticmethod + def __build_nested_filter(emotion: str, character: Optional[str]) -> Dict[str, Any]: + if not character: + return {'term': {'character_appearances.emotion.label': emotion}} + return { + 
'bool': { + 'must': [ + {'term': {'character_appearances.emotion.label': emotion}}, + {'term': {'character_appearances.name': character}}, + ], + }, + } + + async def __list_nested_terms(self, es_client: AsyncElasticsearch, index: str, path: str, field: str) -> List[ + Tuple[str, int] + ]: + result = await es_client.search( + index=index, + size=0, + aggs={ + 'nested_path': { + 'nested': {'path': path}, + 'aggs': { + 'terms_agg': {'terms': {'field': f'{path}.{field}', 'size': 1000}}, + }, + }, + }, + ) + buckets = result['aggregations']['nested_path']['terms_agg']['buckets'] + return [(b['key'], b['doc_count']) for b in buckets] + @property def __episode_names_index(self) -> str: - return f'{self._index_base}_episode_names' + return f'{self.__index_base}_episode_names' @property def __segments_index(self) -> str: - return f'{self._index_base}_text_segments' + return f'{self.__index_base}_text_segments' @property def __text_embeddings_index(self) -> str: - return f'{self._index_base}_text_embeddings' + return f'{self.__index_base}_text_embeddings' @property def __video_frames_index(self) -> str: - return f'{self._index_base}_video_frames' + return f'{self.__index_base}_video_frames' diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index 530bca286..0eedc3b1c 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -1,5 +1,7 @@ from pathlib import Path from typing import ( + Any, + Dict, List, Optional, Tuple, @@ -18,53 +20,64 @@ class EmbeddingService: - def __init__(self) -> None: - self._model: Optional[AutoModelForVision2Seq] = None - self._processor: Optional[AutoProcessor] = None - self._device: Optional[str] = None + self.__model: Optional[AutoModelForVision2Seq] = None + self.__processor: Optional[AutoProcessor] = None + self.__device: str = 'cuda' def cleanup(self) -> None: - if self._model is not 
None: - del self._model - del self._processor - self._model = None - self._processor = None + if self.__model is not None: + del self.__model + del self.__processor + self.__model = self.__processor = None if torch.cuda.is_available(): torch.cuda.empty_cache() def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: - model, processor, device = self._load_model() - messages = [{'role': 'user', 'content': [{'type': 'image', 'image': str(image_path)}, {'type': 'text', 'text': 'Describe this image.'}]}] - text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + model, processor, device = self.__get_model() + messages = [{ + 'role': 'user', 'content': [ + {'type': 'image', 'image': str(image_path)}, + {'type': 'text', 'text': 'Describe this image.'}, + ], + }] + image_inputs, video_inputs = process_vision_info(messages) - inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors='pt') - inputs = inputs.to(device) - with torch.no_grad(): - output = model(**inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - return embedding.float().cpu().numpy().tolist() + prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + + inputs = processor( + text=[prompt], images=image_inputs, videos=video_inputs, padding=True, + return_tensors='pt', + ).to(device) + return self.__compute_normalized_embedding(model, inputs) def get_text_embedding(self, text: str) -> List[float]: - model, processor, device = self._load_model() + model, processor, device = self.__get_model() messages = [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] - text_inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_tensors='pt').to(device) + + inputs = processor.apply_chat_template( + messages, 
add_generation_prompt=True, tokenize=True, + return_tensors='pt', + ).to(device) + return self.__compute_normalized_embedding(model, {'input_ids': inputs}) + + def __compute_normalized_embedding(self, model: Any, inputs: Dict[str, Any]) -> List[float]: with torch.no_grad(): - output = model(input_ids=text_inputs, output_hidden_states=True) + output = model(**inputs, output_hidden_states=True) embedding = output.hidden_states[-1][:, -1, :].squeeze(0) embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) return embedding.float().cpu().numpy().tolist() - def _load_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: - if self._model is not None: - return self._model, self._processor, self._device - click.echo('Loading embedding model...', err=True) + def __get_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: + if self.__model is None: + self.__load_resources() + return self.__model, self.__processor, self.__device + + def __load_resources(self) -> None: + click.echo('Loading Qwen-VL embedding model...', err=True) if not torch.cuda.is_available(): - raise RuntimeError('CUDA is required but not available. 
This pipeline requires GPU.') + raise RuntimeError('CUDA required for multimodal embeddings.') + model_name = settings.embedding_model.model_name - self._device = 'cuda' - self._model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') - self._processor = AutoProcessor.from_pretrained(model_name) - click.echo(f'Model loaded on {self._device}', err=True) - return self._model, self._processor, self._device + self.__model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') + self.__processor = AutoProcessor.from_pretrained(model_name) diff --git a/preprocessor/services/search/clients/hash_service.py b/preprocessor/services/search/clients/hash_service.py index a37ce1db4..8f3c9b888 100644 --- a/preprocessor/services/search/clients/hash_service.py +++ b/preprocessor/services/search/clients/hash_service.py @@ -12,29 +12,24 @@ class HashService: - def __init__(self) -> None: - self._hasher: Optional[PerceptualHasher] = None - - def cleanup(self) -> None: - if self._hasher is not None: - del self._hasher - self._hasher = None - if torch.cuda.is_available(): - torch.cuda.empty_cache() + self.__hasher: Optional[PerceptualHasher] = None def get_perceptual_hash(self, image_path: Union[str, Path]) -> Optional[str]: - hasher = self.__load_hasher() - image = Image.open(image_path).convert('RGB') - hashes = hasher.compute_phash_batch([image]) # pylint: disable=no-member + hasher = self.__get_hasher() + with Image.open(image_path) as img: + rgb_img = img.convert('RGB') + hashes = hasher.compute_phash_batch([rgb_img]) return hashes[0] if hashes else None - def __load_hasher(self) -> PerceptualHasher: - if self._hasher is not None: - return self._hasher - click.echo('Loading perceptual hasher...', err=True) - if not torch.cuda.is_available(): - raise RuntimeError('CUDA is required but not available. 
This pipeline requires GPU.') - self._hasher = PerceptualHasher(device='cuda', hash_size=8) # pylint: disable=unexpected-keyword-arg - click.echo('Hasher loaded on cuda', err=True) - return self._hasher + def __get_hasher(self) -> PerceptualHasher: + if self.__hasher is None: + click.echo('Loading perceptual hasher...', err=True) + self.__hasher = PerceptualHasher(device='cuda', hash_size=8) + return self.__hasher + + def cleanup(self) -> None: + if self.__hasher: + self.__hasher = None + if torch.cuda.is_available(): + torch.cuda.empty_cache() diff --git a/preprocessor/services/search/clients/result_formatters.py b/preprocessor/services/search/clients/result_formatters.py index ba5382ee7..384340ac0 100644 --- a/preprocessor/services/search/clients/result_formatters.py +++ b/preprocessor/services/search/clients/result_formatters.py @@ -14,85 +14,52 @@ class ResultFormatter: - @staticmethod def format_timestamp(seconds: float) -> str: - minutes = int(seconds // 60) - secs = seconds % 60 - return f'{minutes}m {secs:.1f}s' + return f'{int(seconds // 60)}m {seconds % 60:.1f}s' @staticmethod - def print_results(result: Dict[str, Any], result_type: str='text') -> None: - total = result[ElasticsearchKeys.HITS][ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] - hits = result[ElasticsearchKeys.HITS][ElasticsearchKeys.HITS] - click.echo(f'\nZnaleziono: {total} wynikow') + def print_results(result: Dict[str, Any], result_type: str = 'text') -> None: + hits_data = result[ElasticsearchKeys.HITS] + total = hits_data[ElasticsearchKeys.TOTAL][ElasticsearchAggregationKeys.VALUE] + hits = hits_data[ElasticsearchKeys.HITS] + + click.echo(f'\nResults found: {total}') click.echo('=' * 80) + for i, hit in enumerate(hits, 1): source = hit[ElasticsearchKeys.SOURCE] - score = hit[ElasticsearchKeys.SCORE] meta = source[EpisodeMetadataKeys.EPISODE_METADATA] - scene_ctx = ResultFormatter.__format_scene_context(source.get('scene_info')) - click.echo(f'\n[{i}] Score: {score:.2f}') - 
season_code = 'S00' if meta['season'] == 0 else f"S{meta['season']:02d}" - click.echo(f"Episode: {season_code}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") - if result_type == 'text': - ResultFormatter.__print_text_result(source, scene_ctx) - elif result_type == 'text_semantic': - click.echo(f"Segments: {source['segment_range'][0]}-{source['segment_range'][1]}{scene_ctx}") - click.echo(f"Embedding ID: {source.get('embedding_id', 'N/A')}") - click.echo(f"Text: {source['text']}") - elif result_type == 'episode_name': - click.echo(f"Episode Title: {source.get('title', 'N/A')}") - else: - ResultFormatter.__print_video_result(source, scene_ctx) - click.echo(f"Path: {source['video_path']}") - @staticmethod - def __format_character_appearances(appearances: list) -> str: - chars_strs = [] - for char in appearances: - char_str = char.get('name', 'Unknown') - if char.get('emotion'): - emotion_label = char['emotion'].get('label', '?') - emotion_conf = char['emotion'].get('confidence', 0) - char_str += f' ({emotion_label} {emotion_conf:.2f})' - chars_strs.append(char_str) - return ', '.join(chars_strs) + click.echo(f'\n[{i}] Score: {hit[ElasticsearchKeys.SCORE]:.2f}') + click.echo(f"Episode: S{meta['season']:02d}E{meta['episode_number']:02d} - {meta.get('title', 'N/A')}") - @staticmethod - def __format_detected_objects(objects: list) -> str: - return ', '.join([f"{obj['class']}:{obj['count']}" for obj in objects]) + ResultFormatter.__print_specific_content(source, result_type) + click.echo(f"Video: {source['video_path']}") @staticmethod - def __format_scene_context(scene_info: Optional[Dict[str, Any]]) -> str: - if not scene_info: - return '' - start = ResultFormatter.format_timestamp(scene_info.get('scene_start_time', 0)) - end = ResultFormatter.format_timestamp(scene_info.get('scene_end_time', 0)) - return f" [Scene {scene_info.get('scene_number', '?')}: {start} - {end}]" + def __print_specific_content(source: Dict[str, Any], r_type: str) -> None: + scene_ctx 
= ResultFormatter.__get_scene_ctx(source.get('scene_info')) + + if r_type == 'text': + click.echo( + f"Time: {ResultFormatter.format_timestamp(source['start_time'])} - {ResultFormatter.format_timestamp(source['end_time'])}{scene_ctx}", + ) + click.echo(f"Speaker: {source.get('speaker', 'N/A')}\nText: {source['text']}") + elif r_type == 'text_semantic': + click.echo(f"Range: {source['segment_range']}{scene_ctx}\nText: {source['text']}") + else: + ts = ResultFormatter.format_timestamp(source['timestamp']) + click.echo(f"Frame: {source['frame_number']} @ {ts}{scene_ctx}") + if source.get('character_appearances'): + click.echo(f"Characters: {ResultFormatter.__fmt_chars(source['character_appearances'])}") @staticmethod - def __print_text_result(source: Dict[str, Any], scene_ctx: str) -> None: - click.echo(f"Segment ID: {source.get('segment_id', 'N/A')}") - start_time = ResultFormatter.format_timestamp(source['start_time']) - end_time = ResultFormatter.format_timestamp(source['end_time']) - click.echo(f'Time: {start_time} - {end_time}{scene_ctx}') - click.echo(f"Speaker: {source.get('speaker', 'N/A')}") - click.echo(f"Text: {source['text']}") + def __get_scene_ctx(info: Optional[Dict[str, Any]]) -> str: + if not info: + return '' + return f" [Scene {info.get('scene_number')}: {ResultFormatter.format_timestamp(info.get('scene_start_time', 0))}]" @staticmethod - def __print_video_result(source: Dict[str, Any], scene_ctx: str) -> None: - timestamp = ResultFormatter.format_timestamp(source['timestamp']) - click.echo(f"Frame: {source['frame_number']} @ {timestamp}{scene_ctx}") - if 'frame_type' in source: - click.echo(f"Type: {source['frame_type']}") - if 'scene_number' in source: - click.echo(f"Scene number: {source['scene_number']}") - if 'perceptual_hash' in source: - click.echo(f"Hash: {source['perceptual_hash']}") - if source.get('character_appearances'): - chars = ResultFormatter.__format_character_appearances(source['character_appearances']) - click.echo(f"Characters: 
{chars}") - if source.get('detected_objects'): - objects = ResultFormatter.__format_detected_objects(source['detected_objects']) - click.echo(f'Objects: {objects}') + def __fmt_chars(appearances: list) -> str: + return ', '.join([f"{c['name']} ({c['emotion']['label']})" for c in appearances if 'emotion' in c]) diff --git a/preprocessor/services/search/elasticsearch.py b/preprocessor/services/search/elasticsearch.py index 9a2f6f46b..6390d4a82 100644 --- a/preprocessor/services/search/elasticsearch.py +++ b/preprocessor/services/search/elasticsearch.py @@ -10,52 +10,75 @@ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + class ElasticsearchWrapper: + def __init__( + self, + index_name: str, + host: str = 'localhost:9200', + dry_run: bool = False, + ) -> None: + self.__index_name = index_name + self.__host = host + self.__dry_run = dry_run + self.__client: Optional[AsyncElasticsearch] = None - def __init__(self, index_name: str, host: str='localhost:9200', dry_run: bool=False) -> None: - self.index_name: str = index_name - self.host: str = host - self.dry_run: bool = dry_run - self._client: Optional[AsyncElasticsearch] = None + @property + def index_name(self) -> str: + return self.__index_name async def bulk_index(self, documents: List[Dict[str, Any]]) -> Dict[str, Any]: - if self.dry_run: + if self.__dry_run: return {'indexed': len(documents), 'errors': []} - client = await self._get_client() - actions = [] - for doc in documents: - actions.append({'index': {'_index': self.index_name}}) - actions.append(doc) + + client = await self.__ensure_client() + actions = self.__build_bulk_actions(documents) + try: response = await client.bulk(operations=actions) return response except Exception as e: - return {'errors': str(e)} - - async def close(self) -> None: - if self._client is not None: - await self._client.close() - self._client = None + return {'errors': [str(e)]} async def create_index(self, mapping: Dict[str, Any]) -> None: - if self.dry_run: + 
if self.__dry_run: return - client = await self._get_client() - await client.indices.create(index=self.index_name, body=mapping) + + client = await self.__ensure_client() + await client.indices.create(index=self.__index_name, body=mapping) async def delete_index(self) -> None: - if self.dry_run: + if self.__dry_run: return - client = await self._get_client() - await client.indices.delete(index=self.index_name, ignore=[404]) + + client = await self.__ensure_client() + if await client.indices.exists(index=self.__index_name): + await client.indices.delete(index=self.__index_name) async def index_exists(self) -> bool: - if self.dry_run: + if self.__dry_run: return False - client = await self._get_client() - return await client.indices.exists(index=self.index_name) - async def _get_client(self) -> AsyncElasticsearch: - if self._client is None: - self._client = AsyncElasticsearch([self.host], verify_certs=False, ssl_show_warn=False) - return self._client + client = await self.__ensure_client() + return await client.indices.exists(index=self.__index_name) + + async def close(self) -> None: + if self.__client is not None: + await self.__client.close() + self.__client = None + + async def __ensure_client(self) -> AsyncElasticsearch: + if self.__client is None: + self.__client = AsyncElasticsearch( + [self.__host], + verify_certs=False, + ssl_show_warn=False, + ) + return self.__client + + def __build_bulk_actions(self, documents: List[Dict[str, Any]]) -> List[Any]: + actions = [] + for doc in documents: + actions.append({'index': {'_index': self.__index_name}}) + actions.append(doc) + return actions diff --git a/preprocessor/services/search/embedding_model.py b/preprocessor/services/search/embedding_model.py index 2d0c9bb3f..3368f75d2 100644 --- a/preprocessor/services/search/embedding_model.py +++ b/preprocessor/services/search/embedding_model.py @@ -3,18 +3,42 @@ Union, ) +import numpy as np + from preprocessor.services.search.clients.embedding_service import 
EmbeddingService class EmbeddingModelWrapper: + def __init__( + self, + model_name: str, + device: str = 'cuda', + batch_size: int = 8, + ) -> None: + self.__model_name = model_name # pylint: disable=unused-private-member + self.__device = device # pylint: disable=unused-private-member + self.__batch_size = batch_size # pylint: disable=unused-private-member + self.__service = EmbeddingService() + self.__loaded = False # pylint: disable=unused-private-member + + def load_model(self) -> None: + self.__loaded = True # pylint: disable=unused-private-member - def __init__(self, model_name: str, device: str='cuda', batch_size: int=8) -> None: - self.model_name: str = model_name - self.device: str = device - self.batch_size: int = batch_size - self._service = EmbeddingService() + def cleanup(self) -> None: + self.__loaded = False # pylint: disable=unused-private-member def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]: if isinstance(text, str): - return self._service.get_text_embedding(text) - return [self._service.get_text_embedding(t) for t in text] + return self.__service.get_text_embedding(text) + + return self.__process_batch_encoding(text) + + def encode_images(self, image_paths: List[str]) -> List[np.ndarray]: + embeddings: List[np.ndarray] = [] + for path in image_paths: + embedding = self.__service.get_image_embedding(path) + embeddings.append(np.array(embedding)) + return embeddings + + def __process_batch_encoding(self, texts: List[str]) -> List[List[float]]: + return [self.__service.get_text_embedding(t) for t in texts] diff --git a/preprocessor/services/text/import_step.py b/preprocessor/services/text/import_step.py index d9e599aec..b9bb30127 100644 --- a/preprocessor/services/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -19,53 +19,124 @@ if TYPE_CHECKING: from preprocessor.services.episodes.episode_manager import EpisodeInfo -class TranscriptionImportStep(PipelineStep[None, 
List[TranscriptionData], TranscriptionImportConfig]): +class TranscriptionImportStep(PipelineStep[None, List[TranscriptionData], TranscriptionImportConfig]): def __init__(self, config: TranscriptionImportConfig) -> None: super().__init__(config) - self._episode_manager: Optional[EpisodeManager] = None + self.__episode_manager: Optional[EpisodeManager] = None + + @property + def name(self) -> str: + return 'transcription_import' def execute(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: - if self._episode_manager is None: - self._episode_manager = EpisodeManager(None, context.series_name, context.logger) - json_files: List[Path] = self.__find_transcription_files() + self.__ensure_episode_manager(context) + + json_files = self.__find_transcription_files() if not json_files: context.logger.warning(f'No transcription files found in {self.config.source_dir}') return [] + context.logger.info(f'Found {len(json_files)} transcription files to import') + results: List[TranscriptionData] = [] for json_file in json_files: try: - artifact: Optional[TranscriptionData] = self.__import_single_file(json_file, context) + artifact = self.__import_single_file(json_file, context) if artifact: results.append(artifact) except Exception as e: context.logger.error(f'Failed to import {json_file.name}: {e}') + return results - @property - def name(self) -> str: - return 'transcription_import' + def __ensure_episode_manager(self, context: ExecutionContext) -> None: + if self.__episode_manager is None: + self.__episode_manager = EpisodeManager(None, context.series_name, context.logger) + + def __find_transcription_files(self) -> List[Path]: + pattern = '*_segmented.json' if self.config.format_type == '11labs_segmented' else '*.json' + files = sorted(self.config.source_dir.rglob(pattern)) + return [f for f in files if not f.name.startswith('.')] + + def __import_single_file(self, json_file: Path, context: ExecutionContext) -> Optional[TranscriptionData]: + 
episode_info = self.__resolve_episode_info(json_file) + if not episode_info: + context.logger.warning(f'Could not determine episode for {json_file}') + return None + + episode_id = self.__episode_manager.get_episode_id_for_state(episode_info) + output_path = self.__get_output_path(episode_info, context) + + if self.__should_skip_import(output_path, episode_id, context): + return self.__construct_cached_artifact(episode_id, episode_info, output_path) + + context.logger.info(f'Importing {episode_id} from {json_file.name}') + context.mark_step_started(self.name, episode_id) + + source_data = self.__load_json(json_file) + converted_data = self.__convert_data(source_data, json_file) + converted_data['episode_info'] = EpisodeManager.get_metadata(episode_info) + + self.__save_converted_data(output_path, converted_data) + context.mark_step_completed(self.name, episode_id) + + return self.__construct_new_artifact(episode_id, episode_info, output_path, converted_data) + + def __resolve_episode_info(self, json_file: Path) -> Optional['EpisodeInfo']: + info = self.__episode_manager.parse_filename(json_file) + if not info: + season, episode = self.__extract_season_episode_fallback(json_file) + info = self.__episode_manager.get_episode_by_season_and_relative(season, episode) + return info + + def __convert_data(self, data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: + if self.config.format_type == '11labs_segmented': + return self.__convert_11labs_segmented(data, source_file) + if self.config.format_type == '11labs': + return self.__convert_11labs_full(data, source_file) + raise ValueError(f'Unknown format type: {self.config.format_type}') + + def __should_skip_import(self, output_path: Path, episode_id: str, context: ExecutionContext) -> bool: + if output_path.exists() and not context.force_rerun: + context.logger.info(f'Skipping {episode_id} (output exists)') + if not context.is_step_completed(self.name, episode_id): + context.mark_step_completed(self.name, 
episode_id) + return True + return False + + def __get_output_path(self, episode_info: 'EpisodeInfo', context: ExecutionContext) -> Path: + filename = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') + return context.get_output_path(episode_info, 'transcriptions', filename) @staticmethod def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: segments: List[Dict[str, Any]] = [] - words: List[Dict[str, Any]] = data.get('words', []) - current_segment: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} + words = data.get('words', []) + current_seg: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} + for word in words: - if current_segment['start'] is None: - current_segment['start'] = word.get('start') - current_segment['words'].append(word) - current_segment['end'] = word.get('end') - if word.get('text', '').endswith(('.', '!', '?')) or len(current_segment['words']) >= 20: - current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) - segments.append(dict(current_segment)) - current_segment = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': word.get('speaker_id', 'unknown')} - if current_segment['words']: - current_segment['text'] = ' '.join((w.get('text', '') for w in current_segment['words'])) - segments.append(current_segment) + if current_seg['start'] is None: + current_seg['start'] = word.get('start') + + current_seg['words'].append(word) + current_seg['end'] = word.get('end') + + if word.get('text', '').endswith(('.', '!', '?')) or len(current_seg['words']) >= 20: + current_seg['text'] = ' '.join(w.get('text', '') for w in current_seg['words']) + segments.append(dict(current_seg)) + current_seg = { + 'words': [], 'start': None, 'end': None, 'text': '', + 'speaker': word.get('speaker_id', 'unknown'), + } + + if current_seg['words']: + current_seg['text'] = ' 
'.join(w.get('text', '') for w in current_seg['words']) + segments.append(current_seg) + for i, seg in enumerate(segments): seg['id'] = i + return { 'transcription': { 'format': '11labs', @@ -78,17 +149,16 @@ def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, @staticmethod def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: - segments: List[Dict[str, Any]] = [] + segments = [] for i, segment in enumerate(data.get('segments', [])): - converted_segment: Dict[str, Any] = { + segments.append({ 'id': i, 'start': segment.get('start'), 'end': segment.get('end'), 'text': segment.get('text', ''), 'speaker': segment.get('speaker', 'unknown'), 'words': segment.get('words', []), - } - segments.append(converted_segment) + }) return { 'transcription': {'format': '11labs_segmented', 'source_file': source_file.name, 'segments': segments}, 'segments': segments, @@ -96,60 +166,47 @@ def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[ @staticmethod def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: - match: Optional[re.Match] = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) + match = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) if match: return int(match.group(1)), int(match.group(2)) - parent_match: Optional[re.Match] = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) + + parent_match = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) if parent_match: - season: int = int(parent_match.group(1)) - episode_match: Optional[re.Match] = re.search('E(\\d+)', file_path.name, re.IGNORECASE) + season = int(parent_match.group(1)) + episode_match = re.search('E(\\d+)', file_path.name, re.IGNORECASE) if episode_match: return season, int(episode_match.group(1)) return 1, 1 - def __find_transcription_files(self) -> List[Path]: - pattern: str = '*.json' - if self.config.format_type == '11labs_segmented': - pattern = '*_segmented.json' 
- files: List[Path] = sorted(self.config.source_dir.rglob(pattern)) - return [f for f in files if not f.name.startswith('.')] + @staticmethod + def __load_json(file_path: Path) -> Dict[str, Any]: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) - def __import_single_file(self, json_file: Path, context: ExecutionContext) -> Optional[TranscriptionData]: - episode_info: Optional['EpisodeInfo'] = self._episode_manager.parse_filename(json_file) - if not episode_info: - season_num, episode_num = self.__extract_season_episode_fallback(json_file) - episode_info = self._episode_manager.get_episode_by_season_and_relative(season_num, episode_num) - if not episode_info: - context.logger.warning(f'Could not determine episode for {json_file}') - return None - episode_id: str = self._episode_manager.get_episode_id_for_state(episode_info) - output_filename: str = self._episode_manager.path_manager.build_filename(episode_info, extension='json') - output_path: Path = context.get_output_path(episode_info, 'transcriptions', output_filename) - if output_path.exists() and (not context.force_rerun): - context.logger.info(f'Skipping {episode_id} (output exists)') - if not context.is_step_completed(self.name, episode_id): - context.mark_step_completed(self.name, episode_id) - return TranscriptionData(episode_id=episode_id, episode_info=episode_info, path=output_path, language='pl', model='11labs', format='json') - context.logger.info(f'Importing {episode_id} from {json_file.name}') - context.mark_step_started(self.name, episode_id) - with open(json_file, 'r', encoding='utf-8') as f: - source_data: Dict[str, Any] = json.load(f) - if self.config.format_type == '11labs_segmented': - converted_data: Dict[str, Any] = self.__convert_11labs_segmented(source_data, json_file) - elif self.config.format_type == '11labs': - converted_data = self.__convert_11labs_full(source_data, json_file) - else: - raise ValueError(f'Unknown format type: {self.config.format_type}') - 
converted_data['episode_info'] = EpisodeManager.get_metadata(episode_info) + @staticmethod + def __save_converted_data(output_path: Path, data: Dict[str, Any]) -> None: output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'w', encoding='utf-8') as f: - json.dump(converted_data, f, indent=2, ensure_ascii=False) - context.mark_step_completed(self.name, episode_id) + json.dump(data, f, indent=2, ensure_ascii=False) + + @staticmethod + def __construct_cached_artifact(episode_id: str, info: 'EpisodeInfo', path: Path) -> TranscriptionData: + return TranscriptionData( + episode_id=episode_id, episode_info=info, path=path, + language='pl', model='11labs', format='json', + ) + + @staticmethod + def __construct_new_artifact( + episode_id: str, info: 'EpisodeInfo', path: Path, + data: Dict[str, Any], + ) -> TranscriptionData: + trans_meta = data.get('transcription', {}) return TranscriptionData( episode_id=episode_id, - episode_info=episode_info, - path=output_path, - language=converted_data.get('transcription', {}).get('language_code', 'pl'), - model=converted_data.get('transcription', {}).get('format', '11labs'), + episode_info=info, + path=path, + language=trans_meta.get('language_code', 'pl'), + model=trans_meta.get('format', '11labs'), format='json', ) diff --git a/preprocessor/services/text/language_config.py b/preprocessor/services/text/language_config.py index 1f61f07b3..83343d61b 100644 --- a/preprocessor/services/text/language_config.py +++ b/preprocessor/services/text/language_config.py @@ -2,27 +2,31 @@ from typing import Set -@dataclass +@dataclass(frozen=True) class LanguageConfig: consonants: Set[str] punctuation: Set[str] special_chars: Set[str] vowels: Set[str] -POLISH_VOWELS = set('aąeęioóuyAĄEĘIOÓUY') -POLISH_CONSONANTS = set('bcćdfghjklłmnńprsśtwzźżBCĆDFGHJKLŁMNŃPRSŚTWZŹŻ') -ENGLISH_VOWELS = set('aeiouAEIOU') -ENGLISH_CONSONANTS = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ') -PUNCTUATION = set('.,;:!?…-—–()[]{}"\'«»„\'\'') 
-SPECIAL_CHARS = set('@#$%^&*+=<>|\\/_~`') + + +_PUNCTUATION = set('.,;:!?…-—–()[]{}"\'«»„\'\'') +_SPECIAL_CHARS = set('@#$%^&*+=<>|\\/_~`') +_ENGLISH_VOWELS = set('aeiouAEIOU') +_ENGLISH_CONSONANTS = set('bcdfghjklmnpqrstvwxyzBCDFGHJKLMNPQRSTVWXYZ') +_POLISH_VOWELS = set('aąeęioóuyAĄEĘIOÓUY') +_POLISH_CONSONANTS = set('bcćdfghjklłmnńprsśtwzźżBCĆDFGHJKLŁMNŃPRSŚTWZŹŻ') + POLISH_CONFIG = LanguageConfig( - vowels=POLISH_VOWELS | ENGLISH_VOWELS, - consonants=POLISH_CONSONANTS | ENGLISH_CONSONANTS, - punctuation=PUNCTUATION, - special_chars=SPECIAL_CHARS, + vowels=_POLISH_VOWELS | _ENGLISH_VOWELS, + consonants=_POLISH_CONSONANTS | _ENGLISH_CONSONANTS, + punctuation=_PUNCTUATION, + special_chars=_SPECIAL_CHARS, ) + ENGLISH_CONFIG = LanguageConfig( - vowels=ENGLISH_VOWELS, - consonants=ENGLISH_CONSONANTS, - punctuation=PUNCTUATION, - special_chars=SPECIAL_CHARS, + vowels=_ENGLISH_VOWELS, + consonants=_ENGLISH_CONSONANTS, + punctuation=_PUNCTUATION, + special_chars=_SPECIAL_CHARS, ) diff --git a/preprocessor/services/text/text_statistics.py b/preprocessor/services/text/text_statistics.py index ba2b86f78..25a48e005 100644 --- a/preprocessor/services/text/text_statistics.py +++ b/preprocessor/services/text/text_statistics.py @@ -14,13 +14,14 @@ from preprocessor.services.text.language_config import ( ENGLISH_CONFIG, POLISH_CONFIG, - LanguageConfig, ) @dataclass class TextStatistics: # pylint: disable=too-many-instance-attributes text: str + language: str = 'pl' + avg_sentence_length: float = 0.0 avg_word_length: float = 0.0 bigrams: List[Dict[str, Any]] = field(default_factory=list) @@ -28,7 +29,6 @@ class TextStatistics: # pylint: disable=too-many-instance-attributes consonants: int = 0 digits: int = 0 empty_lines: int = 0 - language: str = 'pl' letter_frequency: Dict[str, int] = field(default_factory=dict) letters: int = 0 lines: int = 0 @@ -47,11 +47,11 @@ class TextStatistics: # pylint: disable=too-many-instance-attributes words: int = 0 @classmethod - def from_file(cls, 
file_path: Path, language: str='pl') -> 'TextStatistics': + def from_file(cls, file_path: Path, language: str = 'pl') -> 'TextStatistics': with open(file_path, 'r', encoding='utf-8') as f: text = f.read() stats = cls(text=text, language=language) - stats.__calculate() + stats.__process_calculations() return stats def to_dict(self) -> Dict[str, Any]: @@ -85,42 +85,34 @@ def to_dict(self) -> Dict[str, Any]: 'trigrams': self.trigrams, } - def __calculate(self) -> None: # pylint: disable=unused-private-member - self.__calculate_basic_stats() - self.__calculate_character_stats() - self.__calculate_word_stats() - self.__calculate_advanced_stats() - - def __calculate_advanced_stats(self) -> None: - if self.sentences > 0: - self.avg_sentence_length = round(self.words / self.sentences, 2) - words = self.__get_words() - if len(words) >= 2: - bigram_counter = Counter(zip(words[:-1], words[1:])) - self.bigrams = [{'bigram': f'{w1} {w2}', 'count': count} for (w1, w2), count in bigram_counter.most_common(25)] - if len(words) >= 3: - trigram_counter = Counter(zip(words[:-2], words[1:-1], words[2:])) - self.trigrams = [{'trigram': f'{w1} {w2} {w3}', 'count': count} for (w1, w2, w3), count in trigram_counter.most_common(25)] + def __process_calculations(self) -> None: # pylint: disable=unused-private-member + self.__calculate_structural_stats() + self.__calculate_character_distribution() + self.__calculate_lexical_stats() + self.__generate_n_grams() - def __calculate_basic_stats(self) -> None: + def __calculate_structural_stats(self) -> None: lines = self.text.split('\n') self.lines = len(lines) - self.empty_lines = sum((1 for line in lines if not line.strip())) + self.empty_lines = sum(1 for line in lines if not line.strip()) + paragraphs = self.text.split('\n\n') self.paragraphs = len([p for p in paragraphs if p.strip()]) - sentence_pattern = '[.!?…]+(?:\\s|$)' - self.sentences = len(re.findall(sentence_pattern, self.text)) + + self.sentences = len(re.findall(r'[.!?…]+(?:\s|$)', 
self.text)) self.total_chars = len(self.text) self.spaces = self.text.count(' ') + self.text.count('\t') + self.text.count('\n') self.chars_without_spaces = self.total_chars - self.spaces - def __calculate_character_stats(self) -> None: - config = self.__get_config() - letter_counter = Counter() + def __calculate_character_distribution(self) -> None: + config = POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG + letter_counter: Counter = Counter() + for char in self.text: if char.isalpha(): self.letters += 1 - letter_counter[char.lower()] += 1 + char_lower = char.lower() + letter_counter[char_lower] += 1 if char in config.vowels: self.vowels += 1 elif char in config.consonants: @@ -133,21 +125,34 @@ def __calculate_character_stats(self) -> None: self.special_characters += 1 elif not char.isspace(): self.symbols += 1 - self.letter_frequency = dict(sorted(letter_counter.items(), key=lambda x: x[1], reverse=True)) - def __calculate_word_stats(self) -> None: - words = self.__get_words() + self.letter_frequency = dict(letter_counter.most_common()) + + def __calculate_lexical_stats(self) -> None: + words = self.__extract_words() self.words = len(words) + if self.words > 0: word_counter = Counter(words) self.unique_words = len(word_counter) - self.type_token_ratio = round(self.unique_words / self.words, 4) if self.words > 0 else 0.0 - word_lengths = [len(w) for w in words] - self.avg_word_length = round(sum(word_lengths) / len(word_lengths), 2) if word_lengths else 0.0 - self.word_frequency = [{'word': word, 'count': count} for word, count in word_counter.most_common(50)] + self.type_token_ratio = round(self.unique_words / self.words, 4) + + lengths = [len(w) for w in words] + self.avg_word_length = round(sum(lengths) / self.words, 2) + self.word_frequency = [{'word': w, 'count': c} for w, c in word_counter.most_common(50)] - def __get_config(self) -> LanguageConfig: - return POLISH_CONFIG if self.language == 'pl' else ENGLISH_CONFIG + if self.sentences > 0: + 
self.avg_sentence_length = round(self.words / self.sentences, 2) + + def __generate_n_grams(self) -> None: + words = self.__extract_words() + if len(words) >= 2: + bigrams = Counter(zip(words[:-1], words[1:])) + self.bigrams = [{'bigram': f'{w1} {w2}', 'count': c} for (w1, w2), c in bigrams.most_common(25)] + + if len(words) >= 3: + trigrams = Counter(zip(words[:-2], words[1:-1], words[2:])) + self.trigrams = [{'trigram': f'{w1} {w2} {w3}', 'count': c} for (w1, w2, w3), c in trigrams.most_common(25)] - def __get_words(self) -> List[str]: - return re.findall('\\b\\w+\\b', self.text.lower()) + def __extract_words(self) -> List[str]: + return re.findall(r'\b\w+\b', self.text.lower()) diff --git a/preprocessor/services/transcription/engines/base_engine.py b/preprocessor/services/transcription/engines/base_engine.py index 1c77a5dec..4078a3368 100644 --- a/preprocessor/services/transcription/engines/base_engine.py +++ b/preprocessor/services/transcription/engines/base_engine.py @@ -10,11 +10,10 @@ class TranscriptionEngine(ABC): - @abstractmethod def get_name(self) -> str: - ... + pass @abstractmethod def transcribe(self, audio_path: Path) -> Dict[str, Any]: - ... + pass diff --git a/preprocessor/services/transcription/engines/elevenlabs_engine.py b/preprocessor/services/transcription/engines/elevenlabs_engine.py index 86f39265c..e585961b7 100644 --- a/preprocessor/services/transcription/engines/elevenlabs_engine.py +++ b/preprocessor/services/transcription/engines/elevenlabs_engine.py @@ -4,6 +4,7 @@ from typing import ( Any, Dict, + List, Optional, ) @@ -17,23 +18,24 @@ class ElevenLabsEngine(TranscriptionEngine): - def __init__( - self, - logger: ErrorHandlingLogger, - model_id: Optional[str]=None, - language_code: Optional[str]=None, - diarize: Optional[bool]=None, - polling_interval: Optional[int]=None, - ): - if not settings.elevenlabs.api_key: - raise ValueError('ElevenLabs API key not provided. 
Set ELEVEN_API_KEY environment variable.') - self.client = ElevenLabs(api_key=settings.elevenlabs.api_key) - self.model_id = model_id or settings.elevenlabs.model_id - self.language_code = language_code or settings.elevenlabs.language_code - self.diarize = diarize if diarize is not None else settings.elevenlabs.diarize - self.polling_interval = polling_interval or settings.elevenlabs.polling_interval - self.additional_formats = [ + self, + logger: ErrorHandlingLogger, + model_id: Optional[str] = None, + language_code: Optional[str] = None, + diarize: Optional[bool] = None, + polling_interval: Optional[int] = None, + ) -> None: + self.__validate_api_key() + + self.__client = ElevenLabs(api_key=settings.elevenlabs.api_key) + self.__logger = logger + self.__model_id = model_id or settings.elevenlabs.model_id + self.__language_code = language_code or settings.elevenlabs.language_code + self.__diarize = diarize if diarize is not None else settings.elevenlabs.diarize + self.__polling_interval = polling_interval or settings.elevenlabs.polling_interval + + self.__additional_formats: List[Dict[str, Any]] = [ {'format': 'srt'}, { 'format': 'segmented_json', @@ -44,79 +46,103 @@ def __init__( 'max_segment_chars': 200, }, ] - self._logger: ErrorHandlingLogger = logger def get_name(self) -> str: return 'ElevenLabs' def transcribe(self, audio_path: Path) -> Dict[str, Any]: - console.print(f'[cyan]Transcribing with 11labs: {audio_path.name}[/cyan]') + console.print(f'[cyan]Transcribing with ElevenLabs: {audio_path.name}[/cyan]') + if not audio_path.exists(): raise FileNotFoundError(f'Audio file not found: {audio_path}') - transcription_id = self.__submit_job(audio_path) - result = self.__poll_for_results(transcription_id) - console.print(f'[green]Transcription completed: {audio_path.name}[/green]') - return self.__convert_to_unified_format(result) - @staticmethod - def __convert_to_unified_format(result) -> Dict[str, Any]: - unified_data = {'text': result.text, 'language_code': 
result.language_code, 'segments': []} - if result.additional_formats: - for fmt in result.additional_formats: - if fmt.requested_format == 'segmented_json': - segmented_data = json.loads(fmt.content) - for seg in segmented_data.get('segments', []): - words = seg.get('words', []) - if not words: - continue - non_spacing_words = [w for w in words if w.get('type') != 'spacing'] - segment = {'text': seg.get('text', '').strip(), 'words': words} - if non_spacing_words: - first_word = non_spacing_words[0] - last_word = non_spacing_words[-1] - segment['start'] = first_word.get('start') - segment['end'] = last_word.get('end') - segment['speaker'] = first_word.get('speaker_id') - unified_data['segments'].append(segment) - break - return unified_data + job_id = self.__submit_job(audio_path) + raw_result = self.__poll_for_results(job_id) - def __poll_for_results(self, transcription_id: str): - self._logger.info(f'Polling for results (ID: {transcription_id})...') - max_attempts = settings.elevenlabs.max_attempts - attempt = 0 - while attempt < max_attempts: - try: - result = self.client.speech_to_text.transcripts.get(transcription_id=transcription_id) - self._logger.info('Transcription complete!') - return result - except ApiError as e: - if e.status_code == 404: - self._logger.info(' ...Processing. 
Waiting...') - time.sleep(self.polling_interval) - attempt += 1 - else: - self._logger.error(f'API error during polling: {e.body}') - raise - raise TimeoutError(f'Transcription timeout after {max_attempts} attempts') + console.print(f'[green]Transcription completed: {audio_path.name}[/green]') + return self.__convert_to_unified_format(raw_result) def __submit_job(self, audio_path: Path) -> str: try: with open(audio_path, 'rb') as audio_file: audio_data = audio_file.read() - submit_response = self.client.speech_to_text.convert( + + response = self.__client.speech_to_text.convert( file=audio_data, - model_id=self.model_id, - language_code=self.language_code, + model_id=self.__model_id, + language_code=self.__language_code, tag_audio_events=True, timestamps_granularity='character', - diarize=self.diarize, + diarize=self.__diarize, use_multi_channel=False, - additional_formats=self.additional_formats, + additional_formats=self.__additional_formats, webhook=True, ) - self._logger.info(f'Job submitted. ID: {submit_response.transcription_id}') - return submit_response.transcription_id + self.__logger.info(f'Job submitted. 
ID: {response.transcription_id}') + return response.transcription_id except ApiError as e: - self._logger.error(f'API error during job submission: {e.body}') + self.__logger.error(f'API error during job submission: {e.body}') raise + + def __poll_for_results(self, transcription_id: str) -> Any: + self.__logger.info(f'Polling for results (ID: {transcription_id})...') + max_attempts = settings.elevenlabs.max_attempts + + for _attempt in range(max_attempts): + try: + result = self.__client.speech_to_text.transcripts.get(transcription_id=transcription_id) + self.__logger.info('Transcription ready!') + return result + except ApiError as e: + if e.status_code == 404: + time.sleep(self.__polling_interval) + else: + self.__logger.error(f'API error during polling: {e.body}') + raise + + raise TimeoutError(f'Transcription timeout after {max_attempts} attempts') + + @staticmethod + def __convert_to_unified_format(result: Any) -> Dict[str, Any]: + unified_data = { + 'text': result.text, + 'language_code': result.language_code, + 'segments': [], + } + + if not result.additional_formats: + return unified_data + + for fmt in result.additional_formats: + if fmt.requested_format == 'segmented_json': + segmented_data = json.loads(fmt.content) + for seg in segmented_data.get('segments', []): + segment = ElevenLabsEngine.__parse_segment(seg) + if segment: + unified_data['segments'].append(segment) + break + return unified_data + + @staticmethod + def __parse_segment(seg_data: Dict[str, Any]) -> Optional[Dict[str, Any]]: + words = seg_data.get('words', []) + if not words: + return None + + non_spacing = [w for w in words if w.get('type') != 'spacing'] + segment = { + 'text': seg_data.get('text', '').strip(), + 'words': words, + } + + if non_spacing: + segment['start'] = non_spacing[0].get('start') + segment['end'] = non_spacing[-1].get('end') + segment['speaker'] = non_spacing[0].get('speaker_id') + + return segment + + @staticmethod + def __validate_api_key() -> None: + if not 
settings.elevenlabs.api_key: + raise ValueError('ElevenLabs API key missing in settings.') diff --git a/preprocessor/services/transcription/engines/whisper_engine.py b/preprocessor/services/transcription/engines/whisper_engine.py index 839ba4ccb..01647bd7c 100644 --- a/preprocessor/services/transcription/engines/whisper_engine.py +++ b/preprocessor/services/transcription/engines/whisper_engine.py @@ -3,6 +3,7 @@ from typing import ( Any, Dict, + Optional, ) from faster_whisper import WhisperModel @@ -14,36 +15,61 @@ class WhisperEngine(TranscriptionEngine): + def __init__( + self, + model_name: str = 'large-v3-turbo', + language: str = 'Polish', + device: str = 'cuda', + ) -> None: + self.__model_name = model_name + self.__language = language + self.__device = device - def __init__(self, model: str='large-v3-turbo', language: str='Polish', device: str='cuda'): - self.model_name = model - self.language = language - self.device = device if device != 'cuda': - raise ValueError(f'Only GPU (cuda) is supported, got device={device}') - compute_type = 'float16' - console.print(f'[cyan]Loading Whisper model: {model} on {device} with compute_type={compute_type}[/cyan]') - self.model = WhisperModel(model, device=device, compute_type=compute_type) - console.print('[green]Whisper model loaded[/green]') + raise ValueError(f'Whisper acceleration requires CUDA, got: {device}') + + self.__model: Optional[WhisperModel] = self.__load_model() def cleanup(self) -> None: console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') - if hasattr(self, 'model'): - del self.model + if self.__model: + del self.__model + self.__model = None + gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() console.print('[green]Whisper model unloaded, GPU memory cleared[/green]') def get_name(self) -> str: - return f'Whisper-{self.model_name}' + return f'Whisper-{self.__model_name}' def transcribe(self, audio_path: Path) -> Dict[str, Any]: 
console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') + if not audio_path.exists(): raise FileNotFoundError(f'Audio file not found: {audio_path}') - language_code = WhisperUtils.get_language_code(self.language) - segments, info = self.model.transcribe(str(audio_path), language=language_code, beam_size=10, word_timestamps=True, condition_on_previous_text=False) + if not self.__model: + raise RuntimeError('Whisper model not loaded.') + + language_code = WhisperUtils.get_language_code(self.__language) + + segments, info = self.__model.transcribe( + str(audio_path), + language=language_code, + beam_size=10, + word_timestamps=True, + condition_on_previous_text=False, + ) + result = WhisperUtils.build_transcription_result(segments, language=info.language) console.print(f'[green]Transcription completed: {audio_path.name}[/green]') return result + + def __load_model(self) -> WhisperModel: + compute_type = 'float16' + console.print(f'[cyan]Loading Whisper: {self.__model_name} on {self.__device} ({compute_type})[/cyan]') + + model = WhisperModel(self.__model_name, device=self.__device, compute_type=compute_type) + console.print('[green]Whisper model loaded[/green]') + return model diff --git a/preprocessor/services/transcription/generators/base_generator.py b/preprocessor/services/transcription/generators/base_generator.py index 6d14a9994..f5cec09c7 100644 --- a/preprocessor/services/transcription/generators/base_generator.py +++ b/preprocessor/services/transcription/generators/base_generator.py @@ -13,21 +13,24 @@ class BaseTranscriptionGenerator(ABC): - def __init__(self, input_dir: Path, output_dir: Path, logger: ErrorHandlingLogger) -> None: - self.input_dir = input_dir - self.output_dir = output_dir - self.logger = logger + self._input_dir = input_dir + self._output_dir = output_dir + self._logger = logger def generate(self) -> None: - self.output_dir.mkdir(parents=True, exist_ok=True) - for json_file in self.input_dir.rglob('*.json'): + 
self._output_dir.mkdir(parents=True, exist_ok=True) + for json_file in self._input_dir.rglob('*.json'): try: - with open(json_file, 'r', encoding='utf-8') as f: - data = json.load(f) - self._process_file(json_file, data) + data = self.__load_json(json_file) + if data: + self._process_file(json_file, data) except Exception as e: - self.logger.error(f'Failed to generate output for {json_file}: {e}') + self._logger.error(f'Failed to generate output for {json_file}: {e}') + + def __load_json(self, file_path: Path) -> Dict[str, Any]: + with open(file_path, 'r', encoding='utf-8') as f: + return json.load(f) @abstractmethod def _get_output_filename(self, json_file: Path) -> str: diff --git a/preprocessor/services/transcription/generators/json_generator.py b/preprocessor/services/transcription/generators/json_generator.py index a9a7a332a..0724fad51 100644 --- a/preprocessor/services/transcription/generators/json_generator.py +++ b/preprocessor/services/transcription/generators/json_generator.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import ( Any, @@ -14,61 +15,71 @@ class JsonGenerator(BaseTranscriptionGenerator): - - def __init__(self, format_type: Literal['full', 'simple', 'segmented'], *args, **kwargs): + def __init__(self, format_type: Literal['full', 'simple', 'segmented'], *args, **kwargs) -> None: super().__init__(*args, **kwargs) - self.format_type = format_type + self.__format_type = format_type def convert(self, data: Dict[str, Any]) -> Dict[str, Any]: - if self.format_type == 'full': - return self.convert_to_full_format(data) - if self.format_type == 'simple': - return self.convert_to_simple_format(data) - if self.format_type == 'segmented': - return self.convert_to_segmented_format(data) - raise ValueError(f'Unknown format type: {self.format_type}') + converters = { + 'full': self.convert_to_full_format, + 'simple': self.convert_to_simple_format, + 'segmented': self.convert_to_segmented_format, + } + if self.__format_type not in 
converters: + raise ValueError(f'Unknown format type: {self.__format_type}') + return converters[self.__format_type](data) @staticmethod def convert_to_full_format(data: Dict[str, Any]) -> Dict[str, Any]: segments = data.get('segments', []) - full_text = ' '.join((seg.get('text', '').strip() for seg in segments)) - language_code = data.get('language', 'pol') - if language_code in {'Polish', 'polish'}: - language_code = 'pol' + full_text = ' '.join(seg.get('text', '').strip() for seg in segments) + + language = data.get('language', 'pol').lower() + language_code = 'pol' if language in {'polish', 'pol'} else language + words = [] for seg in segments: - seg_words = seg.get('words', []) - words.extend(TranscriptionUtils.convert_words_list(seg_words)) - return {'language_code': language_code, 'language_probability': 1.0, 'text': full_text, 'words': words} + words.extend(TranscriptionUtils.convert_words_list(seg.get('words', []))) + + return { + 'language_code': language_code, + 'language_probability': 1.0, + 'text': full_text, + 'words': words, + } @staticmethod def convert_to_segmented_format(data: Dict[str, Any]) -> Dict[str, Any]: segments = data.get('segments', []) - result_segments = [] + result = [] for seg in segments: - text = seg.get('text', '').strip() - seg_words = seg.get('words', []) - result_segments.append({'text': text, 'words': TranscriptionUtils.convert_words_list(seg_words)}) - return {'segments': result_segments} + result.append({ + 'text': seg.get('text', '').strip(), + 'words': TranscriptionUtils.convert_words_list(seg.get('words', [])), + }) + return {'segments': result} @staticmethod def convert_to_simple_format(data: Dict[str, Any]) -> Dict[str, Any]: segments = data.get('segments', []) - result_segments = [] + result = [] for seg in segments: - text = seg.get('text', '').strip() - seg_words = seg.get('words', []) - speaker = 'speaker_unknown' - if seg_words: - speaker = seg_words[0].get('speaker_id', 'speaker_unknown') - 
result_segments.append({'speaker': speaker, 'text': text}) - return {'segments': result_segments} + words = seg.get('words', []) + speaker = words[0].get('speaker_id', 'speaker_unknown') if words else 'speaker_unknown' + result.append({ + 'speaker': speaker, + 'text': seg.get('text', '').strip(), + }) + return {'segments': result} def _get_output_filename(self, json_file: Path) -> str: - if self.format_type == 'full': + if self.__format_type == 'full': return json_file.name - suffix = FILE_SUFFIXES[self.format_type] + suffix = FILE_SUFFIXES[self.__format_type] return json_file.name.replace(FILE_EXTENSIONS['json'], f"{suffix}{FILE_EXTENSIONS['json']}") def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - ... + converted_data = self.convert(data) + output_path = self._output_dir / self._get_output_filename(json_file) + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(converted_data, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/services/transcription/generators/multi_format_generator.py b/preprocessor/services/transcription/generators/multi_format_generator.py index 5f4815f02..b8e8605cc 100644 --- a/preprocessor/services/transcription/generators/multi_format_generator.py +++ b/preprocessor/services/transcription/generators/multi_format_generator.py @@ -6,10 +6,8 @@ Optional, ) -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.config.settings_instance import settings from preprocessor.services.core.logging import ErrorHandlingLogger from preprocessor.services.episodes import EpisodeManager from preprocessor.services.transcription.generators.json_generator import JsonGenerator @@ -18,139 +16,100 @@ class MultiFormatGenerator: + def __init__( + self, + jsons_dir: Path, + episodes_info_json: Path, + output_base_path: Path, # pylint: disable=unused-argument + logger: ErrorHandlingLogger, + series_name: str = 
'', + ) -> None: + self.__jsons_dir = jsons_dir + self.__logger = logger + self.__series_name = series_name.lower() if series_name else 'unknown' + self.__episode_manager = EpisodeManager(episodes_info_json, self.__series_name, logger) - def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_base_path: Path, logger: ErrorHandlingLogger, series_name: str='') -> None: - self.jsons_dir = jsons_dir - self.output_base_path = output_base_path - self.logger = logger - self.series_name = series_name.lower() if series_name else 'unknown' - self.episode_manager = EpisodeManager(episodes_info_json, self.series_name, logger) + def __call__(self) -> None: + for transcription_file in self.__jsons_dir.rglob('*.json'): + self.__process_transcription_file(transcription_file) - def generate(self) -> None: - for transcription_file in self.jsons_dir.rglob('*.json'): - self.__process_file(transcription_file) + def __process_transcription_file(self, file_path: Path) -> None: + try: + transcription = self.__load_json(file_path) + if not transcription: + return - def __call__(self) -> None: - self.generate() + episode_info = self.__episode_manager.parse_filename(file_path) + if not episode_info: + self.__logger.error(f'Cannot extract episode info from {file_path.name}') + return - def __check_if_already_processed(self, episode_info) -> bool: - filename = self.episode_manager.path_manager.build_filename( - episode_info, extension='json', - ) - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - main_output_file = ( - get_base_output_dir(self.series_name) - / settings.output_subdirs.transcriptions - / season_code - / episode_code - / 'raw' - / filename + if self.__is_already_processed(episode_info): + return + + self.__generate_all_formats(transcription, episode_info) + except Exception as e: + self.__logger.error(f'Error processing {file_path.name}: {e}') + + def __generate_all_formats(self, transcription: Dict[str, Any], episode_info: Any) 
-> None: + base_dir = self.__get_raw_output_dir(episode_info) + base_dir.mkdir(parents=True, exist_ok=True) + + metadata = EpisodeManager.get_metadata(episode_info) + full_data = {'episode_info': metadata, **transcription} + + # Generowanie formatów + self.__save_json(full_data, episode_info, base_dir, 'full') + self.__save_json(transcription, episode_info, base_dir, 'segmented') + self.__save_json(transcription, episode_info, base_dir, 'simple') + self.__save_srt(transcription, episode_info, base_dir) + self.__save_txt(transcription, episode_info, base_dir) + + def __save_json(self, data: Dict[str, Any], ep_info: Any, out_dir: Path, fmt: str) -> None: + gen = JsonGenerator(fmt, Path(''), out_dir, self.__logger) + filename = self.__episode_manager.path_manager.build_filename( + ep_info, extension='json', suffix=fmt if fmt != 'full' else None, ) - if main_output_file.exists(): - self.logger.info( - f'Skipping (already exists): {episode_info.episode_code()}', - ) + + converted = gen.convert(data) + if fmt != 'full': + converted['episode_info'] = {'season': ep_info.season, 'episode_number': ep_info.relative_episode} + else: + converted['episode_info'] = data.get('episode_info', {}) + + with open(out_dir / filename, 'w', encoding='utf-8') as f: + json.dump(converted, f, indent=2, ensure_ascii=False) + + def __save_srt(self, data: Dict[str, Any], ep_info: Any, out_dir: Path) -> None: + gen = SrtGenerator(Path(''), out_dir, self.__logger) + filename = self.__episode_manager.path_manager.build_filename(ep_info, extension='srt') + (out_dir / filename).write_text(gen.convert_to_srt_format(data), encoding='utf-8') + + def __save_txt(self, data: Dict[str, Any], ep_info: Any, out_dir: Path) -> None: + gen = TxtGenerator(Path(''), out_dir, self.__logger) + filename = self.__episode_manager.path_manager.build_filename(ep_info, extension='txt') + (out_dir / filename).write_text(gen.convert_to_txt_format(data), encoding='utf-8') + + def __is_already_processed(self, ep_info: Any) 
-> bool: + filename = self.__episode_manager.path_manager.build_filename(ep_info, extension='json') + target = self.__get_raw_output_dir(ep_info) / filename + if target.exists(): + self.__logger.info(f'Skipping existing: {ep_info.episode_code()}') return True return False - def __generate_all_formats( - self, transcription: Dict[str, Any], episode_info, - ) -> None: - episode_metadata = EpisodeManager.get_metadata(episode_info) - transcription_with_info = {'episode_info': episode_metadata, **transcription} - self.__generate_full_json(transcription_with_info, episode_info) - self.__generate_segmented_json(transcription, episode_info) - self.__generate_simple_json(transcription, episode_info) - self.__generate_srt(transcription, episode_info) - self.__generate_txt(transcription, episode_info) - - def __generate_full_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json') - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = JsonGenerator('full', Path(''), output_file.parent, self.logger) - full_json = generator.convert_to_full_format(data) - full_json['episode_info'] = data.get('episode_info', {}) - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(full_json, f, indent=2, ensure_ascii=False) - self.logger.info(f'Generated full JSON: {output_file}') - - def __generate_segmented_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json', suffix='segmented') - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / 
settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = JsonGenerator('segmented', Path(''), output_file.parent, self.logger) - segmented_json = generator.convert_to_segmented_format(data) - segmented_json['episode_info'] = {'season': episode_info.season, 'episode_number': episode_info.relative_episode} - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(segmented_json, f, indent=2, ensure_ascii=False) - self.logger.info(f'Generated segmented JSON: {output_file}') - - def __generate_simple_json(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension='json', suffix='simple') - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = JsonGenerator('simple', Path(''), output_file.parent, self.logger) - simple_json = generator.convert_to_simple_format(data) - simple_json['episode_info'] = {'season': episode_info.season, 'episode_number': episode_info.relative_episode} - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(simple_json, f, indent=2, ensure_ascii=False) - self.logger.info(f'Generated simple JSON: {output_file}') - - def __generate_srt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension='srt') - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - 
generator = SrtGenerator(Path(''), output_file.parent, self.logger) - srt_content = generator.convert_to_srt_format(data) - with open(output_file, 'w', encoding='utf-8') as f: - f.write(srt_content) - self.logger.info(f'Generated SRT: {output_file}') - - def __generate_txt(self, data: Dict[str, Any], episode_info) -> None: - filename = self.episode_manager.path_manager.build_filename(episode_info, extension='txt') - season_code = episode_info.season_code() - episode_code = episode_info.episode_num() - output_file = get_base_output_dir(self.series_name) / settings.output_subdirs.transcriptions / season_code / episode_code / 'raw' / filename - output_file.parent.mkdir(parents=True, exist_ok=True) - output_file.parent.mkdir(parents=True, exist_ok=True) - generator = TxtGenerator(Path(''), output_file.parent, self.logger) - txt_content = generator.convert_to_txt_format(data) - with open(output_file, 'w', encoding='utf-8') as f: - f.write(txt_content) - self.logger.info(f'Generated TXT: {output_file}') - - def __load_transcription(self, transcription_file: Path) -> Optional[Dict[str, Any]]: + def __get_raw_output_dir(self, ep_info: Any) -> Path: + return ( + get_base_output_dir(self.__series_name) / + settings.output_subdirs.transcriptions / + ep_info.season_code() / + ep_info.episode_num() / 'raw' + ) + + def __load_json(self, path: Path) -> Optional[Dict[str, Any]]: try: - with open(transcription_file, 'r', encoding='utf-8') as f: + with open(path, 'r', encoding='utf-8') as f: return json.load(f) except Exception as e: - self.logger.error(f'Failed to load transcription {transcription_file}: {e}') + self.__logger.error(f'Load error {path.name}: {e}') return None - - def __process_file(self, transcription_file: Path) -> None: - try: - transcription = self.__load_transcription(transcription_file) - if not transcription: - return - episode_info = self.episode_manager.parse_filename(transcription_file) - if not episode_info: - self.logger.error( - f'Cannot extract episode 
info from {transcription_file.name}', - ) - return - if self.__check_if_already_processed(episode_info): - return - self.__generate_all_formats(transcription, episode_info) - except Exception as e: - self.logger.error(f'Error processing file {transcription_file}: {e}') diff --git a/preprocessor/services/transcription/generators/srt_generator.py b/preprocessor/services/transcription/generators/srt_generator.py index 8a1fc9e6e..cbd4ab23e 100644 --- a/preprocessor/services/transcription/generators/srt_generator.py +++ b/preprocessor/services/transcription/generators/srt_generator.py @@ -9,31 +9,31 @@ class SrtGenerator(BaseTranscriptionGenerator): - def convert_to_srt_format(self, data: Dict[str, Any]) -> str: segments = data.get('segments', []) srt_lines = [] index = 1 + for seg in segments: - start = seg.get('start', 0.0) - end = seg.get('end', 0.0) text = seg.get('text', '').strip() if not text: continue - start_time = self.__format_timestamp(start) - end_time = self.__format_timestamp(end) - srt_lines.append(f'{index}') - srt_lines.append(f'{start_time} --> {end_time}') - srt_lines.append(text) - srt_lines.append('') + + start_time = self.__format_timestamp(seg.get('start', 0.0)) + end_time = self.__format_timestamp(seg.get('end', 0.0)) + + srt_lines.extend([str(index), f'{start_time} --> {end_time}', text, '']) index += 1 + return '\n'.join(srt_lines) def _get_output_filename(self, json_file: Path) -> str: return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['srt']) def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - ... 
+ content = self.convert_to_srt_format(data) + output_path = self._output_dir / self._get_output_filename(json_file) + output_path.write_text(content, encoding='utf-8') @staticmethod def __format_timestamp(seconds: float) -> str: diff --git a/preprocessor/services/transcription/generators/txt_generator.py b/preprocessor/services/transcription/generators/txt_generator.py index a899851ad..9c28ee5a4 100644 --- a/preprocessor/services/transcription/generators/txt_generator.py +++ b/preprocessor/services/transcription/generators/txt_generator.py @@ -9,19 +9,15 @@ class TxtGenerator(BaseTranscriptionGenerator): - @staticmethod def convert_to_txt_format(data: Dict[str, Any]) -> str: segments = data.get('segments', []) - text_parts = [] - for seg in segments: - text = seg.get('text', '').strip() - if text: - text_parts.append(text) - return ' '.join(text_parts) + return ' '.join(seg.get('text', '').strip() for seg in segments if seg.get('text')) def _get_output_filename(self, json_file: Path) -> str: return json_file.name.replace(FILE_EXTENSIONS['json'], FILE_EXTENSIONS['txt']) def _process_file(self, json_file: Path, data: Dict[str, Any]) -> None: - ... 
+ content = self.convert_to_txt_format(data) + output_path = self._output_dir / self._get_output_filename(json_file) + output_path.write_text(content, encoding='utf-8') diff --git a/preprocessor/services/transcription/processors/audio_normalizer.py b/preprocessor/services/transcription/processors/audio_normalizer.py index 0964abc65..879ed0438 100644 --- a/preprocessor/services/transcription/processors/audio_normalizer.py +++ b/preprocessor/services/transcription/processors/audio_normalizer.py @@ -13,52 +13,72 @@ class AudioNormalizer: SUPPORTED_VIDEO_EXTENSIONS = BaseProcessor.SUPPORTED_VIDEO_EXTENSIONS - def __init__(self, input_videos: Path, output_dir: Path, logger: ErrorHandlingLogger, video_files: Optional[List[Path]]=None) -> None: - self.__input_videos: Path = input_videos - self.__output_dir: Path = output_dir - self.__logger: ErrorHandlingLogger = logger - self.__video_files: Optional[List[Path]] = video_files + def __init__( + self, + input_videos: Path, + output_dir: Path, + logger: ErrorHandlingLogger, + video_files: Optional[List[Path]] = None, + ) -> None: + self.__input_videos = input_videos + self.__output_dir = output_dir + self.__logger = logger + self.__video_files = video_files self.__output_dir.mkdir(parents=True, exist_ok=True) def __call__(self) -> None: - if self.__video_files is not None: - for video in self.__video_files: - self.__process_video(video) - else: - for video in self.__input_videos.rglob('*'): - if video.suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS: - self.__process_video(video) + targets = self.__video_files if self.__video_files is not None else self.__discover_videos() + for video in targets: + self.__process_video(video) + + def __discover_videos(self) -> List[Path]: + return [ + v for v in self.__input_videos.rglob('*') + if v.suffix.lower() in self.SUPPORTED_VIDEO_EXTENSIONS + ] + + def __process_video(self, video: Path) -> None: + try: + output_path = self.__output_dir / video.with_suffix('.wav').name + if 
output_path.exists(): + return + + audio_idx = self.__get_best_audio_stream(video) + if audio_idx is None: + return + + self.__execute_normalization_pipeline(video, audio_idx, output_path) + except Exception as e: + self.__logger.error(f'Error processing video {video}: {e}') def __get_best_audio_stream(self, video: Path) -> Optional[int]: - cmd = ['ffprobe', '-v', 'error', '-select_streams', 'a', '-show_entries', 'stream=index,bit_rate', '-of', 'json', str(video)] + cmd = [ + 'ffprobe', '-v', 'error', '-select_streams', 'a', + '-show_entries', 'stream=index,bit_rate', '-of', 'json', str(video), + ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) streams = json.loads(result.stdout).get('streams', []) + if not streams: self.__logger.error(f'No audio streams found in file: {video}') return None - best_stream = max(streams, key=lambda s: int(s.get('bit_rate', 0))) + + best_stream = max(streams, key=lambda s: int(s.get('bit_rate', 0) or 0)) return best_stream['index'] - def __normalize(self, video: Path, audio_idx: int, output: Path) -> None: + def __execute_normalization_pipeline(self, video: Path, audio_idx: int, output: Path) -> None: + self.__extract_audio(video, audio_idx, output) + tmp_output = output.with_name(output.stem + '_temp.wav') - extract_cmd = ['ffmpeg', '-y', '-i', str(video), '-map', f'0:{audio_idx}', '-acodec', 'pcm_s16le', '-ar', '48000', '-ac', '1', str(output)] - subprocess.run(extract_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.__logger.info(f'Converted audio: {output}') normalize_cmd = ['ffmpeg', '-y', '-i', str(output), '-af', 'dynaudnorm', str(tmp_output)] subprocess.run(normalize_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) - self.__logger.info(f'Normalized audio: {tmp_output}') + tmp_output.replace(output) - self.__logger.info(f'Replaced original file with normalized audio: {video} -> {output}') + self.__logger.info(f'Normalization complete: {output.name}') - 
def __process_video(self, video: Path) -> None: - try: - output_path = self.__output_dir / video.with_suffix('.wav').name - if output_path.exists(): - return - audio_idx = self.__get_best_audio_stream(video) - if audio_idx is None: - self.__logger.error(f"Cannot find audio stream for file: '{video}'") - return - self.__normalize(video=video, audio_idx=audio_idx, output=output_path) - except Exception as e: - self.__logger.error(f'Error processing video {video}: {e}') + def __extract_audio(self, video: Path, audio_idx: int, output: Path) -> None: + cmd = [ + 'ffmpeg', '-y', '-i', str(video), '-map', f'0:{audio_idx}', + '-acodec', 'pcm_s16le', '-ar', '48000', '-ac', '1', str(output), + ] + subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) diff --git a/preprocessor/services/transcription/processors/episode_info_processor.py b/preprocessor/services/transcription/processors/episode_info_processor.py index d0e0f1730..a52972205 100644 --- a/preprocessor/services/transcription/processors/episode_info_processor.py +++ b/preprocessor/services/transcription/processors/episode_info_processor.py @@ -11,15 +11,19 @@ class EpisodeInfoProcessor: + def __init__( + self, + jsons_dir: Path, + episodes_info_json: Path, + output_path: Path, + logger: ErrorHandlingLogger, + series_name: str = '', + ) -> None: + self.__jsons_dir = jsons_dir + self.__output_path = output_path + self.__logger = logger + self.__series_name = self.__resolve_series_name(series_name) - def __init__(self, jsons_dir: Path, episodes_info_json: Path, output_path: Path, logger: ErrorHandlingLogger, series_name: str='') -> None: - self.__jsons_dir: Path = jsons_dir - self.__output_path: Path = output_path - self.__logger: ErrorHandlingLogger = logger - if not series_name: - series_name = self.__output_path.parent.name.lower() - self.__logger.warning(f"No series name provided. 
Using fallback from folder name: '{series_name}'") - self.__series_name: str = series_name.lower() self.__output_path.mkdir(parents=True, exist_ok=True) self.__episode_manager = EpisodeManager(episodes_info_json, self.__series_name, self.__logger) @@ -27,40 +31,53 @@ def __call__(self) -> None: for transcription_file in self.__jsons_dir.rglob('*.json'): self.__process_file(transcription_file) - @staticmethod - def __load_transcription(path: Path) -> Dict[str, Any]: - with path.open('r', encoding='utf-8') as f: - return json.load(f) + def __resolve_series_name(self, series_name: str) -> str: + if not series_name: + name = self.__output_path.parent.name.lower() + self.__logger.warning(f"Using fallback series name from folder: '{name}'") + return name + return series_name.lower() def __process_file(self, transcription_file: Path) -> None: try: - transcription = self.__load_transcription(transcription_file) + transcription = self.__load_json(transcription_file) episode_info = self.__episode_manager.parse_filename(transcription_file) + if not episode_info: - self.__logger.error(f'Cannot extract episode info from {transcription_file.name}') + self.__logger.error(f'Failed to parse episode info: {transcription_file.name}') return - _, new_json_name = self.__write_episode_json(transcription, episode_info) - self.__rename_original_file(transcription_file, new_json_name) + + _, new_name = self.__write_structured_json(transcription, episode_info) + self.__sync_original_filename(transcription_file, new_name) except Exception as e: - self.__logger.error(f'Error processing file {transcription_file}: {e}') + self.__logger.error(f'Error processing {transcription_file.name}: {e}') + + def __write_structured_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: + new_name = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') + target_path = self.__output_path / episode_info.season_code() / new_name + 
target_path.parent.mkdir(parents=True, exist_ok=True) - def __rename_original_file(self, original_path: Path, new_name: str) -> None: - new_src = original_path.parent / new_name + payload = { + 'episode_info': EpisodeManager.get_metadata(episode_info), + 'segments': transcription.get('segments', []), + } + + with target_path.open('w', encoding='utf-8') as f: + json.dump(payload, f, ensure_ascii=False, indent=4) + + return target_path, new_name + + def __sync_original_filename(self, original_path: Path, new_name: str) -> None: + target_path = original_path.parent / new_name if original_path.name == new_name: - self.__logger.info(f'File {original_path} already has correct name.') - elif new_src.exists(): - self.__logger.error(f'Cannot rename {original_path} -> {new_src}, file already exists!') + return + + if target_path.exists(): + self.__logger.error(f'Rename conflict: {target_path} already exists!') else: - original_path.rename(new_src) - self.__logger.info(f'Renamed source transcription file: {original_path} -> {new_src}') - - def __write_episode_json(self, transcription: Dict[str, Any], episode_info) -> Tuple[Path, str]: - new_json_name = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') - season_dir = self.__output_path / episode_info.season_code() - output_path = season_dir / new_json_name - output_path.parent.mkdir(parents=True, exist_ok=True) - result = {'episode_info': EpisodeManager.get_metadata(episode_info), 'segments': transcription.get('segments', [])} - with output_path.open('w', encoding='utf-8') as f: - json.dump(result, f, ensure_ascii=False, indent=4) - self.__logger.info(f'Created episode info {output_path}.') - return output_path, new_json_name + original_path.rename(target_path) + + @staticmethod + def __load_json(path: Path) -> Dict[str, Any]: + with path.open('r', encoding='utf-8') as f: + return json.load(f) diff --git a/preprocessor/services/transcription/processors/normalized_audio_processor.py 
b/preprocessor/services/transcription/processors/normalized_audio_processor.py index c563ec8c4..6b031d0c1 100644 --- a/preprocessor/services/transcription/processors/normalized_audio_processor.py +++ b/preprocessor/services/transcription/processors/normalized_audio_processor.py @@ -18,75 +18,78 @@ class NormalizedAudioProcessor: SUPPORTED_AUDIO_EXTENSIONS: Tuple[str, str] = ('.wav', '.mp3') def __init__( - self, - input_audios: Path, - output_dir: Path, - logger: ErrorHandlingLogger, - language: str, - model: str, - device: str, - audio_files: Optional[List[Path]] = None, + self, + input_audios: Path, + output_dir: Path, + logger: ErrorHandlingLogger, + language: str, + model: str, + device: str, + audio_files: Optional[List[Path]] = None, ): - self.__input_audios: Path = input_audios - self.__output_dir: Path = output_dir - self.__logger: ErrorHandlingLogger = logger - self.__audio_files: Optional[List[Path]] = audio_files - self.__language: str = language - self.__input_audios.mkdir(parents=True, exist_ok=True) + self.__input_audios = input_audios + self.__output_dir = output_dir + self.__logger = logger + self.__audio_files = audio_files + self.__language = language + self.__output_dir.mkdir(parents=True, exist_ok=True) + if device != 'cuda': - raise ValueError(f'Only GPU (cuda) is supported, got device={device}') - compute_type = 'float16' - self.__logger.info( - f'Loading Whisper model {model} on {device} with compute_type={compute_type}', - ) + raise ValueError(f'Whisper acceleration requires CUDA device, got: {device}') + self.__whisper_model = WhisperModel( model, device=device, - compute_type=compute_type, + compute_type='float16', ) + self.__logger.info(f'Whisper {model} initialized on {device}') def cleanup(self) -> None: - self.__logger.info('Unloading Whisper model and clearing GPU memory...') + self.__logger.info('Purging GPU memory and unloading Whisper model...') if hasattr(self, '_NormalizedAudioProcessor__whisper_model'): del self.__whisper_model 
+ gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - self.__logger.info('Whisper model unloaded, GPU memory cleared') def __call__(self) -> None: - if self.__audio_files is not None: - for audio in self.__audio_files: - self.__process_normalized_audio(audio) - else: - for audio in self.__input_audios.rglob('*'): - if audio.suffix.lower() in self.SUPPORTED_AUDIO_EXTENSIONS: - self.__process_normalized_audio(audio) - - def __process_normalized_audio(self, normalized_audio: Path) -> None: + targets = self.__audio_files if self.__audio_files is not None else self.__discover_audios() + for audio in targets: + self.__transcribe_file(audio) + + def __discover_audios(self) -> List[Path]: + return [ + a for a in self.__input_audios.rglob('*') + if a.suffix.lower() in self.SUPPORTED_AUDIO_EXTENSIONS + ] + + def __transcribe_file(self, audio_path: Path) -> None: try: - output_file = self.__output_dir / normalized_audio.with_suffix('.json').name + output_file = self.__output_dir / audio_path.with_suffix('.json').name if output_file.exists(): return - language_code = WhisperUtils.get_language_code(self.__language) + segments, info = self.__whisper_model.transcribe( - str(normalized_audio), - language=language_code, + str(audio_path), + language=WhisperUtils.get_language_code(self.__language), beam_size=10, word_timestamps=True, condition_on_previous_text=False, temperature=0.0, - compression_ratio_threshold=None, - ) - result = WhisperUtils.build_transcription_result( - segments, - language=info.language, ) - for segment_dict in result['segments']: - segment_dict['temperature'] = 0.0 - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(result, f, ensure_ascii=False, indent=2) - self.__logger.info(f'Processed: {normalized_audio}') + + result = WhisperUtils.build_transcription_result(segments, language=info.language) + self.__save_results(result, output_file) + self.__logger.info(f'Transcription saved: {output_file.name}') + except Exception as e: 
- self.__logger.error(f'Error processing file {normalized_audio}: {e}') + self.__logger.error(f'Whisper error on {audio_path.name}: {e}') + + def __save_results(self, result: dict, path: Path) -> None: + for segment in result.get('segments', []): + segment['temperature'] = 0.0 + + with open(path, 'w', encoding='utf-8') as f: + json.dump(result, f, ensure_ascii=False, indent=2) diff --git a/preprocessor/services/transcription/sound_classification.py b/preprocessor/services/transcription/sound_classification.py index 6b22f8076..525d76f1d 100644 --- a/preprocessor/services/transcription/sound_classification.py +++ b/preprocessor/services/transcription/sound_classification.py @@ -13,23 +13,25 @@ def is_sound_event(word: Dict[str, Any]) -> bool: if word.get(WordKeys.TYPE) == WordTypeValues.AUDIO_EVENT: return True + text = word.get(WordKeys.TEXT, '').strip() - if re.match(r'^\(.*\)$', text): - return True - return False + return bool(re.match(r'^\(.*\)$', text)) def classify_segment(segment: Dict[str, Any]) -> str: words = segment.get(WordKeys.WORDS, []) if not words: return 'dialogue' + has_sound = False has_dialogue = False + for word in words: if is_sound_event(word): has_sound = True elif word.get(WordKeys.TYPE) not in [WordTypeValues.SPACING, '']: has_dialogue = True + if has_sound and has_dialogue: return 'mixed' if has_sound: diff --git a/preprocessor/services/transcription/utils.py b/preprocessor/services/transcription/utils.py index 9d753c16e..87b2c279c 100644 --- a/preprocessor/services/transcription/utils.py +++ b/preprocessor/services/transcription/utils.py @@ -4,11 +4,11 @@ Any, Dict, List, + Optional, ) class TranscriptionUtils: - @staticmethod def convert_words_list(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]: return [ @@ -26,25 +26,20 @@ def convert_words_list(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]: def fix_transcription_file_unicode(file_path: Path) -> bool: if not file_path.exists(): return False + with open(file_path, 'r', 
encoding='utf-8') as f: original_content = f.read() f.seek(0) data: Dict[str, Any] = json.load(f) + new_content = json.dumps(data, ensure_ascii=False, indent=2) + if original_content != new_content: with open(file_path, 'w', encoding='utf-8') as f: f.write(new_content) return True return False - @staticmethod - def __fix_unicode(file_path: Path) -> None: # pylint: disable=unused-private-member - if not file_path.exists(): - return - with open(file_path, 'r', encoding='utf-8') as f: - data: Dict[str, Any] = json.load(f) - with open(file_path, 'w', encoding='utf-8') as f: - json.dump(data, f, ensure_ascii=False, indent=2) class WhisperUtils: LANGUAGE_MAP: Dict[str, str] = { @@ -56,23 +51,24 @@ class WhisperUtils: } @staticmethod - def build_transcription_result(segments: Any, language: str=None) -> Dict[str, Any]: + def get_language_code(language: str) -> str: + return WhisperUtils.LANGUAGE_MAP.get(language.lower(), language.lower()) + + @staticmethod + def build_transcription_result(segments: Any, language: Optional[str] = None) -> Dict[str, Any]: result: Dict[str, Any] = {'text': '', 'segments': []} if language: result['language'] = language + for segment in segments: segment_dict = WhisperUtils.__process_segment(segment) result['segments'].append(segment_dict) result['text'] += segment.text return result - @staticmethod - def get_language_code(language: str) -> str: - return WhisperUtils.LANGUAGE_MAP.get(language.lower(), language.lower()) - @staticmethod def __process_segment(segment: Any) -> Dict[str, Any]: - words = [] + words: List[Dict[str, Any]] = [] if hasattr(segment, 'words') and segment.words: for word in segment.words: words.append({ @@ -81,6 +77,7 @@ def __process_segment(segment: Any) -> Dict[str, Any]: 'end': word.end, 'probability': word.probability, }) + return { 'id': segment.id, 'seek': 0, diff --git a/preprocessor/services/transcription/whisper.py b/preprocessor/services/transcription/whisper.py index 1450ead68..3dfabcf1a 100644 --- 
a/preprocessor/services/transcription/whisper.py +++ b/preprocessor/services/transcription/whisper.py @@ -13,49 +13,69 @@ class Whisper: - - def __init__(self, model: str='large-v3-turbo', language: str='pl', device: str='cuda', beam_size: int=10, temperature: float=0.0) -> None: - self.model_name: str = model - self.language: str = language - self.device: str = device - self.beam_size: int = beam_size - self.temperature: float = temperature - self._model: Optional[WhisperModel] = None + def __init__( + self, + model: str = 'large-v3-turbo', + language: str = 'pl', + device: str = 'cuda', + beam_size: int = 10, + temperature: float = 0.0, + ) -> None: + self.__model_name = model + self.__language = language + self.__device = device + self.__beam_size = beam_size + self.__temperature = temperature + self.__model: Optional[WhisperModel] = None def cleanup(self) -> None: console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') - if self._model is not None: - del self._model - self._model = None + if self.__model is not None: + del self.__model + self.__model = None + if torch.cuda.is_available(): torch.cuda.empty_cache() console.print('[green]Whisper model unloaded, GPU memory cleared[/green]') def transcribe(self, audio_path: Path) -> Dict[str, Any]: console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') + if not audio_path.exists(): raise FileNotFoundError(f'Audio file not found: {audio_path}') - model = self._load_model() - language_code = WhisperUtils.get_language_code(self.language) + + model = self.__get_or_load_model() + language_code = WhisperUtils.get_language_code(self.__language) + segments, info = model.transcribe( str(audio_path), language=language_code, - beam_size=self.beam_size, + beam_size=self.__beam_size, word_timestamps=True, condition_on_previous_text=False, - temperature=self.temperature, + temperature=self.__temperature, ) + result = WhisperUtils.build_transcription_result(segments, 
language=info.language) console.print(f'[green]Transcription completed: {audio_path.name}[/green]') return result - def _load_model(self) -> WhisperModel: - if self._model is not None: - return self._model - if self.device != 'cuda': - raise ValueError(f'Only GPU (cuda) is supported, got device={self.device}') + def __get_or_load_model(self) -> WhisperModel: + if self.__model is not None: + return self.__model + + if self.__device != 'cuda': + raise ValueError(f'Only GPU (cuda) is supported, got device={self.__device}') + compute_type = 'float16' - console.print(f'[cyan]Loading Whisper model: {self.model_name} on {self.device} with compute_type={compute_type}[/cyan]') - self._model = WhisperModel(self.model_name, device=self.device, compute_type=compute_type) + console.print( + f'[cyan]Loading Whisper: {self.__model_name} on {self.__device} ({compute_type})[/cyan]', + ) + + self.__model = WhisperModel( + self.__model_name, + device=self.__device, + compute_type=compute_type, + ) console.print('[green]Whisper model loaded[/green]') - return self._model + return self.__model diff --git a/preprocessor/services/ui/console.py b/preprocessor/services/ui/console.py index 926ab547f..41c1aeeae 100644 --- a/preprocessor/services/ui/console.py +++ b/preprocessor/services/ui/console.py @@ -1,85 +1,92 @@ import os import sys import time +from typing import ( + Any, + Dict, + Optional, +) from rich.console import Console from preprocessor.services.core.time import TimeFormatter -_console_instance = None +_console_instance: Optional[Console] = None + def __get_console() -> Console: global _console_instance # pylint: disable=global-statement if _console_instance is None: - in_docker = ( + _console_instance = __initialize_rich_console() + return _console_instance + + +def __initialize_rich_console() -> Console: + in_docker = ( os.path.exists('/.dockerenv') or os.getenv('DOCKER_CONTAINER', 'false') == 'true' - ) - color_system = 'standard' if in_docker else 'auto' - 
_console_instance = Console( - force_terminal=True, - file=sys.stderr, - color_system=color_system, - ) - return _console_instance + ) + return Console( + force_terminal=True, + file=sys.stderr, + color_system='standard' if in_docker else 'auto', + ) -class SimpleProgress: +class SimpleProgress: def __init__(self) -> None: - self.tasks = {} - self.task_counter = 0 - self.console = console + self.__tasks: Dict[int, Dict[str, Any]] = {} + self.__task_counter = 0 + self.__console = console + + def __enter__(self) -> 'SimpleProgress': + return self + + def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: + self.__tasks.clear() def add_task(self, description: str, total: int) -> int: - task_id = self.task_counter - self.task_counter += 1 - self.tasks[task_id] = { + task_id = self.__task_counter + self.__task_counter += 1 + + self.__tasks[task_id] = { 'description': description, 'total': total, 'completed': 0, 'start_time': time.time(), - 'last_print': 0, + 'last_print_time': 0.0, } - self.__print_progress(task_id) + + self.__render_progress(task_id) return task_id - def advance(self, task_id: int, advance: int=1) -> None: - if task_id not in self.tasks: + def advance(self, task_id: int, step: int = 1) -> None: + task = self.__tasks.get(task_id) + if not task: return - task = self.tasks[task_id] - task['completed'] += advance + + task['completed'] += step current_time = time.time() - if current_time - task['last_print'] >= 1.0 or task['completed'] >= task['total']: - self.__print_progress(task_id) - task['last_print'] = current_time - def __enter__(self) -> 'SimpleProgress': - return self + if self.__should_render(task, current_time): + self.__render_progress(task_id) + task['last_print_time'] = current_time - def __exit__(self, exc_type, exc_val, exc_tb) -> None: - pass + def __should_render(self, task: Dict[str, Any], current_time: float) -> bool: + is_finished = task['completed'] >= task['total'] + is_second_passed = (current_time - 
task['last_print_time']) >= 1.0 + return is_finished or is_second_passed - def __print_progress(self, task_id: int) -> None: - task = self.tasks[task_id] + def __render_progress(self, task_id: int) -> None: + task = self.__tasks[task_id] completed = task['completed'] total = task['total'] - percent = completed / total * 100 if total > 0 else 0 - elapsed = time.time() - task['start_time'] - if 0 < completed < total: - eta_seconds = elapsed / completed * (total - completed) - eta = TimeFormatter.format_hms(eta_seconds) - elif completed >= total: - eta = '0:00:00' - else: - eta = '-:--:--' - bar_width = 30 - filled = int(bar_width * completed / total) if total > 0 else 0 - if filled < bar_width: - progress_bar = '=' * filled + '>' + '-' * (bar_width - filled - 1) - else: - progress_bar = '=' * bar_width - - console.print( + + percent = (completed / total * 100) if total > 0 else 0 + eta = self.__compute_task_eta(task) + progress_bar = self.__build_visual_bar(completed, total) + + self.__console.print( f"[bold blue]{task['description']}[/bold blue] " f"[cyan]{progress_bar}[/cyan] " f"[green]{percent:3.0f}%[/green] " @@ -88,7 +95,31 @@ def __print_progress(self, task_id: int) -> None: highlight=False, ) + def __compute_task_eta(self, task: Dict[str, Any]) -> str: + completed = task['completed'] + total = task['total'] + + if completed >= total: + return '0:00:00' + if completed <= 0: + return '-:--:--' + + elapsed = time.time() - task['start_time'] + eta_seconds = (elapsed / completed) * (total - completed) + return TimeFormatter.format_hms(eta_seconds) + + def __build_visual_bar(self, completed: int, total: int, width: int = 30) -> str: + if total <= 0: + return '-' * width + + filled_length = int(width * completed / total) + if filled_length < width: + return '=' * filled_length + '>' + '-' * (width - filled_length - 1) + return '=' * width + + def create_progress() -> SimpleProgress: return SimpleProgress() + console = __get_console() diff --git 
a/preprocessor/services/ui/progress.py b/preprocessor/services/ui/progress.py index aad14ee39..e27e40e41 100644 --- a/preprocessor/services/ui/progress.py +++ b/preprocessor/services/ui/progress.py @@ -6,37 +6,53 @@ class ProgressTracker: - def __init__(self) -> None: - self.current_operation: Optional[str] = None - self.start_time: Optional[float] = None + self.__current_operation: Optional[str] = None # pylint: disable=unused-private-member + self.__start_time: Optional[float] = None # pylint: disable=unused-private-member -class OperationTracker: +class OperationTracker: def __init__(self, operation_name: str, total: int, start_time: float) -> None: - self.operation_name = operation_name - self.total = total - self.completed = 0 - self.start_time = start_time - self.last_report = 0 - - def update(self, completed: int, interval: int=10) -> None: - self.completed = completed - should_report = completed % interval == 0 or completed == self.total or completed == 1 - if should_report and completed != self.last_report: + self.__operation_name = operation_name + self.__total = total + self.__completed = 0 + self.__start_time = start_time + self.__last_report_count = 0 + + def update(self, completed: int, interval: int = 10) -> None: + self.__completed = completed + + if self.__should_report_progress(completed, interval): self.__report_progress() - self.last_report = completed + self.__last_report_count = completed + + def __should_report_progress(self, completed: int, interval: int) -> bool: + if completed == self.__last_report_count: + return False + + is_milestone = (completed % interval == 0) or (completed == self.__total) or (completed == 1) + return is_milestone def __report_progress(self) -> None: - elapsed = time.time() - self.start_time - percent = self.completed / self.total * 100 if self.total > 0 else 0 - if 0 < self.completed < self.total: - rate = self.completed / elapsed if elapsed > 0 else 0 - remaining = self.total - self.completed - eta_seconds = 
remaining / rate if rate > 0 else 0 - eta = TimeFormatter.format_hms(eta_seconds) if eta_seconds > 0 else '0:00:00' - elif self.completed >= self.total: - eta = '0:00:00' - else: - eta = '-:--:--' - console.print(f' [dim]{self.operation_name}: {self.completed}/{self.total} ({percent:.0f}%) ETA: {eta}[/dim]') + percent = (self.__completed / self.__total * 100) if self.__total > 0 else 0 + eta = self.__calculate_eta() + + console.print( + f' [dim]{self.__operation_name}: {self.__completed}/{self.__total} ' + f'({percent:.0f}%) ETA: {eta}[/dim]', + ) + + def __calculate_eta(self) -> str: + elapsed = time.time() - self.__start_time + + if self.__completed >= self.__total: + return '0:00:00' + + if self.__completed <= 0: + return '-:--:--' + + rate = self.__completed / elapsed if elapsed > 0 else 0 + remaining = self.__total - self.__completed + eta_seconds = remaining / rate if rate > 0 else 0 + + return TimeFormatter.format_hms(eta_seconds) if eta_seconds > 0 else '0:00:00' diff --git a/preprocessor/services/validation/base_result.py b/preprocessor/services/validation/base_result.py index 2719268ae..2e75f1112 100644 --- a/preprocessor/services/validation/base_result.py +++ b/preprocessor/services/validation/base_result.py @@ -24,8 +24,13 @@ def status(self) -> str: @dataclass class BaseValidationResult(ValidationStatusMixin): errors: List[str] = field(default_factory=list) - stats: Dict[str, Any] = field(default_factory=dict) warnings: List[str] = field(default_factory=list) + stats: Dict[str, Any] = field(default_factory=dict) def to_dict(self) -> Dict[str, Any]: - return {'status': self.status, 'errors': self.errors, 'warnings': self.warnings, 'stats': self.stats} + return { + 'status': self.status, + 'errors': self.errors, + 'warnings': self.warnings, + 'stats': self.stats, + } diff --git a/preprocessor/services/validation/episode_stats.py b/preprocessor/services/validation/episode_stats.py index 4e8400642..7728b1f1a 100644 --- 
a/preprocessor/services/validation/episode_stats.py +++ b/preprocessor/services/validation/episode_stats.py @@ -3,9 +3,10 @@ field, ) from typing import ( + Any, + Dict, List, Optional, - Tuple, TypedDict, ) @@ -22,104 +23,61 @@ TranscriptionValidator, VideoValidator, ) -from preprocessor.services.validation.validators.base_validator import BaseValidator class EpisodeStatsData(TypedDict, total=False): - """Type-safe dict for episode statistics data.""" transcription_chars: Optional[int] transcription_duration: Optional[float] transcription_words: Optional[int] exported_frames_count: Optional[int] exported_frames_total_size_mb: Optional[float] - exported_frames_avg_resolution: Optional[Tuple[int, int]] video_size_mb: Optional[float] video_duration: Optional[float] - video_codec: Optional[str] - video_resolution: Optional[Tuple[int, int]] scenes_count: Optional[int] - scenes_avg_duration: Optional[float] - image_hashes_count: Optional[int] - character_visualizations_count: Optional[int] - face_clusters_count: Optional[int] - face_clusters_total_faces: Optional[int] - object_detections_count: Optional[int] - object_visualizations_count: Optional[int] - - -class EpisodeStatsDict(TypedDict): - """Type-safe dict representation of EpisodeStats.""" - status: str - errors: List[str] - warnings: List[str] - stats: EpisodeStatsData @dataclass -class EpisodeStats(ValidationStatusMixin): # pylint: disable=too-many-instance-attributes +class EpisodeStats(ValidationStatusMixin): episode_info: EpisodeInfo series_name: str - character_visualizations_count: Optional[int] = None errors: List[str] = field(default_factory=list) - exported_frames_avg_resolution: Optional[Tuple[int, int]] = None - exported_frames_count: Optional[int] = None - exported_frames_total_size_mb: Optional[float] = None - face_clusters_count: Optional[int] = None - face_clusters_total_faces: Optional[int] = None - image_hashes_count: Optional[int] = None - object_detections_count: Optional[int] = None - 
object_visualizations_count: Optional[int] = None - scenes_avg_duration: Optional[float] = None - scenes_count: Optional[int] = None + warnings: List[str] = field(default_factory=list) + + # Metryki transcription_chars: Optional[int] = None transcription_duration: Optional[float] = None transcription_words: Optional[int] = None - video_codec: Optional[str] = None + exported_frames_count: Optional[int] = None + exported_frames_total_size_mb: Optional[float] = None video_duration: Optional[float] = None - video_resolution: Optional[Tuple[int, int]] = None video_size_mb: Optional[float] = None - warnings: List[str] = field(default_factory=list) + scenes_count: Optional[int] = None def __post_init__(self) -> None: - self._validators: List[BaseValidator] = [ - TranscriptionValidator(), - FrameValidator(), - VideoValidator(), - SceneValidator(), - ImageHashValidator(), - CharacterValidator(), - FaceClusterValidator(), - ObjectValidator(), - ElasticValidator(), + self.__validators = [ + TranscriptionValidator(), FrameValidator(), VideoValidator(), + SceneValidator(), ImageHashValidator(), CharacterValidator(), + FaceClusterValidator(), ObjectValidator(), ElasticValidator(), ] def collect_stats(self) -> None: - for validator in self._validators: - validator.validate(self) + for v in self.__validators: + v.validate(self) - def to_dict(self) -> EpisodeStatsDict: + def to_dict(self) -> Dict[str, Any]: return { 'status': self.status, 'errors': self.errors, 'warnings': self.warnings, - 'stats': { - 'transcription_chars': self.transcription_chars, - 'transcription_duration': self.transcription_duration, - 'transcription_words': self.transcription_words, - 'exported_frames_count': self.exported_frames_count, - 'exported_frames_total_size_mb': self.exported_frames_total_size_mb, - 'exported_frames_avg_resolution': self.exported_frames_avg_resolution, - 'video_size_mb': self.video_size_mb, - 'video_duration': self.video_duration, - 'video_codec': self.video_codec, - 
'video_resolution': self.video_resolution, - 'scenes_count': self.scenes_count, - 'scenes_avg_duration': self.scenes_avg_duration, - 'image_hashes_count': self.image_hashes_count, - 'character_visualizations_count': self.character_visualizations_count, - 'face_clusters_count': self.face_clusters_count, - 'face_clusters_total_faces': self.face_clusters_total_faces, - 'object_detections_count': self.object_detections_count, - 'object_visualizations_count': self.object_visualizations_count, - }, + 'stats': self.__get_metric_map(), + } + + def __get_metric_map(self) -> Dict[str, Any]: + return { + 'transcription_chars': self.transcription_chars, + 'transcription_duration': self.transcription_duration, + 'exported_frames_count': self.exported_frames_count, + 'video_duration': self.video_duration, + 'video_size_mb': self.video_size_mb, + 'scenes_count': self.scenes_count, } diff --git a/preprocessor/services/validation/file_validators.py b/preprocessor/services/validation/file_validators.py index 8058328f2..e57729061 100644 --- a/preprocessor/services/validation/file_validators.py +++ b/preprocessor/services/validation/file_validators.py @@ -7,7 +7,6 @@ Dict, Optional, ) -import zipfile from PIL import Image @@ -25,100 +24,73 @@ class ValidationResult: error_message: Optional[str] = None metadata: Optional[Dict[str, Any]] = None -class FileValidator: +class FileValidator: @staticmethod def validate_image_file(path: Path) -> ValidationResult: - if error := FileValidator.__check_file_exists(path): - return error + err = FileValidator.__verify_existence(path) + if err: + return err try: with Image.open(path) as img: img.verify() with Image.open(path) as img: - width, height = img.size - format_type = img.format - size_mb = path.stat().st_size / (1024 * 1024) - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.WIDTH: width, - ValidationMetadataKeys.HEIGHT: height, - ValidationMetadataKeys.FORMAT: format_type, - ValidationMetadataKeys.SIZE_MB: 
round(size_mb, 2), - }, - ) + return ValidationResult( + is_valid=True, + metadata={ + ValidationMetadataKeys.WIDTH: img.size[0], + ValidationMetadataKeys.HEIGHT: img.size[1], + ValidationMetadataKeys.FORMAT: img.format, + ValidationMetadataKeys.SIZE_MB: round(path.stat().st_size / (1024 * 1024), 2), + }, + ) except Exception as e: - return ValidationResult(is_valid=False, error_message=f'Invalid image: {e}') + return ValidationResult(False, f'Invalid image: {e}') @staticmethod def validate_json_file(path: Path) -> ValidationResult: - if error := FileValidator.__check_file_exists(path): - return error + err = FileValidator.__verify_existence(path) + if err: + return err try: with open(path, 'r', encoding='utf-8') as f: json.load(f) - return ValidationResult( - is_valid=True, - metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size}, - ) - except json.JSONDecodeError as e: - return ValidationResult(is_valid=False, error_message=f'Invalid JSON: {e}') + return ValidationResult(True, metadata={ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size}) except Exception as e: - return ValidationResult(is_valid=False, error_message=f'Error reading file: {e}') + return ValidationResult(False, f'JSON error: {e}') @staticmethod def validate_jsonl_file(path: Path) -> ValidationResult: - if error := FileValidator.__check_file_exists(path): - return error + if err := FileValidator.__verify_existence(path): + return err try: line_count = 0 with open(path, 'r', encoding='utf-8') as f: - for line_num, line in enumerate(f, 1): - line = line.strip() - if not line: - continue - try: + for line in f: + if line.strip(): json.loads(line) line_count += 1 - except json.JSONDecodeError as e: - return ValidationResult( - is_valid=False, - error_message=f'Invalid JSON at line {line_num}: {e}', - ) return ValidationResult( - is_valid=True, + True, metadata={ ValidationMetadataKeys.SIZE_BYTES: path.stat().st_size, - ValidationMetadataKeys.LINE_COUNT: line_count, + 'line_count': 
line_count, }, ) except Exception as e: - return ValidationResult(is_valid=False, error_message=f'Error reading file: {e}') + return ValidationResult(False, f'JSONL error: {e}') @staticmethod def validate_video_file(path: Path) -> ValidationResult: - if error := FileValidator.__check_file_exists(path): - return error + err = FileValidator.__verify_existence(path) + if err: + return err try: - result = subprocess.run( - [ - 'ffprobe', '-v', 'error', '-select_streams', 'v:0', - '-show_entries', 'stream=codec_name,width,height,duration', - '-show_entries', 'format=duration,size', - '-of', 'json', str(path), - ], - capture_output=True, - text=True, - check=True, - ) - probe_data = json.loads(result.stdout) - stream = probe_data.get(FfprobeKeys.STREAMS, [{}])[0] - format_info = probe_data.get(FfprobeKeys.FORMAT, {}) - stream_duration = stream.get(FfprobeStreamKeys.DURATION) - format_duration = format_info.get(FfprobeFormatKeys.DURATION, 0) - duration = float(stream_duration or format_duration) - size_bytes = int(format_info.get(FfprobeFormatKeys.SIZE, 0)) - size_mb = size_bytes / (1024 * 1024) + probe = FileValidator.__run_ffprobe(path) + stream = probe.get(FfprobeKeys.STREAMS, [{}])[0] + fmt = probe.get(FfprobeKeys.FORMAT, {}) + duration = float(stream.get(FfprobeStreamKeys.DURATION) or fmt.get(FfprobeFormatKeys.DURATION, 0)) + return ValidationResult( is_valid=True, metadata={ @@ -126,48 +98,25 @@ def validate_video_file(path: Path) -> ValidationResult: ValidationMetadataKeys.WIDTH: stream.get(FfprobeStreamKeys.WIDTH), ValidationMetadataKeys.HEIGHT: stream.get(FfprobeStreamKeys.HEIGHT), ValidationMetadataKeys.DURATION: round(duration, 2), - ValidationMetadataKeys.SIZE_MB: round(size_mb, 2), + ValidationMetadataKeys.SIZE_MB: round(int(fmt.get(FfprobeFormatKeys.SIZE, 0)) / (1024 * 1024), 2), }, ) - except subprocess.CalledProcessError as e: - return ValidationResult(is_valid=False, error_message=f'ffprobe error: {e.stderr}') except Exception as e: - return 
ValidationResult(is_valid=False, error_message=f'Error validating video: {e}') + return ValidationResult(False, str(e)) @staticmethod - def __check_file_exists(path: Path) -> Optional[ValidationResult]: + def __verify_existence(path: Path) -> Optional[ValidationResult]: if not path.exists(): - return ValidationResult(is_valid=False, error_message=f'File does not exist: {path}') + return ValidationResult(False, f'Missing: {path}') return None @staticmethod - def __validate_archive_file(path: Path) -> ValidationResult: # pylint: disable=unused-private-member - if error := FileValidator.__check_file_exists(path): - return error - try: - with zipfile.ZipFile(path, 'r') as zip_ref: - bad_file = zip_ref.testzip() - if bad_file: - return ValidationResult(is_valid=False, error_message=f'Corrupt file in archive: {bad_file}') - file_count = len(zip_ref.namelist()) - compressed_size = sum((info.compress_size for info in zip_ref.infolist())) - uncompressed_size = sum((info.file_size for info in zip_ref.infolist())) - compression_ratio = 0 - if uncompressed_size > 0: - compression_ratio = (1 - compressed_size / uncompressed_size) * 100 - return ValidationResult( - is_valid=True, - metadata={ - ValidationMetadataKeys.SIZE_MB: round( - path.stat().st_size / (1024 * 1024), 2, - ), - 'file_count': file_count, - 'compressed_size_mb': round(compressed_size / (1024 * 1024), 2), - 'uncompressed_size_mb': round(uncompressed_size / (1024 * 1024), 2), - 'compression_ratio': round(compression_ratio, 2), - }, - ) - except zipfile.BadZipFile as e: - return ValidationResult(is_valid=False, error_message=f'Invalid ZIP file: {e}') - except Exception as e: - return ValidationResult(is_valid=False, error_message=f'Error validating archive: {e}') + def __run_ffprobe(path: Path) -> Dict[str, Any]: + res = subprocess.run( + [ + 'ffprobe', '-v', 'error', '-select_streams', 'v:0', '-show_entries', + 'stream=codec_name,width,height,duration:format=duration,size', '-of', 'json', str(path), + ], + 
capture_output=True, text=True, check=True, + ) + return json.loads(res.stdout) diff --git a/preprocessor/services/validation/global_validator.py b/preprocessor/services/validation/global_validator.py index f0a434aef..dab1de61a 100644 --- a/preprocessor/services/validation/global_validator.py +++ b/preprocessor/services/validation/global_validator.py @@ -8,83 +8,96 @@ class GlobalValidationResult(BaseValidationResult): pass -class GlobalValidator: +class GlobalValidator: def __init__(self, series_name: str, base_output_dir: Path) -> None: - self.series_name = series_name - self.base_output_dir = base_output_dir - self.result = GlobalValidationResult() + self.__series_name = series_name + self.__base_output_dir = base_output_dir + self.__result = GlobalValidationResult() def validate(self) -> GlobalValidationResult: - self.__validate_main_json_files() - self.__validate_characters_folder() - self.__validate_processing_metadata() - return self.result + self.__check_main_json_files() + self.__check_characters_assets() + self.__check_processing_metadata_store() + return self.__result - @staticmethod - def __get_character_images(char_folder: Path) -> List[Path]: - extensions = ['*.jpg', '*.jpeg', '*.png', '*.webp'] - image_files = [] - for ext in extensions: - image_files.extend(char_folder.glob(ext)) - return image_files - - def __validate_characters_folder(self) -> None: - characters_dir = self.base_output_dir / 'characters' - if not characters_dir.exists(): - self.result.warnings.append('Missing characters/ directory') + def __check_main_json_files(self) -> None: + files = [ + (f'{self.__series_name}_episodes.json', 'episodes_json_valid'), + (f'{self.__series_name}_characters.json', 'characters_json_valid'), + ] + for filename, stats_key in files: + self.__validate_json_at_path(self.__base_output_dir / filename, stats_key) + + def __check_characters_assets(self) -> None: + char_dir = self.__base_output_dir / 'characters' + if not char_dir.exists(): + 
self.__result.warnings.append('Missing characters/ directory') return - character_folders = [d for d in characters_dir.iterdir() if d.is_dir()] - if not character_folders: - self.result.warnings.append('No character folders in characters/') + + folders = [d for d in char_dir.iterdir() if d.is_dir()] + self.__result.stats['character_folders_count'] = len(folders) + + if not folders: + self.__result.warnings.append('No character folders in characters/') return - self.result.stats['character_folders_count'] = len(character_folders) - total_images = 0 - invalid_images = 0 - characters_without_images: List[str] = [] - for char_folder in character_folders: - image_files = self.__get_character_images(char_folder) - if not image_files: - characters_without_images.append(char_folder.name) + + self.__process_all_character_folders(folders) + + def __process_all_character_folders(self, folders: List[Path]) -> None: + counters = {'total': 0, 'invalid': 0, 'empty_chars': []} + + for folder in folders: + images = self.__get_image_files(folder) + if not images: + counters['empty_chars'].append(folder.name) continue - total_images += len(image_files) - for img_file in image_files: - result = FileValidator.validate_image_file(img_file) - if not result.is_valid: - invalid_images += 1 - self.result.errors.append(f'Invalid character image {char_folder.name}/{img_file.name}: {result.error_message}') - self.result.stats['character_images_count'] = total_images - self.result.stats['invalid_character_images'] = invalid_images - if characters_without_images: - self.result.warnings.append(f'{len(characters_without_images)} characters without reference images') - - def __validate_json_file(self, file_path: Path, stats_key: str) -> None: - if file_path.exists(): - result = FileValidator.validate_json_file(file_path) - if not result.is_valid: - self.result.errors.append(f'Invalid {file_path.name}: {result.error_message}') - else: - self.result.stats[stats_key] = True - else: - 
self.result.warnings.append(f'Missing {file_path.name}') - - def __validate_main_json_files(self) -> None: - episodes_file = self.base_output_dir / f'{self.series_name}_episodes.json' - self.__validate_json_file(episodes_file, 'episodes_json_valid') - characters_file = self.base_output_dir / f'{self.series_name}_characters.json' - self.__validate_json_file(characters_file, 'characters_json_valid') - - def __validate_processing_metadata(self) -> None: - metadata_dir = self.base_output_dir / 'processing_metadata' - if not metadata_dir.exists(): - self.result.warnings.append('Missing processing_metadata/ directory') + + counters['total'] += len(images) + counters['invalid'] += self.__validate_image_batch(images, folder.name) + + self.__result.stats['character_images_count'] = counters['total'] + self.__result.stats['invalid_character_images'] = counters['invalid'] + + if counters['empty_chars']: + self.__result.warnings.append(f'{len(counters["empty_chars"])} characters without images') + + def __validate_image_batch(self, images: List[Path], char_name: str) -> int: + invalid_count = 0 + for img in images: + v_res = FileValidator.validate_image_file(img) + if not v_res.is_valid: + invalid_count += 1 + self.__result.errors.append(f'Invalid image {char_name}/{img.name}: {v_res.error_message}') + return invalid_count + + def __check_processing_metadata_store(self) -> None: + meta_dir = self.__base_output_dir / 'processing_metadata' + if not meta_dir.exists(): + self.__result.warnings.append('Missing processing_metadata/ directory') return - json_files = list(metadata_dir.glob('*.json')) - if not json_files: - self.result.warnings.append('No JSON files in processing_metadata/') + + json_files = list(meta_dir.glob('*.json')) + self.__result.stats['processing_metadata_files'] = len(json_files) + + for f in json_files: + v_res = FileValidator.validate_json_file(f) + if not v_res.is_valid: + self.__result.errors.append(f'Invalid metadata {f.name}: {v_res.error_message}') + + 
def __validate_json_at_path(self, path: Path, stats_key: str) -> None: + if not path.exists(): + self.__result.warnings.append(f'Missing {path.name}') return - self.result.stats['processing_metadata_files'] = len(json_files) - for json_file in json_files: - result = FileValidator.validate_json_file(json_file) - if not result.is_valid: - self.result.errors.append(f'Invalid processing metadata {json_file.name}: {result.error_message}') + v_res = FileValidator.validate_json_file(path) + if not v_res.is_valid: + self.__result.errors.append(f'Invalid {path.name}: {v_res.error_message}') + else: + self.__result.stats[stats_key] = True + + @staticmethod + def __get_image_files(folder: Path) -> List[Path]: + found = [] + for ext in ('*.jpg', '*.jpeg', '*.png', '*.webp'): + found.extend(folder.glob(ext)) + return found diff --git a/preprocessor/services/validation/report_generator.py b/preprocessor/services/validation/report_generator.py index 20294c96a..b413f9049 100644 --- a/preprocessor/services/validation/report_generator.py +++ b/preprocessor/services/validation/report_generator.py @@ -12,11 +12,10 @@ class ReportGenerator: - def __init__(self, season: str, anomaly_threshold: float) -> None: - self.season = season - self.anomaly_threshold = anomaly_threshold - self.timestamp = datetime.now().isoformat() + self.__season = season + self.__anomaly_threshold = anomaly_threshold + self.__timestamp = datetime.now().isoformat() def generate_report( self, @@ -25,20 +24,16 @@ def generate_report( output_path: Path, ) -> Optional[Dict[str, Any]]: report = { - 'validation_timestamp': self.timestamp, - 'season': self.season, - 'anomaly_threshold': self.anomaly_threshold, - 'episodes': { - episode_id: stats.to_dict() - for episode_id, stats in episodes_stats.items() - }, + 'validation_timestamp': self.__timestamp, + 'season': self.__season, + 'anomaly_threshold': self.__anomaly_threshold, + 'episodes': {eid: s.to_dict() for eid, s in episodes_stats.items()}, 'season_comparison': 
season_comparison.to_dict(), } - self.__save_report(report, output_path) + self.__write_to_disk(report, output_path) return report - @staticmethod - def __save_report(report: Dict[str, Any], output_path: Path) -> None: - output_path.parent.mkdir(parents=True, exist_ok=True) - with open(output_path, 'w', encoding='utf-8') as f: - json.dump(report, f, indent=2, ensure_ascii=False) + def __write_to_disk(self, data: Dict[str, Any], path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, 'w', encoding='utf-8') as f: + json.dump(data, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/services/validation/season_comparator.py b/preprocessor/services/validation/season_comparator.py index 4c32554f6..76ed15cde 100644 --- a/preprocessor/services/validation/season_comparator.py +++ b/preprocessor/services/validation/season_comparator.py @@ -20,6 +20,7 @@ class MetricComparison: metric_name: str min_value: Optional[float] + @dataclass class Anomaly: avg: float @@ -29,6 +30,7 @@ class Anomaly: severity: str value: float + @dataclass class SeasonComparison: anomaly_threshold: float @@ -37,82 +39,69 @@ class SeasonComparison: metrics: Dict[str, MetricComparison] = field(default_factory=dict) def compare_episodes(self, episodes_stats: Dict[str, EpisodeStats]) -> None: - metric_keys = [ - 'transcription_duration', - 'transcription_chars', - 'transcription_words', - 'exported_frames_count', - 'exported_frames_total_size_mb', - 'video_size_mb', - 'video_duration', - 'scenes_count', + metrics_to_check = [ + 'transcription_duration', 'transcription_chars', 'transcription_words', + 'exported_frames_count', 'exported_frames_total_size_mb', + 'video_size_mb', 'video_duration', 'scenes_count', ] - for metric_key in metric_keys: - self.__compare_metric(metric_key, episodes_stats) + for key in metrics_to_check: + self.__analyze_metric_across_episodes(key, episodes_stats) def to_dict(self) -> Dict[str, Any]: return { 'metrics': { - metric_name: { - 
'min': metric.min_value, - 'max': metric.max_value, - 'avg': metric.avg_value, - 'difference_percent': metric.difference_percent, - } - for metric_name, metric in self.metrics.items() + name: { + 'min': m.min_value, 'max': m.max_value, + 'avg': m.avg_value, 'difference_percent': m.difference_percent, + } for name, m in self.metrics.items() }, 'anomalies': [ { - 'episode': anomaly.episode, - 'metric': anomaly.metric, - 'value': anomaly.value, - 'avg': anomaly.avg, - 'deviation_percent': anomaly.deviation_percent, - 'severity': anomaly.severity, - } - for anomaly in self.anomalies + 'episode': a.episode, 'metric': a.metric, 'value': a.value, + 'avg': a.avg, 'deviation_percent': a.deviation_percent, 'severity': a.severity, + } for a in self.anomalies ], } - def __compare_metric(self, metric_key: str, episodes_stats: Dict[str, EpisodeStats]) -> None: - values = [] - episode_values = {} - for episode_id, stats in episodes_stats.items(): - value = getattr(stats, metric_key, None) - if value is not None: - values.append(value) - episode_values[episode_id] = value - if not values: + def __analyze_metric_across_episodes(self, key: str, stats_dict: Dict[str, EpisodeStats]) -> None: + episode_values = { + ep_id: val for ep_id, s in stats_dict.items() + if (val := getattr(s, key, None)) is not None + } + + if not episode_values: return - min_val = min(values) - max_val = max(values) + + values = list(episode_values.values()) avg_val = sum(values) / len(values) - if min_val > 0: - diff_percent = (max_val - min_val) / min_val * 100 - else: - diff_percent = 0.0 - self.metrics[metric_key] = MetricComparison( - metric_name=metric_key, - min_value=round(min_val, 2), - max_value=round(max_val, 2), + + self.__calculate_metric_summary(key, values, avg_val) + self.__detect_anomalies_for_metric(key, episode_values, avg_val) + + def __calculate_metric_summary(self, key: str, values: List[float], avg_val: float) -> None: + min_v, max_v = min(values), max(values) + diff = ((max_v - min_v) / 
min_v * 100) if min_v > 0 else 0.0 + + self.metrics[key] = MetricComparison( + metric_name=key, + min_value=round(min_v, 2), + max_value=round(max_v, 2), avg_value=round(avg_val, 2), - difference_percent=round(diff_percent, 2), + difference_percent=round(diff, 2), + ) + + def __detect_anomalies_for_metric(self, key: str, ep_values: Dict[str, float], avg_val: float) -> None: + if avg_val <= 0: + return + + for ep_id, val in ep_values.items(): + deviation = abs((val - avg_val) / avg_val) * 100 + if deviation > self.anomaly_threshold: + self.anomalies.append(self.__create_anomaly_record(ep_id, key, val, avg_val, deviation)) + + def __create_anomaly_record(self, ep_id: str, key: str, val: float, avg: float, dev: float) -> Anomaly: + severity = 'ERROR' if dev > (self.anomaly_threshold * 2) else 'WARNING' + return Anomaly( + episode=ep_id, metric=key, value=round(val, 2), + avg=round(avg, 2), deviation_percent=round(dev, 2), severity=severity, ) - for episode_id, value in episode_values.items(): - if avg_val > 0: - deviation_percent = abs((value - avg_val) / avg_val) * 100 - else: - deviation_percent = 0.0 - if deviation_percent > self.anomaly_threshold: - threshold_doubled = self.anomaly_threshold * 2 - severity = 'ERROR' if deviation_percent > threshold_doubled else 'WARNING' - self.anomalies.append( - Anomaly( - episode=episode_id, - metric=metric_key, - value=round(value, 2), - avg=round(avg_val, 2), - deviation_percent=round(deviation_percent, 2), - severity=severity, - ), - ) diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index 9f9e34375..3495e4efa 100644 --- a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -1,7 +1,9 @@ from datetime import datetime from pathlib import Path from typing import ( + Any, Dict, + List, Optional, ) @@ -18,106 +20,137 @@ console = Console() -class Validator: +class Validator: def __init__( - self, - season: str, - 
series_name: str = 'ranczo', - anomaly_threshold: float = 20.0, - base_output_dir: Path = None, - episodes_info_json: Optional[Path] = None, - ): - self.season = season - self.series_name = series_name - self.anomaly_threshold = anomaly_threshold - self.base_output_dir = base_output_dir - self.episode_manager = EpisodeManager(episodes_info_json, series_name) - self.validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports + self, + season: str, + series_name: str = 'ranczo', + anomaly_threshold: float = 20.0, + base_output_dir: Optional[Path] = None, + episodes_info_json: Optional[Path] = None, + ) -> None: + self.__season = season + self.__series_name = series_name + self.__anomaly_threshold = anomaly_threshold + self.__base_output_dir = base_output_dir + self.__episode_manager = EpisodeManager(episodes_info_json, series_name) + self.__validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports def validate(self) -> int: - transcriptions_season_path = self.base_output_dir / 'transcriptions' / self.season - if not transcriptions_season_path.exists(): - console.print(f'[red]Season directory not found: {transcriptions_season_path}[/red]') + transcriptions_path = self.__base_output_dir / 'transcriptions' / self.__season + if not transcriptions_path.exists(): + console.print(f'[red]Season directory not found: {transcriptions_path}[/red]') return 1 - console.print(f'[bold cyan]Validating season {self.season}...[/bold cyan]') - episodes_stats = self.__collect_episodes_stats(transcriptions_season_path) + + console.print(f'[bold cyan]Validating season {self.__season}...[/bold cyan]') + + episodes_stats = self.__collect_all_episodes_stats(transcriptions_path) if not episodes_stats: - console.print(f'[red]No episodes found in {transcriptions_season_path}[/red]') + console.print(f'[red]No episodes found in {transcriptions_path}[/red]') return 1 - self.validation_reports_dir.mkdir(parents=True, exist_ok=True) - 
self.__generate_episode_reports(episodes_stats) - season_comparison = SeasonComparison(season=self.season, anomaly_threshold=self.anomaly_threshold) - season_comparison.compare_episodes(episodes_stats) - report_generator = ReportGenerator(season=self.season, anomaly_threshold=self.anomaly_threshold) - season_report_path = self.validation_reports_dir / f'{self.series_name}_{self.season}_season.json' - report_generator.generate_report(episodes_stats, season_comparison, season_report_path) - self.__print_summary(episodes_stats, season_comparison) - console.print(f'\n[green]Validation reports saved to: {self.validation_reports_dir}[/green]') + + self.__generate_reports_and_compare(episodes_stats) return 0 - def __collect_episodes_stats(self, transcriptions_season_path: Path) -> Dict[str, EpisodeStats]: - episode_dirs = sorted([d for d in transcriptions_season_path.iterdir() if d.is_dir() and d.name.startswith('E')]) - episodes_stats = {} - for episode_dir in track(episode_dirs, description='Collecting episode stats'): - episode_num = int(episode_dir.name[1:]) - season_num = int(self.season[1:]) - episode_info = self.episode_manager.get_episode_by_season_and_relative(season_num, episode_num) - if not episode_info: - console.print(f'[yellow]Skipping {episode_dir.name}: could not parse episode info[/yellow]') - continue - episode_id = episode_info.episode_code() - stats = EpisodeStats(episode_info=episode_info, series_name=self.series_name) + def __generate_reports_and_compare(self, episodes_stats: Dict[str, EpisodeStats]) -> None: + self.__validation_reports_dir.mkdir(parents=True, exist_ok=True) + + self.__save_individual_episode_reports(episodes_stats) + + comparison = SeasonComparison(season=self.__season, anomaly_threshold=self.__anomaly_threshold) + comparison.compare_episodes(episodes_stats) + + self.__generate_season_summary_report(episodes_stats, comparison) + self.__print_execution_summary(episodes_stats, comparison) + + console.print(f'\n[green]Validation 
reports saved to: {self.__validation_reports_dir}[/green]') + + def __collect_all_episodes_stats(self, season_path: Path) -> Dict[str, EpisodeStats]: + episode_dirs = sorted([d for d in season_path.iterdir() if d.is_dir() and d.name.startswith('E')]) + results: Dict[str, EpisodeStats] = {} + + for ep_dir in track(episode_dirs, description='Collecting episode stats'): + stats = self.__process_single_episode_dir(ep_dir) + if stats: + results[stats.episode_info.episode_code()] = stats + return results + + def __process_single_episode_dir(self, ep_dir: Path) -> Optional[EpisodeStats]: + try: + episode_num = int(ep_dir.name[1:]) + season_num = int(self.__season[1:]) + info = self.__episode_manager.get_episode_by_season_and_relative(season_num, episode_num) + + if not info: + console.print(f'[yellow]Skipping {ep_dir.name}: could not parse info[/yellow]') + return None + + stats = EpisodeStats(episode_info=info, series_name=self.__series_name) stats.collect_stats() - episodes_stats[episode_id] = stats - return episodes_stats + return stats + except ValueError: + return None - def __generate_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]) -> None: + def __save_individual_episode_reports(self, episodes_stats: Dict[str, EpisodeStats]) -> None: + path_manager = PathService(self.__series_name) for stats in episodes_stats.values(): - episode_report = { - 'validation_timestamp': datetime.now().isoformat(), - 'episode_id': stats.episode_info.episode_code(), - 'episode_title': stats.episode_info.title, - 'status': stats.status, - 'errors': stats.errors, - 'warnings': stats.warnings, - 'stats': stats.to_dict()['stats'], - } - path_manager = PathService(self.series_name) - report_filename = path_manager.build_filename(stats.episode_info, extension='json') - report_path = self.validation_reports_dir / report_filename - FileOperations.atomic_write_json(report_path, episode_report) - - def __print_summary(self, episodes_stats: Dict[str, EpisodeStats], season_comparison: 
SeasonComparison) -> None: - console.print(f'\n[bold]Validation Summary for {self.season}[/bold]') - console.print(f'Total episodes: {len(episodes_stats)}') - pass_count = sum((1 for stats in episodes_stats.values() if stats.status == 'PASS')) - warning_count = sum((1 for stats in episodes_stats.values() if stats.status == 'WARNING')) - fail_count = sum((1 for stats in episodes_stats.values() if stats.status == 'FAIL')) - console.print(f' [green]PASS:[/green] {pass_count}') - console.print(f' [yellow]WARNING:[/yellow] {warning_count}') - console.print(f' [red]FAIL:[/red] {fail_count}') - if season_comparison.anomalies: - console.print(f'\n[bold yellow]Anomalies detected: {len(season_comparison.anomalies)}[/bold yellow]') - for anomaly in season_comparison.anomalies[:5]: - color = 'red' if anomaly.severity == 'ERROR' else 'yellow' - msg = ( - f'{anomaly.metric} = {anomaly.value} ' - f'(avg: {anomaly.avg}, deviation: {anomaly.deviation_percent:.1f}%)' - ) - console.print(f' [{color}]{anomaly.episode}[/{color}]: {msg}') - if len(season_comparison.anomalies) > 5: - console.print(f' ... 
and {len(season_comparison.anomalies) - 5} more') - for episode_id, stats in episodes_stats.items(): + report = self.__build_episode_report_payload(stats) + filename = path_manager.build_filename(stats.episode_info, extension='json') + FileOperations.atomic_write_json(self.__validation_reports_dir / filename, report) + + def __generate_season_summary_report(self, stats: Dict[str, EpisodeStats], comparison: SeasonComparison) -> None: + generator = ReportGenerator(season=self.__season, anomaly_threshold=self.__anomaly_threshold) + report_path = self.__validation_reports_dir / f'{self.__series_name}_{self.__season}_season.json' + generator.generate_report(stats, comparison, report_path) + + def __print_execution_summary(self, stats: Dict[str, EpisodeStats], comparison: SeasonComparison) -> None: + console.print(f'\n[bold]Validation Summary for {self.__season}[/bold]') + console.print(f'Total episodes: {len(stats)}') + + self.__print_status_counts(stats) + self.__print_anomalies(comparison) + self.__print_issues(stats) + + def __build_episode_report_payload(self, stats: EpisodeStats) -> Dict[str, Any]: + return { + 'validation_timestamp': datetime.now().isoformat(), + 'episode_id': stats.episode_info.episode_code(), + 'episode_title': stats.episode_info.title, + 'status': stats.status, + 'errors': stats.errors, + 'warnings': stats.warnings, + 'stats': stats.to_dict()['stats'], + } + + def __print_status_counts(self, stats: Dict[str, EpisodeStats]) -> None: + counts = {'PASS': 0, 'WARNING': 0, 'FAIL': 0} + for s in stats.values(): + counts[s.status] += 1 + console.print(f' [green]PASS:[/green] {counts["PASS"]}') + console.print(f' [yellow]WARNING:[/yellow] {counts["WARNING"]}') + console.print(f' [red]FAIL:[/red] {counts["FAIL"]}') + + def __print_anomalies(self, comparison: SeasonComparison) -> None: + if not comparison.anomalies: + return + console.print(f'\n[bold yellow]Anomalies detected: {len(comparison.anomalies)}[/bold yellow]') + for anomaly in 
comparison.anomalies[:5]: + color = 'red' if anomaly.severity == 'ERROR' else 'yellow' + msg = f'{anomaly.metric} = {anomaly.value} (avg: {anomaly.avg}, dev: {anomaly.deviation_percent:.1f}%)' + console.print(f' [{color}]{anomaly.episode}[/{color}]: {msg}') + + def __print_issues(self, stats_dict: Dict[str, EpisodeStats]) -> None: + for ep_id, stats in stats_dict.items(): if stats.errors: - console.print(f'\n[red]Errors in {episode_id}:[/red]') - for error in stats.errors[:3]: - console.print(f' - {error}') - if len(stats.errors) > 3: - console.print(f' ... and {len(stats.errors) - 3} more') + self.__print_list('red', f'Errors in {ep_id}', stats.errors) if stats.warnings: - console.print(f'\n[yellow]Warnings in {episode_id}:[/yellow]') - for warning in stats.warnings[:3]: - console.print(f' - {warning}') - if len(stats.warnings) > 3: - console.print(f' ... and {len(stats.warnings) - 3} more') + self.__print_list('yellow', f'Warnings in {ep_id}', stats.warnings) + + @staticmethod + def __print_list(color: str, title: str, items: List[str]) -> None: + console.print(f'\n[{color}]{title}:[/{color}]') + for item in items[:3]: + console.print(f' - {item}') + if len(items) > 3: + console.print(f' ... 
and {len(items) - 3} more') diff --git a/preprocessor/services/validation/validators/base_validator.py b/preprocessor/services/validation/validators/base_validator.py index 69d0fb587..289790102 100644 --- a/preprocessor/services/validation/validators/base_validator.py +++ b/preprocessor/services/validation/validators/base_validator.py @@ -18,7 +18,6 @@ class BaseValidator(ABC): - @abstractmethod def validate(self, stats: 'EpisodeStats') -> None: pass diff --git a/preprocessor/services/validation/validators/character_validator.py b/preprocessor/services/validation/validators/character_validator.py index aa1a9e99a..a1ac2f0b9 100644 --- a/preprocessor/services/validation/validators/character_validator.py +++ b/preprocessor/services/validation/validators/character_validator.py @@ -9,7 +9,6 @@ class CharacterValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: VisualizationValidationHelper.validate_visualizations( stats, diff --git a/preprocessor/services/validation/validators/elastic_validator.py b/preprocessor/services/validation/validators/elastic_validator.py index 1e757eadb..ea601edd3 100644 --- a/preprocessor/services/validation/validators/elastic_validator.py +++ b/preprocessor/services/validation/validators/elastic_validator.py @@ -1,6 +1,10 @@ import json from pathlib import Path -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Any, + Dict, +) from preprocessor.config.constants import OUTPUT_FILE_NAMES from preprocessor.config.settings_instance import settings @@ -15,7 +19,6 @@ class ElasticValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: self.__validate_character_detections(stats) self.__validate_embeddings(stats) @@ -23,9 +26,7 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__validate_text_statistics(stats) def __validate_character_detections(self, stats: 'EpisodeStats') -> None: - char_detections_dir = PathService(stats.series_name).get_episode_dir( - 
stats.episode_info, settings.output_subdirs.character_detections, - ) + char_detections_dir = self.__get_dir(stats, settings.output_subdirs.character_detections) detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] self._validate_json_if_exists( @@ -35,10 +36,7 @@ def __validate_character_detections(self, stats: 'EpisodeStats') -> None: ) def __validate_embeddings(self, stats: 'EpisodeStats') -> None: - embeddings_dir = PathService(stats.series_name).get_episode_dir( - stats.episode_info, settings.output_subdirs.embeddings, - ) - + embeddings_dir = self.__get_dir(stats, settings.output_subdirs.embeddings) if embeddings_dir.exists(): embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] self._validate_json_if_exists( @@ -48,46 +46,38 @@ def __validate_embeddings(self, stats: 'EpisodeStats') -> None: ) def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: - elastic_subdirs = [ - ELASTIC_SUBDIRS.text_segments, - ELASTIC_SUBDIRS.text_embeddings, - ELASTIC_SUBDIRS.video_frames, - ELASTIC_SUBDIRS.episode_names, - ELASTIC_SUBDIRS.text_statistics, - ELASTIC_SUBDIRS.full_episode_embeddings, - ELASTIC_SUBDIRS.sound_events, - ELASTIC_SUBDIRS.sound_event_embeddings, + subdirs_to_check = [ + ELASTIC_SUBDIRS.text_segments, ELASTIC_SUBDIRS.text_embeddings, + ELASTIC_SUBDIRS.video_frames, ELASTIC_SUBDIRS.episode_names, + ELASTIC_SUBDIRS.text_statistics, ELASTIC_SUBDIRS.full_episode_embeddings, + ELASTIC_SUBDIRS.sound_events, ELASTIC_SUBDIRS.sound_event_embeddings, ] - found_elastic_docs = False - for subdir in elastic_subdirs: - elastic_base = settings.output_subdirs.elastic_documents - elastic_docs_dir = PathService(stats.series_name).get_episode_dir( - stats.episode_info, f'{elastic_base}/{subdir}', - ) + found_any = False + elastic_base = settings.output_subdirs.elastic_documents - if elastic_docs_dir.exists(): - found_elastic_docs = True - for jsonl_file in elastic_docs_dir.glob('*.jsonl'): - result = 
FileValidator.validate_jsonl_file(jsonl_file) - if not result.is_valid: - self._add_error(stats, f'Invalid JSONL {jsonl_file.name}: {result.error_message}') - else: - self.__validate_embedding_dimensions(stats, jsonl_file, subdir) + for subdir in subdirs_to_check: + docs_dir = self.__get_dir(stats, f'{elastic_base}/{subdir}') + if docs_dir.exists(): + found_any = True + self.__process_jsonl_files(stats, docs_dir, subdir) - if not found_elastic_docs: + if not found_any: self._add_warning(stats, f'Missing {settings.output_subdirs.elastic_documents} directory') - def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: - transcriptions_dir = PathService(stats.series_name).get_episode_dir( - stats.episode_info, settings.output_subdirs.transcriptions, - ) + def __process_jsonl_files(self, stats: 'EpisodeStats', docs_dir: Path, subdir: str) -> None: + for jsonl_file in docs_dir.glob('*.jsonl'): + result = FileValidator.validate_jsonl_file(jsonl_file) + if not result.is_valid: + self._add_error(stats, f'Invalid JSONL {jsonl_file.name}: {result.error_message}') + else: + self.__validate_embedding_dimensions(stats, jsonl_file, subdir) - if transcriptions_dir.exists(): + def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: + trans_dir = self.__get_dir(stats, settings.output_subdirs.transcriptions) + if trans_dir.exists(): clean_subdir = settings.output_subdirs.transcription_subdirs.clean - clean_dir = transcriptions_dir / clean_subdir - filename = f'{stats.series_name}_{stats.episode_info.episode_code()}_text_stats.json' - text_stats_file = clean_dir / filename + text_stats_file = trans_dir / clean_subdir / f'{stats.series_name}_{stats.episode_info.episode_code()}_text_stats.json' if text_stats_file.exists(): result = FileValidator.validate_json_file(text_stats_file) @@ -96,9 +86,7 @@ def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: else: self._add_warning(stats, f'Missing text statistics file: {text_stats_file.name}') - def 
__validate_embedding_dimensions( - self, stats: 'EpisodeStats', jsonl_file: Path, subdir: str, - ) -> None: + def __validate_embedding_dimensions(self, stats: 'EpisodeStats', jsonl_file: Path, subdir: str) -> None: embedding_fields = { ELASTIC_SUBDIRS.text_embeddings: 'text_embedding', ELASTIC_SUBDIRS.video_frames: 'video_embedding', @@ -110,8 +98,8 @@ def __validate_embedding_dimensions( if subdir not in embedding_fields: return - embedding_field = embedding_fields[subdir] expected_dim = settings.embedding_model.embedding_dim + field_name = embedding_fields[subdir] try: with open(jsonl_file, 'r', encoding='utf-8') as f: @@ -119,17 +107,18 @@ def __validate_embedding_dimensions( if not line.strip(): continue doc = json.loads(line) - if embedding_field in doc: - embedding = doc[embedding_field] - if isinstance(embedding, list): - actual_dim = len(embedding) - if actual_dim != expected_dim: - error_msg = ( - f'{jsonl_file.name} line {line_num}: ' - f'{embedding_field} has {actual_dim} dimensions, ' - f'expected {expected_dim}' - ) - self._add_error(stats, error_msg) - return + self.__check_doc_dimension(stats, doc, field_name, expected_dim, jsonl_file.name, line_num) except Exception as e: self._add_error(stats, f'Error validating embeddings in {jsonl_file.name}: {e}') + + def __check_doc_dimension( + self, stats: 'EpisodeStats', doc: Dict[str, Any], field: str, expected: int, fname: str, + lnum: int, + ) -> None: + if field in doc and isinstance(doc[field], list): + actual = len(doc[field]) + if actual != expected: + self._add_error(stats, f'{fname} line {lnum}: {field} has {actual} dim, expected {expected}') + + def __get_dir(self, stats: 'EpisodeStats', subdir: str) -> Path: + return PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py index a9bdc6808..bc63844a9 100644 --- 
a/preprocessor/services/validation/validators/face_cluster_validator.py +++ b/preprocessor/services/validation/validators/face_cluster_validator.py @@ -1,4 +1,3 @@ -import json from pathlib import Path from typing import ( TYPE_CHECKING, @@ -9,7 +8,6 @@ from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService -from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator if TYPE_CHECKING: @@ -17,7 +15,6 @@ class FaceClusterValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: clusters_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.face_clusters, @@ -26,44 +23,35 @@ def validate(self, stats: 'EpisodeStats') -> None: if not clusters_dir.exists(): return - metadata_files = list(clusters_dir.glob('*_face_clusters.json')) - metadata_file = metadata_files[0] if metadata_files else None - - if not metadata_file or not metadata_file.exists(): + metadata_file = self.__get_metadata_file(clusters_dir) + if not metadata_file: self._add_warning(stats, 'Missing face clustering metadata file') return - result = FileValidator.validate_json_file(metadata_file) - if not result.is_valid: - self._add_error(stats, f'Invalid face clustering metadata: {result.error_message}') + if not self._validate_json_with_error(stats, metadata_file, 'Missing metadata', 'Invalid face metadata'): return - data = self.__load_json_safely(metadata_file) - if not data: - self._add_error(stats, f'Error reading face clustering metadata: {metadata_file}') - return + data = self._load_json_safely(metadata_file) + if data: + self.__parse_cluster_stats(stats, data) + + def __get_metadata_file(self, clusters_dir: Path) -> Optional[Path]: + files = list(clusters_dir.glob('*_face_clusters.json')) + return files[0] if files else None + def __parse_cluster_stats(self, stats: 
'EpisodeStats', data: Dict[str, Any]) -> None: clusters = data.get('clusters', {}) - if isinstance(clusters, dict): - stats.face_clusters_count = len(clusters) - total_faces = sum((cluster_info.get('face_count', 0) for cluster_info in clusters.values())) - elif isinstance(clusters, list): + total_faces = 0 + + if isinstance(clusters, (dict, list)): stats.face_clusters_count = len(clusters) - total_faces = sum((cluster_info.get('face_count', 0) for cluster_info in clusters)) + items = clusters.values() if isinstance(clusters, dict) else clusters + total_faces = sum(item.get('face_count', 0) for item in items) else: self._add_warning(stats, 'Unexpected clusters format in face clustering metadata') return noise_info = data.get('noise', {}) - if noise_info: - total_faces += noise_info.get('face_count', 0) + total_faces += noise_info.get('face_count', 0) stats.face_clusters_total_faces = total_faces - - @staticmethod - def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: - try: - with open(file_path, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return None diff --git a/preprocessor/services/validation/validators/frame_validator.py b/preprocessor/services/validation/validators/frame_validator.py index fbe2ad06f..82c976e0c 100644 --- a/preprocessor/services/validation/validators/frame_validator.py +++ b/preprocessor/services/validation/validators/frame_validator.py @@ -1,4 +1,9 @@ -from typing import TYPE_CHECKING +from pathlib import Path +from typing import ( + TYPE_CHECKING, + List, + Tuple, +) from preprocessor.config.constants import OUTPUT_FILE_PATTERNS from preprocessor.config.settings_instance import settings @@ -11,16 +16,12 @@ class FrameValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: frames_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.frames, ) - if not frames_dir.exists(): - self._add_warning( - stats, f'Missing 
{settings.output_subdirs.frames} directory: {frames_dir}', - ) + if not self.__check_dir(stats, frames_dir): return frame_files = sorted(frames_dir.glob(OUTPUT_FILE_PATTERNS['frame'])) @@ -29,8 +30,17 @@ def validate(self, stats: 'EpisodeStats') -> None: return stats.exported_frames_count = len(frame_files) - total_size = 0 - resolutions = [] + self.__process_frames(stats, frame_files) + + def __check_dir(self, stats: 'EpisodeStats', frames_dir: Path) -> bool: + if not frames_dir.exists(): + self._add_warning(stats, f'Missing {settings.output_subdirs.frames} directory') + return False + return True + + def __process_frames(self, stats: 'EpisodeStats', frame_files: List[Path]) -> None: + total_size = 0.0 + resolutions: List[Tuple[int, int]] = [] invalid_count = 0 for frame_file in frame_files: @@ -47,5 +57,4 @@ def validate(self, stats: 'EpisodeStats') -> None: stats.exported_frames_total_size_mb = round(total_size, 2) if resolutions: - most_common_res = max(set(resolutions), key=resolutions.count) - stats.exported_frames_avg_resolution = most_common_res + stats.exported_frames_avg_resolution = max(set(resolutions), key=resolutions.count) diff --git a/preprocessor/services/validation/validators/image_hash_validator.py b/preprocessor/services/validation/validators/image_hash_validator.py index 9e26e6806..91f9b101f 100644 --- a/preprocessor/services/validation/validators/image_hash_validator.py +++ b/preprocessor/services/validation/validators/image_hash_validator.py @@ -9,7 +9,6 @@ class ImageHashValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: JsonDirectoryValidationHelper.validate_json_directory( stats, diff --git a/preprocessor/services/validation/validators/object_validator.py b/preprocessor/services/validation/validators/object_validator.py index 58bd44b8d..0d7c37cce 100644 --- a/preprocessor/services/validation/validators/object_validator.py +++ b/preprocessor/services/validation/validators/object_validator.py @@ -12,13 +12,11 @@ 
class ObjectValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: self.__validate_object_detections(stats) self.__validate_object_visualizations(stats) - @staticmethod - def __validate_object_detections(stats: 'EpisodeStats') -> None: + def __validate_object_detections(self, stats: 'EpisodeStats') -> None: JsonDirectoryValidationHelper.validate_json_directory( stats, settings.output_subdirs.object_detections, @@ -27,8 +25,7 @@ def __validate_object_detections(stats: 'EpisodeStats') -> None: exclude_pattern='visualizations', ) - @staticmethod - def __validate_object_visualizations(stats: 'EpisodeStats') -> None: + def __validate_object_visualizations(self, stats: 'EpisodeStats') -> None: VisualizationValidationHelper.validate_visualizations( stats, settings.output_subdirs.object_visualizations, diff --git a/preprocessor/services/validation/validators/scene_validator.py b/preprocessor/services/validation/validators/scene_validator.py index f9b32eb3f..a81aae5e6 100644 --- a/preprocessor/services/validation/validators/scene_validator.py +++ b/preprocessor/services/validation/validators/scene_validator.py @@ -1,10 +1,9 @@ -import json from pathlib import Path from typing import ( TYPE_CHECKING, Any, Dict, - Optional, + List, ) from preprocessor.config.constants import OUTPUT_FILE_PATTERNS @@ -18,37 +17,37 @@ class SceneValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + scenes_file = self.__resolve_scenes_file(stats) + + if not self._check_path_exists(scenes_file, stats, f'Missing scenes file: {scenes_file}'): + return + + if not self.__validate_json_integrity(stats, scenes_file): + return + + data = self._load_json_safely(scenes_file) + if data: + self.__extract_scene_stats(stats, data) + + def __resolve_scenes_file(self, stats: 'EpisodeStats') -> Path: scenes_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.scenes, ) - scenes_file = scenes_dir / 
f"{stats.series_name}_{stats.episode_info.episode_code()}{OUTPUT_FILE_PATTERNS['scenes_suffix']}" + suffix = OUTPUT_FILE_PATTERNS['scenes_suffix'] + return scenes_dir / f"{stats.series_name}_{stats.episode_info.episode_code()}{suffix}" - if not scenes_file.exists(): - self._add_error(stats, f'Missing scenes file: {scenes_file}') - return - - result = FileValidator.validate_json_file(scenes_file) + def __validate_json_integrity(self, stats: 'EpisodeStats', file_path: Path) -> bool: + result = FileValidator.validate_json_file(file_path) if not result.is_valid: self._add_error(stats, f'Invalid scenes JSON: {result.error_message}') - return - - data = self.__load_json_safely(scenes_file) - if not data: - self._add_error(stats, f'Error reading scenes: {scenes_file}') - return + return False + return True + def __extract_scene_stats(self, stats: 'EpisodeStats', data: Dict[str, Any]) -> None: stats.scenes_count = data.get('total_scenes', 0) - scenes = data.get('scenes', []) + scenes: List[Dict[str, Any]] = data.get('scenes', []) + if scenes: - durations = [scene.get('duration', 0) for scene in scenes] + durations = [s.get('duration', 0) for s in scenes] stats.scenes_avg_duration = round(sum(durations) / len(durations), 2) - - @staticmethod - def __load_json_safely(file_path: Path) -> Optional[Dict[str, Any]]: - try: - with open(file_path, 'r', encoding='utf-8') as f: - return json.load(f) - except Exception: - return None diff --git a/preprocessor/services/validation/validators/transcription_validator.py b/preprocessor/services/validation/validators/transcription_validator.py index cc1ff3a46..bd6f7ae0e 100644 --- a/preprocessor/services/validation/validators/transcription_validator.py +++ b/preprocessor/services/validation/validators/transcription_validator.py @@ -1,12 +1,13 @@ from pathlib import Path from typing import ( TYPE_CHECKING, + Any, Dict, + List, ) from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import 
PathService -from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator if TYPE_CHECKING: @@ -14,97 +15,91 @@ class TranscriptionValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: - transcriptions_dir = PathService(stats.series_name).get_episode_dir( - stats.episode_info, settings.output_subdirs.transcriptions, - ) - base_name = f'{stats.series_name}_{stats.episode_info.episode_code()}' - raw_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.raw - clean_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.clean - sound_events_dir = transcriptions_dir / settings.output_subdirs.transcription_subdirs.sound_events - - transcription_files = { - 'main': raw_dir / f'{base_name}.json', - 'segmented': raw_dir / f'{base_name}_segmented.json', - 'simple': raw_dir / f'{base_name}_simple.json', - 'clean': clean_dir / f'{base_name}_clean_transcription.json', - 'clean_txt': clean_dir / f'{base_name}_clean_transcription.txt', - 'sound_events': sound_events_dir / f'{base_name}_sound_events.json', - } + trans_files = self.__resolve_file_map(stats) - if not any((f.exists() for f in transcription_files.values())): + if not any(f.exists() for f in trans_files.values()): self._add_error(stats, 'No transcription files found in any format') return - self.__validate_raw_transcription(stats, transcription_files) - self.__validate_clean_transcription(stats, transcription_files['clean']) - self.__validate_clean_txt(stats, transcription_files['clean_txt']) - self.__validate_sound_events(stats, transcription_files['sound_events']) + self.__validate_raw_transcription(stats, trans_files) + self.__validate_clean_transcription(stats, trans_files['clean']) + self.__validate_clean_txt(stats, trans_files['clean_txt']) + self.__validate_sound_events(stats, trans_files['sound_events']) def __validate_raw_transcription( - self, stats: 
'EpisodeStats', transcription_files: Dict[str, Path], + self, stats: 'EpisodeStats', trans_files: Dict[str, Path], ) -> None: - raw_transcription = None - for key in ('main', 'segmented', 'simple'): - if transcription_files[key].exists(): - raw_transcription = transcription_files[key] - break - - if not raw_transcription: - self._add_warning( - stats, - 'Missing raw transcription file (checked: .json, _segmented.json, _simple.json)', - ) - return + # Try to find any available raw format + raw_path = next((trans_files[k] for k in ('main', 'segmented', 'simple') if trans_files[k].exists()), None) - result = FileValidator.validate_json_file(raw_transcription) - if not result.is_valid: - self._add_error(stats, f'Invalid transcription JSON: {result.error_message}') + if not raw_path: + self._add_warning(stats, 'Missing raw transcription file (.json, _segmented.json, or _simple.json)') return - self.__extract_transcription_stats(stats, raw_transcription) + if self._validate_json_if_exists(stats, raw_path, "Invalid transcription JSON"): + self.__extract_transcription_metrics(stats, raw_path) - def __extract_transcription_stats(self, stats: 'EpisodeStats', raw_transcription: Path) -> None: - data = self._load_json_safely(raw_transcription) + def __extract_transcription_metrics(self, stats: 'EpisodeStats', raw_path: Path) -> None: + data = self._load_json_safely(raw_path) if not data: - self._add_error(stats, f'Error reading transcription: {raw_transcription}') + self._add_error(stats, f'Error reading transcription: {raw_path}') return - text = data.get('text', '') - if not text: - segments = data.get('segments', []) - if segments: - text = ' '.join((seg.get('text', '') for seg in segments)) - + text = self.__get_full_text(data) stats.transcription_chars = len(text) stats.transcription_words = len(text.split()) + stats.transcription_duration = self.__determine_duration(data) + + def __get_full_text(self, data: Dict[str, Any]) -> str: + text = data.get('text', '') + if not 
text: + segments: List[Dict[str, Any]] = data.get('segments', []) + text = ' '.join(s.get('text', '') for s in segments) + return text - words = data.get('words', []) + def __determine_duration(self, data: Dict[str, Any]) -> float: + words: List[Dict[str, Any]] = data.get('words', []) if words: - stats.transcription_duration = words[-1].get('end', 0.0) - else: - segments = data.get('segments', []) - if segments and segments[-1].get('end'): - stats.transcription_duration = segments[-1].get('end', 0.0) + return words[-1].get('end', 0.0) + segments: List[Dict[str, Any]] = data.get('segments', []) + if segments and segments[-1].get('end'): + return segments[-1].get('end', 0.0) + return 0.0 - def __validate_clean_transcription(self, stats: 'EpisodeStats', clean_transcription_file: Path) -> None: + def __validate_clean_transcription(self, stats: 'EpisodeStats', file_path: Path) -> None: self._validate_json_with_warning( - stats, - clean_transcription_file, - missing_msg=f'Missing clean transcription file: {clean_transcription_file.name}', + stats, file_path, + missing_msg=f'Missing clean transcription: {file_path.name}', invalid_msg_prefix='Invalid clean transcription JSON', ) - def __validate_clean_txt(self, stats: 'EpisodeStats', clean_txt_file: Path) -> None: - if not clean_txt_file.exists(): - self._add_warning(stats, f'Missing clean transcription txt: {clean_txt_file.name}') + def __validate_clean_txt(self, stats: 'EpisodeStats', file_path: Path) -> None: + if not file_path.exists(): + self._add_warning(stats, f'Missing clean transcription txt: {file_path.name}') - def __validate_sound_events(self, stats: 'EpisodeStats', sound_events_file: Path) -> None: + def __validate_sound_events(self, stats: 'EpisodeStats', file_path: Path) -> None: self._validate_json_with_warning( - stats, - sound_events_file, - missing_msg=f'Missing sound events file: {sound_events_file.name}', + stats, file_path, + missing_msg=f'Missing sound events: {file_path.name}', 
invalid_msg_prefix='Invalid sound events JSON', ) + + def __resolve_file_map(self, stats: 'EpisodeStats') -> Dict[str, Path]: + path_svc = PathService(stats.series_name) + trans_dir = path_svc.get_episode_dir(stats.episode_info, settings.output_subdirs.transcriptions) + base = f'{stats.series_name}_{stats.episode_info.episode_code()}' + + raw_base = trans_dir / settings.output_subdirs.transcription_subdirs.raw + clean_base = trans_dir / settings.output_subdirs.transcription_subdirs.clean + sound_base = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events + + return { + 'main': raw_base / f'{base}.json', + 'segmented': raw_base / f'{base}_segmented.json', + 'simple': raw_base / f'{base}_simple.json', + 'clean': clean_base / f'{base}_clean_transcription.json', + 'clean_txt': clean_base / f'{base}_clean_transcription.txt', + 'sound_events': sound_base / f'{base}_sound_events.json', + } diff --git a/preprocessor/services/validation/validators/validation_helpers.py b/preprocessor/services/validation/validators/validation_helpers.py index 03865a708..6503c2f91 100644 --- a/preprocessor/services/validation/validators/validation_helpers.py +++ b/preprocessor/services/validation/validators/validation_helpers.py @@ -14,25 +14,23 @@ class JsonDirectoryValidationHelper: - @staticmethod def validate_json_directory( - stats: 'EpisodeStats', - subdir: str, - count_attr: Optional[str], - context_name: str, - exclude_pattern: Optional[str] = None, - check_anomalies: bool = True, + stats: 'EpisodeStats', + subdir: str, + count_attr: Optional[str], + context_name: str, + exclude_pattern: Optional[str] = None, + check_anomalies: bool = True, ) -> None: dir_path = PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) - count, sizes, errors = JsonDirectoryValidationHelper._validate_json_files_in_directory( - dir_path, exclude_pattern, - ) if not dir_path.exists(): stats.warnings.append(f'Missing {subdir} directory') return + count, sizes, errors = 
JsonDirectoryValidationHelper.__analyze_json_files(dir_path, exclude_pattern) + if count == 0: stats.warnings.append(f'No JSON files in {subdir}/') return @@ -43,15 +41,13 @@ def validate_json_directory( stats.errors.extend(errors) if check_anomalies: - JsonDirectoryValidationHelper._check_size_anomalies(stats, sizes, context_name) + JsonDirectoryValidationHelper.__perform_size_anomaly_check(stats, sizes, context_name) @staticmethod - def _validate_json_files_in_directory( - directory: Path, exclude_pattern: Optional[str] = None, + def __analyze_json_files( + directory: Path, + exclude_pattern: Optional[str], ) -> Tuple[int, List[int], List[str]]: - if not directory.exists(): - return 0, [], [] - json_files = [ f for f in directory.glob('*.json') if not exclude_pattern or exclude_pattern not in str(f) @@ -60,8 +56,9 @@ def _validate_json_files_in_directory( if not json_files: return 0, [], [] - sizes = [] - errors = [] + sizes: List[int] = [] + errors: List[str] = [] + for json_file in json_files: result = FileValidator.validate_json_file(json_file) if not result.is_valid: @@ -72,8 +69,11 @@ def _validate_json_files_in_directory( return len(json_files), sizes, errors @staticmethod - def _check_size_anomalies( - stats: 'EpisodeStats', sizes: List[int], folder_name: str, threshold: float = 0.2, + def __perform_size_anomaly_check( + stats: 'EpisodeStats', + sizes: List[int], + folder_name: str, + threshold: float = 0.2, ) -> None: if len(sizes) < 2: return @@ -85,41 +85,41 @@ def _check_size_anomalies( for i, size in enumerate(sizes): deviation = abs(size - avg_size) / avg_size if deviation > threshold: - warning_msg = ( - f'{folder_name} file #{i + 1} size deviation: ' - f'{deviation * 100:.1f}% from average' + stats.warnings.append( + f'{folder_name} file #{i + 1} size deviation: {deviation * 100:.1f}% from average', ) - stats.warnings.append(warning_msg) class VisualizationValidationHelper: - @staticmethod def validate_visualizations( - stats: 'EpisodeStats', 
subdir: str, count_attr: str, context_name: str, + stats: 'EpisodeStats', + subdir: str, + count_attr: str, + context_name: str, ) -> None: viz_dir = PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) - total_count, invalid_count, errors = VisualizationValidationHelper._validate_images_in_directory(viz_dir) + total, invalid, errors = VisualizationValidationHelper.__scan_images(viz_dir) - if total_count == 0 and viz_dir.exists(): + if total == 0 and viz_dir.exists(): stats.warnings.append(f'No visualization images in {subdir}/') return - if total_count > 0: - setattr(stats, count_attr, total_count) + if total > 0: + setattr(stats, count_attr, total) stats.errors.extend(errors) - if invalid_count > 0: - stats.warnings.append(f'{invalid_count} invalid {context_name} images found') + if invalid > 0: + stats.warnings.append(f'{invalid} invalid {context_name} images found') @staticmethod - def _validate_images_in_directory( - directory: Path, - extensions: Tuple[str, ...] = ('*.jpg', '*.png'), + def __scan_images( + directory: Path, + extensions: Tuple[str, ...] 
= ('*.jpg', '*.png'), ) -> Tuple[int, int, List[str]]: if not directory.exists(): return 0, 0, [] - image_files = [] + image_files: List[Path] = [] for ext in extensions: image_files.extend(directory.glob(ext)) @@ -127,7 +127,8 @@ def _validate_images_in_directory( return 0, 0, [] invalid_count = 0 - errors = [] + errors: List[str] = [] + for img_file in image_files: result = FileValidator.validate_image_file(img_file) if not result.is_valid: diff --git a/preprocessor/services/validation/validators/video_validator.py b/preprocessor/services/validation/validators/video_validator.py index a7d95c386..f20b336f3 100644 --- a/preprocessor/services/validation/validators/video_validator.py +++ b/preprocessor/services/validation/validators/video_validator.py @@ -1,10 +1,9 @@ +from pathlib import Path from typing import TYPE_CHECKING -from preprocessor.config.config import ( - get_base_output_dir, - settings, -) from preprocessor.config.constants import DEFAULT_VIDEO_EXTENSION +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.config.settings_instance import settings from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator @@ -13,22 +12,31 @@ class VideoValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: - filename = f'{stats.series_name.lower()}_{stats.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' - season_dir = get_base_output_dir(stats.series_name) / settings.output_subdirs.video / stats.episode_info.season_code() - video_file = season_dir / filename + video_path = self.__resolve_video_file_path(stats) - if not video_file.exists(): - self._add_warning(stats, f'Missing video file: {video_file}') + if not video_path.exists(): + self._add_warning(stats, f'Missing video file: {video_path}') return - result = FileValidator.validate_video_file(video_file) + result = 
FileValidator.validate_video_file(video_path) if not result.is_valid: self._add_error(stats, f'Invalid video: {result.error_message}') return - stats.video_size_mb = result.metadata['size_mb'] - stats.video_duration = result.metadata['duration'] - stats.video_codec = result.metadata['codec'] - stats.video_resolution = (result.metadata['width'], result.metadata['height']) + self.__populate_video_metrics(stats, result.metadata) + + def __resolve_video_file_path(self, stats: 'EpisodeStats') -> Path: + filename = f'{stats.series_name.lower()}_{stats.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' + season_dir = ( + get_base_output_dir(stats.series_name) / + settings.output_subdirs.video / + stats.episode_info.season_code() + ) + return season_dir / filename + + def __populate_video_metrics(self, stats: 'EpisodeStats', metadata: dict) -> None: + stats.video_size_mb = metadata['size_mb'] + stats.video_duration = metadata['duration'] + stats.video_codec = metadata['codec'] + stats.video_resolution = (metadata['width'], metadata['height']) diff --git a/preprocessor/services/video/discovery.py b/preprocessor/services/video/discovery.py index 026b2d329..45f3dd9f3 100644 --- a/preprocessor/services/video/discovery.py +++ b/preprocessor/services/video/discovery.py @@ -1,5 +1,8 @@ from pathlib import Path -from typing import List +from typing import ( + List, + Optional, +) class VideoDiscovery: @@ -7,13 +10,14 @@ class VideoDiscovery: @staticmethod def discover( - source_path: Path, - extensions: List[str] = None, + source_path: Path, + extensions: Optional[List[str]] = None, ) -> List[Path]: if extensions is None: extensions = VideoDiscovery.DEFAULT_EXTENSIONS - videos = [] + videos: List[Path] = [] for ext in extensions: videos.extend(source_path.glob(f"**/{ext}")) + return sorted(videos) diff --git a/preprocessor/services/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py index fd8b79e82..f31d30114 100644 --- 
a/preprocessor/services/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -11,14 +11,14 @@ from preprocessor.config.settings_instance import settings from preprocessor.services.core.logging import ErrorHandlingLogger -EMOTION_LABELS = ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise'] +EMOTION_LABELS: List[str] = ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise'] -class EmotionDetector: +class EmotionDetector: @staticmethod def detect( - face_image: np.ndarray, - model: HSEmotionRecognizer, + face_image: np.ndarray, + model: HSEmotionRecognizer, ) -> Tuple[str, float, Dict[str, float]]: try: emotion, scores = model.predict_emotions(face_image, logits=False) @@ -27,51 +27,40 @@ def detect( raise RuntimeError(f'Emotion detection failed: {e}') from e @staticmethod - def __clip_bbox( - x1: int, - y1: int, - x2: int, - y2: int, - width: int, - height: int, - ) -> Tuple[int, int, int, int]: - x1 = max(0, x1) - y1 = max(0, y1) - x2 = min(width, x2) - y2 = min(height, y2) - return x1, y1, x2, y2 + def _init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecognizer: + model_name = settings.emotion_detection.model_name + if logger: + logger.info(f'Loading HSEmotion model: {model_name}...') - @staticmethod - def _crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: try: - x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) - height, width = frame.shape[:2] - x1, y1, x2, y2 = EmotionDetector.__clip_bbox(x1, y1, x2, y2, width, height) - if x2 <= x1 or y2 <= y1: - return None - face_crop = frame[y1:y2, x1:x2] - return face_crop if face_crop.size > 0 else None - except Exception: - return None + fer = HSEmotionRecognizer(model_name=model_name) + if logger: + logger.info(f'HSEmotion model loaded: {model_name}') + return fer + except Exception as e: + raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e 
@staticmethod def _detect_batch( - face_images: List[np.ndarray], - model: HSEmotionRecognizer, - batch_size: int = 32, - logger: Optional[ErrorHandlingLogger] = None, + face_images: List[np.ndarray], + model: HSEmotionRecognizer, + batch_size: int = 32, + logger: Optional[ErrorHandlingLogger] = None, ) -> List[Optional[Tuple[str, float, Dict[str, float]]]]: results: List[Optional[Tuple[str, float, Dict[str, float]]]] = [] total = len(face_images) + for batch_start in range(0, total, batch_size): batch_end = min(batch_start + batch_size, total) batch = face_images[batch_start:batch_end] progress_pct = int(batch_end / total * 100) + if logger: logger.info( f'Processing emotion batch {batch_start}-{batch_end}/{total} ' f'({progress_pct}%)', ) + try: batch_results = model.predict_multi_emotions(batch, logits=False) for emotion, scores in batch_results: @@ -83,25 +72,43 @@ def _detect_batch( results.append(EmotionDetector.__process_emotion_result(emotion, scores)) except Exception: results.append(None) + return results @staticmethod - def _init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecognizer: - model_name = settings.emotion_detection.model_name - if logger: - logger.info(f'Loading HSEmotion model: {model_name}...') + def _crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: try: - fer = HSEmotionRecognizer(model_name=model_name) - if logger: - logger.info(f'HSEmotion model loaded: {model_name}') - return fer - except Exception as e: - raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e + x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) + height, width = frame.shape[:2] + + x1, y1, x2, y2 = EmotionDetector.__clip_bbox(x1, y1, x2, y2, width, height) + if x2 <= x1 or y2 <= y1: + return None + + face_crop = frame[y1:y2, x1:x2] + return face_crop if face_crop.size > 0 else None + except Exception: + return None + + @staticmethod + def __clip_bbox( + x1: int, + y1: int, + x2: int, + 
y2: int, + width: int, + height: int, + ) -> Tuple[int, int, int, int]: + x1 = max(0, x1) + y1 = max(0, y1) + x2 = min(width, x2) + y2 = min(height, y2) + return x1, y1, x2, y2 @staticmethod def __process_emotion_result( - emotion: str, - scores: np.ndarray, + emotion: str, + scores: np.ndarray, ) -> Tuple[str, float, Dict[str, float]]: emotion_scores = { EMOTION_LABELS[i]: float(scores[i]) diff --git a/preprocessor/services/video/frame_utils.py b/preprocessor/services/video/frame_utils.py index ef58db552..e4f7e0645 100644 --- a/preprocessor/services/video/frame_utils.py +++ b/preprocessor/services/video/frame_utils.py @@ -10,23 +10,36 @@ class FrameLoader: - @staticmethod - def load_from_requests(frames_dir: Path, frame_requests: List[Dict[str, Any]], convert_rgb: bool=False, num_workers: int=4) -> List[Image.Image]: + def load_from_requests( + frames_dir: Path, + frame_requests: List[Dict[str, Any]], + convert_rgb: bool = False, + num_workers: int = 4, + ) -> List[Image.Image]: with ThreadPoolExecutor(max_workers=num_workers) as executor: - images = list(executor.map(lambda req: FrameLoader.__load_single(frames_dir, req, convert_rgb), frame_requests)) + images = list( + executor.map( + lambda req: FrameLoader.__load_single(frames_dir, req, convert_rgb), + frame_requests, + ), + ) return images @staticmethod - def __load_single(frames_dir: Path, request: Dict[str, Any], convert_rgb: bool) -> Image.Image: + def __load_single( + frames_dir: Path, request: Dict[str, Any], convert_rgb: bool, + ) -> Image.Image: if 'frame_path' in request: frame_path = frames_dir / request['frame_path'] else: frame_num = request['frame_number'] frame_path = frames_dir / f'frame_{frame_num:06d}.jpg' + if frame_path.exists(): img = Image.open(frame_path) if convert_rgb and img.mode != 'RGB': img = img.convert('RGB') return img + return Image.new('RGB', (1, 1)) diff --git a/preprocessor/services/video/image_hasher.py b/preprocessor/services/video/image_hasher.py index b4dfcbb4d..1b2a4d511 
100644 --- a/preprocessor/services/video/image_hasher.py +++ b/preprocessor/services/video/image_hasher.py @@ -1,20 +1,35 @@ -from typing import Optional +from typing import ( + List, + Optional, +) +from PIL import Image import torch from torch import nn import torch.nn.functional as F -from torchvision import models +from torchvision import ( + models, + transforms, +) from torchvision.models import ResNet18_Weights class PerceptualHasher: - def __init__(self) -> None: + def __init__(self, device: str = 'cuda', hash_size: int = 8) -> None: + self.__device = device + self.__hash_size = hash_size base_model = models.resnet18(weights=ResNet18_Weights.DEFAULT) self.model: Optional[nn.Module] = nn.Sequential(*list(base_model.children())[:-1]) self.model.eval() - if torch.cuda.is_available(): + if device == 'cuda' and torch.cuda.is_available(): self.model = self.model.cuda() + self.__transform = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) def cleanup(self) -> None: if self.model is not None: @@ -23,6 +38,34 @@ def cleanup(self) -> None: if torch.cuda.is_available(): torch.cuda.empty_cache() + def compute_phash_batch(self, images: List[Image.Image]) -> List[str]: + if self.model is None: + raise RuntimeError('Model not initialized or already cleaned up') + + hashes: List[str] = [] + batch_tensors: List[torch.Tensor] = [] + + for img in images: + tensor = self.__transform(img) + batch_tensors.append(tensor) + + if batch_tensors: + batch = torch.stack(batch_tensors) + if self.__device == 'cuda' and torch.cuda.is_available(): + batch = batch.cuda() + + with torch.no_grad(): + features = self.model(batch) + features = F.adaptive_avg_pool2d(features, (1, 1)) + features = features.view(features.size(0), -1) + + for feature_vec in features: + hash_bits = (feature_vec > feature_vec.median()).int() + hash_str = ''.join([str(bit.item()) for 
bit in hash_bits[:self.__hash_size * self.__hash_size]]) + hashes.append(hash_str) + + return hashes + def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable=unused-private-member if self.model is None: raise RuntimeError('Model not initialized or already cleaned up') @@ -31,6 +74,7 @@ def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: disable= features = F.adaptive_avg_pool2d(features, (1, 1)) features = features.flatten() hash_bits = (features > features.median()).int() - hash_val = int(''.join([str(bit) for bit in hash_bits.tolist()[:64]]), 2) + hash_val = int(''.join([str(bit.item()) for bit in hash_bits.tolist()[:64]]), 2) return hash_val + __all__ = ['PerceptualHasher'] diff --git a/preprocessor/services/video/strategies/base_strategy.py b/preprocessor/services/video/strategies/base_strategy.py index 930186bf1..f8e4c6d91 100644 --- a/preprocessor/services/video/strategies/base_strategy.py +++ b/preprocessor/services/video/strategies/base_strategy.py @@ -11,7 +11,8 @@ class BaseKeyframeStrategy(ABC): - @abstractmethod - def extract_frame_requests(self, video_path: Path, data: Dict[str, Any]) -> List[Dict[str, Any]]: + def extract_frame_requests( + self, video_path: Path, data: Dict[str, Any], + ) -> List[Dict[str, Any]]: pass diff --git a/preprocessor/services/video/strategies/scene_changes_strategy.py b/preprocessor/services/video/strategies/scene_changes_strategy.py index db7abb2c9..ba84bba9e 100644 --- a/preprocessor/services/video/strategies/scene_changes_strategy.py +++ b/preprocessor/services/video/strategies/scene_changes_strategy.py @@ -3,6 +3,7 @@ Any, Dict, List, + Optional, ) from preprocessor.config.enums import FrameType @@ -11,42 +12,93 @@ class SceneChangesStrategy(BaseKeyframeStrategy): - def __init__(self, frames_per_scene: int) -> None: - self.frames_per_scene = frames_per_scene + self.__frames_per_scene = frames_per_scene - def extract_frame_requests(self, video_path: Path, data: Dict[str, Any]) -> 
List[Dict[str, Any]]: - scene_timestamps = data.get('scene_timestamps', {}) - scenes = scene_timestamps.get('scenes', []) + def extract_frame_requests( + self, video_path: Path, data: Dict[str, Any], + ) -> List[Dict[str, Any]]: + scenes = self.__extract_scenes(data) if not scenes: console.print('[yellow]No scene timestamps found[/yellow]') return [] + + fps = self.__extract_fps(data) + return self.__process_all_scenes(scenes, fps) + + def __process_all_scenes( + self, scenes: List[Dict[str, Any]], fps: float, + ) -> List[Dict[str, Any]]: + frame_requests: List[Dict[str, Any]] = [] + for i, scene in enumerate(scenes): + frame_requests.extend(self.__process_single_scene(scene, i, fps)) + return frame_requests + + def __process_single_scene( + self, scene: Dict[str, Any], scene_index: int, fps: float, + ) -> List[Dict[str, Any]]: + start_frame = scene.get('start', {}).get('frame', 0) + frame_count = scene.get('frame_count', 1) + + if frame_count <= 1: + return [ + self.__create_request(start_frame, fps, FrameType.SCENE_SINGLE, scene_index), + ] + + return self.__generate_multi_frame_requests( + start_frame, frame_count, scene_index, fps, + ) + + def __generate_multi_frame_requests( + self, start_frame: int, frame_count: int, scene_index: int, fps: float, + ) -> List[Dict[str, Any]]: + requests: List[Dict[str, Any]] = [] + for frame_idx in range(self.__frames_per_scene): + frame_number = self.__calculate_frame_number( + start_frame, frame_count, frame_idx, + ) + frame_type = self.__determine_frame_type(frame_idx) + requests.append( + self.__create_request(frame_number, fps, frame_type, scene_index), + ) + return requests + + def __calculate_frame_number( + self, start_frame: int, frame_count: int, frame_idx: int, + ) -> int: + position = frame_idx / (self.__frames_per_scene - 1) if self.__frames_per_scene > 1 else 0.0 + return int(start_frame + position * (frame_count - 1)) + + def __determine_frame_type(self, frame_idx: int) -> str: + if frame_idx == 0: + return 
FrameType.SCENE_START + if frame_idx == self.__frames_per_scene - 1: + return FrameType.SCENE_END + return FrameType.scene_mid(frame_idx) + + @staticmethod + def __extract_scenes(data: Dict[str, Any]) -> List[Dict[str, Any]]: + scene_timestamps = data.get('scene_timestamps', {}) + return scene_timestamps.get('scenes', []) + + @staticmethod + def __extract_fps(data: Dict[str, Any]) -> float: + scene_timestamps = data.get('scene_timestamps', {}) video_info = scene_timestamps.get('video_info', {}) fps = video_info.get('fps') if fps is None: raise ValueError('FPS not found in scene_timestamps video_info') - frame_requests = [] - for i, scene in enumerate(scenes): - start_frame = scene.get('start', {}).get('frame', 0) - frame_count = scene.get('frame_count', 1) - if frame_count <= 1: - frame_requests.append(self.__create_request(start_frame, fps, FrameType.SCENE_SINGLE, i)) - continue - for frame_idx in range(self.frames_per_scene): - position = frame_idx / (self.frames_per_scene - 1) if self.frames_per_scene > 1 else 0.0 - frame_number = int(start_frame + position * (frame_count - 1)) - if frame_idx == 0: - frame_type = FrameType.SCENE_START - elif frame_idx == self.frames_per_scene - 1: - frame_type = FrameType.SCENE_END - else: - frame_type = FrameType.scene_mid(frame_idx) - frame_requests.append(self.__create_request(frame_number, fps, frame_type, i)) - return frame_requests + return fps @staticmethod - def __create_request(frame: int, fps: float, type_name: str, scene_num: int=None) -> Dict[str, Any]: - req = {'frame_number': int(frame), 'timestamp': float(frame / fps), 'type': type_name} + def __create_request( + frame: int, fps: float, type_name: str, scene_num: Optional[int] = None, + ) -> Dict[str, Any]: + req: Dict[str, Any] = { + 'frame_number': int(frame), + 'timestamp': float(frame / fps), + 'type': type_name, + } if scene_num is not None: req['scene_number'] = scene_num return req diff --git a/preprocessor/services/video/strategies/strategy_factory.py 
b/preprocessor/services/video/strategies/strategy_factory.py index cd37e4ae5..db56415d0 100644 --- a/preprocessor/services/video/strategies/strategy_factory.py +++ b/preprocessor/services/video/strategies/strategy_factory.py @@ -4,9 +4,10 @@ class KeyframeStrategyFactory: - @staticmethod - def create(strategy_type: KeyframeStrategy, frames_per_scene: int=1) -> BaseKeyframeStrategy: + def create( + strategy_type: KeyframeStrategy, frames_per_scene: int = 1, + ) -> BaseKeyframeStrategy: if strategy_type == KeyframeStrategy.SCENE_CHANGES: return SceneChangesStrategy(frames_per_scene=frames_per_scene) raise ValueError(f'Unknown keyframe strategy: {strategy_type}') diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index f8d59d5a0..80eb2ff6f 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -3,6 +3,7 @@ import json from pathlib import Path from typing import ( + Any, Dict, List, Optional, @@ -17,106 +18,54 @@ class ResolutionAnalysisStep(PipelineStep[None, ResolutionAnalysisResult, TranscodeConfig]): + @property + def name(self) -> str: + return 'resolution_analysis' + + @property + def is_global(self) -> bool: + return True def execute( - self, input_data: None, context: ExecutionContext, + self, input_data: None, context: ExecutionContext, ) -> ResolutionAnalysisResult: - context.logger.info('=' * 80) - context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') - context.logger.info('=' * 80) + self.__log_analysis_header(context) video_paths = self.__find_video_files(context) if not video_paths: - context.logger.warning('No video files found - skipping resolution analysis') - context.mark_step_completed(self.name, 'all') - return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) + return self.__handle_missing_videos(context) video_info = self.__scan_resolutions(video_paths, 
context) if not video_info: - context.logger.warning('Failed to analyze videos - skipping') - context.mark_step_completed(self.name, 'all') - return ResolutionAnalysisResult(total_files=len(video_paths), upscaling_percentage=0.0) + return self.__handle_failed_analysis(video_paths, context) upscaling_pct = self.__analyze_and_report(video_info, context) self.__save_results_to_json(video_info, upscaling_pct, context) context.mark_step_completed(self.name, 'all') - return ResolutionAnalysisResult(total_files=len(video_info), upscaling_percentage=upscaling_pct) - - @property - def name(self) -> str: - return 'resolution_analysis' - - @property - def is_global(self) -> bool: - return True - - @staticmethod - def __find_video_files(context: ExecutionContext) -> List[Path]: - input_base = PathService.get_input_base() - series_path = input_base / context.series_name - - if not series_path.exists(): - return [] - - video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.m4v'} - video_files = [ - p for p in series_path.rglob('*') - if p.is_file() and p.suffix.lower() in video_extensions - ] - - return sorted(video_files) - - @staticmethod - def __scan_resolutions( - video_paths: List[Path], context: ExecutionContext, - ) -> List[Dict[str, any]]: - video_info = [] - - for video_path in video_paths: - try: - probe_data = FFmpegWrapper.probe_video(video_path) - width, height = FFmpegWrapper.get_resolution(probe_data) - sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) - field_order = FFmpegWrapper.get_field_order(probe_data) - - effective_width = int(width * sar_num / sar_denom) - - context.logger.info( - f'Analyzing interlacing for {video_path.name} ' - f'(field_order={field_order}, analyzing full video)...', - ) - has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing( - video_path, analysis_time=None, - ) - - metadata_vs_reality = ResolutionAnalysisStep.__validate_field_order( - field_order, has_interlacing, idet_stats, - ) - - if 
metadata_vs_reality != 'match': - context.logger.warning( - f'⚠ {video_path.name}: field_order={field_order} but idet says {metadata_vs_reality}!', - ) + return ResolutionAnalysisResult( + total_files=len(video_info), upscaling_percentage=upscaling_pct, + ) - video_info.append({ - 'filename': video_path.name, - 'width': effective_width, - 'height': height, - 'field_order': field_order, - 'needs_deinterlace': has_interlacing, - 'idet_stats': idet_stats, - 'metadata_match': metadata_vs_reality, - }) + def __log_analysis_header(self, context: ExecutionContext) -> None: + context.logger.info('=' * 80) + context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') + context.logger.info('=' * 80) - except Exception as e: # pylint: disable=broad-except - context.logger.warning(f'Failed to probe {video_path.name}: {e}') - continue + def __handle_missing_videos(self, context: ExecutionContext) -> ResolutionAnalysisResult: + context.logger.warning('No video files found - skipping resolution analysis') + context.mark_step_completed(self.name, 'all') + return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) - return video_info + def __handle_failed_analysis( + self, video_paths: List[Path], context: ExecutionContext, + ) -> ResolutionAnalysisResult: + context.logger.warning('Failed to analyze videos - skipping') + context.mark_step_completed(self.name, 'all') + return ResolutionAnalysisResult(total_files=len(video_paths), upscaling_percentage=0.0) def __analyze_and_report( - self, video_info: List[Dict[str, any]], context: ExecutionContext, + self, video_info: List[Dict[str, Any]], context: ExecutionContext, ) -> float: resolution_counts = Counter((v['width'], v['height']) for v in video_info) total_episodes = len(video_info) @@ -135,6 +84,26 @@ def __analyze_and_report( progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') + 
self.__log_resolution_distribution( + context, resolution_counts, total_episodes, target_width, target_height, + ) + self.__log_upscaling_warnings(context, upscaling_pct) + self.__log_interlacing_analysis( + context, progressive_count, needs_deinterlace_count, total_episodes, + ) + self.__log_metadata_warnings(context, metadata_mismatch_count) + + context.logger.info('=' * 80) + return upscaling_pct + + def __log_resolution_distribution( + self, + context: ExecutionContext, + resolution_counts: Counter, + total_episodes: int, + target_width: int, + target_height: int, + ) -> None: context.logger.info('') context.logger.info('Source Resolution Distribution:') context.logger.info('-' * 60) @@ -152,6 +121,7 @@ def __analyze_and_report( f'({self.__get_resolution_label(target_width, target_height)})', ) + def __log_upscaling_warnings(self, context: ExecutionContext, upscaling_pct: float) -> None: if upscaling_pct > 50: context.logger.warning('') context.logger.warning('⚠' * 30) @@ -169,88 +139,42 @@ def __analyze_and_report( '(enhanced quality params will be used)', ) + def __log_interlacing_analysis( + self, + context: ExecutionContext, + progressive_count: int, + needs_deinterlace_count: int, + total_episodes: int, + ) -> None: context.logger.info('') context.logger.info('Interlacing Analysis (based on idet, not metadata):') context.logger.info('-' * 60) context.logger.info( f' Progressive: {progressive_count} episodes ' - f'({(progressive_count/total_episodes)*100:.1f}%)', + f'({(progressive_count / total_episodes) * 100:.1f}%)', ) context.logger.info( f' Interlaced (needs deinterlace): {needs_deinterlace_count} episodes ' - f'({(needs_deinterlace_count/total_episodes)*100:.1f}%)', + f'({(needs_deinterlace_count / total_episodes) * 100:.1f}%)', ) - if metadata_mismatch_count > 0: + def __log_metadata_warnings(self, context: ExecutionContext, mismatch_count: int) -> None: + if mismatch_count > 0: context.logger.warning('') context.logger.warning( - f'⚠ WARNING: 
{metadata_mismatch_count} episodes have INCORRECT field_order metadata!', + f'⚠ WARNING: {mismatch_count} episodes have INCORRECT field_order metadata!', ) context.logger.warning( '⚠ Using idet analysis instead of metadata for deinterlacing decisions.', ) - context.logger.info('=' * 80) - - return upscaling_pct - - @staticmethod - def __validate_field_order( - field_order: str, has_interlacing: bool, idet_stats: Optional[Dict[str, int]], - ) -> str: - if not idet_stats: - return 'unknown' - - metadata_says_progressive = field_order in {'progressive', 'unknown'} - idet_says_progressive = not has_interlacing - - if metadata_says_progressive and idet_says_progressive: - return 'match' - if not metadata_says_progressive and not idet_says_progressive: - return 'match' - if metadata_says_progressive and not idet_says_progressive: - return 'interlaced (metadata wrong)' - return 'progressive (metadata wrong)' - - @staticmethod - def __get_resolution_label(width: int, height: int) -> str: - resolution_labels = { - (7680, 4320): '8K', - (3840, 2160): '4K', - (2560, 1440): '1440p', - (1920, 1080): '1080p', - (1280, 720): '720p', - (854, 480): '480p', - (640, 360): '360p', - (426, 240): '240p', - (256, 144): '144p', - } - - if (width, height) in resolution_labels: - return resolution_labels[(width, height)] - - if height >= 2000: - return '4K+' - if height >= 1400: - return '2K' - if height >= 1000: - return 'Full HD' - if height >= 700: - return 'HD' - if height >= 450: - return 'SD' - return 'Low' - - def __save_results_to_json( # pylint: disable=too-many-locals - self, - video_info: List[Dict[str, any]], - upscaling_pct: float, - context: ExecutionContext, + def __save_results_to_json( + self, + video_info: List[Dict[str, Any]], + upscaling_pct: float, + context: ExecutionContext, ) -> None: - output_base = PathService.get_output_base() - output_dir = output_base / context.series_name - output_dir.mkdir(parents=True, exist_ok=True) - output_file = output_dir / 
'resolution_analysis.json' + output_file = self.__resolve_output_file(context) resolution_counts = Counter((v['width'], v['height']) for v in video_info) total_episodes = len(video_info) @@ -263,11 +187,45 @@ def __save_results_to_json( # pylint: disable=too-many-locals 1 for v in video_info if (v['width'] * v['height']) < target_pixels ) - needs_deinterlace_count = sum(1 for v in video_info if v['needs_deinterlace']) progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') + result = self.__build_analysis_payload( + context, + video_info, + resolution_counts, + total_episodes, + target_width, + target_height, + target_pixels, + upscaling_count, + upscaling_pct, + progressive_count, + needs_deinterlace_count, + metadata_mismatch_count, + ) + + with open(output_file, 'w', encoding='utf-8') as f: + json.dump(result, f, indent=2, ensure_ascii=False) + + context.logger.info(f'Resolution analysis saved to: {output_file}') + + def __build_analysis_payload( # pylint: disable=too-many-arguments + self, + context: ExecutionContext, + video_info: List[Dict[str, Any]], + resolution_counts: Counter, + total_episodes: int, + target_width: int, + target_height: int, + target_pixels: int, + upscaling_count: int, + upscaling_pct: float, + progressive_count: int, + needs_deinterlace_count: int, + metadata_mismatch_count: int, + ) -> Dict[str, Any]: source_resolutions = [ { 'width': width, @@ -294,7 +252,7 @@ def __save_results_to_json( # pylint: disable=too-many-locals for v in sorted(video_info, key=lambda x: x['filename']) ] - result = { + return { 'analysis_date': datetime.now().isoformat(), 'series_name': context.series_name, 'target_resolution': { @@ -325,7 +283,120 @@ def __save_results_to_json( # pylint: disable=too-many-locals 'files': files_details, } - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(result, f, indent=2, ensure_ascii=False) + 
@staticmethod + def __find_video_files(context: ExecutionContext) -> List[Path]: + input_base = PathService.get_input_base() + series_path = input_base / context.series_name - context.logger.info(f'Resolution analysis saved to: {output_file}') + if not series_path.exists(): + return [] + + video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.m4v'} + video_files = [ + p for p in series_path.rglob('*') + if p.is_file() and p.suffix.lower() in video_extensions + ] + + return sorted(video_files) + + @staticmethod + def __scan_resolutions( + video_paths: List[Path], context: ExecutionContext, + ) -> List[Dict[str, Any]]: + video_info = [] + + for video_path in video_paths: + try: + probe_data = FFmpegWrapper.probe_video(video_path) + width, height = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + field_order = FFmpegWrapper.get_field_order(probe_data) + + effective_width = int(width * sar_num / sar_denom) + + context.logger.info( + f'Analyzing interlacing for {video_path.name} ' + f'(field_order={field_order}, analyzing full video)...', + ) + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing( + video_path, analysis_time=None, + ) + + metadata_vs_reality = ResolutionAnalysisStep.__validate_field_order( + field_order, has_interlacing, idet_stats, + ) + + if metadata_vs_reality != 'match': + context.logger.warning( + f'⚠ {video_path.name}: field_order={field_order} but idet says {metadata_vs_reality}!', + ) + + video_info.append({ + 'filename': video_path.name, + 'width': effective_width, + 'height': height, + 'field_order': field_order, + 'needs_deinterlace': has_interlacing, + 'idet_stats': idet_stats, + 'metadata_match': metadata_vs_reality, + }) + + except Exception as e: + context.logger.warning(f'Failed to probe {video_path.name}: {e}') + continue + + return video_info + + @staticmethod + def __validate_field_order( + field_order: str, has_interlacing: bool, idet_stats: 
Optional[Dict[str, int]], + ) -> str: + if not idet_stats: + return 'unknown' + + metadata_says_progressive = field_order in {'progressive', 'unknown'} + idet_says_progressive = not has_interlacing + + if metadata_says_progressive and idet_says_progressive: + return 'match' + if not metadata_says_progressive and not idet_says_progressive: + return 'match' + if metadata_says_progressive and not idet_says_progressive: + return 'interlaced (metadata wrong)' + return 'progressive (metadata wrong)' + + @staticmethod + def __get_resolution_label(width: int, height: int) -> str: + resolution_labels = { + (7680, 4320): '8K', + (3840, 2160): '4K', + (2560, 1440): '1440p', + (1920, 1080): '1080p', + (1280, 720): '720p', + (854, 480): '480p', + (640, 360): '360p', + (426, 240): '240p', + (256, 144): '144p', + } + + if (width, height) in resolution_labels: + return resolution_labels[(width, height)] + + if height >= 2000: + return '4K+' + if height >= 1400: + return '2K' + if height >= 1000: + return 'Full HD' + if height >= 700: + return 'HD' + if height >= 450: + return 'SD' + return 'Low' + + @staticmethod + def __resolve_output_file(context: ExecutionContext) -> Path: + output_base = PathService.get_output_base() + output_dir = output_base / context.series_name + output_dir.mkdir(parents=True, exist_ok=True) + return output_dir / 'resolution_analysis.json' diff --git a/preprocessor/steps/audio/separation_step.py b/preprocessor/steps/audio/separation_step.py index ec13e4c6f..f98eed5d3 100644 --- a/preprocessor/steps/audio/separation_step.py +++ b/preprocessor/steps/audio/separation_step.py @@ -28,48 +28,131 @@ class SoundSeparationStep(PipelineStep[TranscriptionData, TranscriptionData, SoundSeparationConfig]): + @property + def name(self) -> str: + return 'sound_separation' def execute( - self, - input_data: TranscriptionData, - context: ExecutionContext, + self, + input_data: TranscriptionData, + context: ExecutionContext, ) -> TranscriptionData: - output_paths = 
self._prepare_output_paths(input_data) - + output_paths = self.__resolve_output_paths(input_data) clean_json = output_paths['clean_json'] + if self._check_cache_validity(clean_json, context, input_data.episode_id, 'cached'): - return self._create_cached_result(output_paths, input_data) + return self.__construct_cached_result(output_paths, input_data) context.mark_step_started(self.name, input_data.episode_id) - transcription_data = self._load_transcription_data(input_data) - dialogue_segments, sound_segments = self._separate_dialogue_from_sounds( + + transcription_data = self.__load_transcription_payload(input_data) + dialogue_segments, sound_segments = self.__separate_dialogue_from_sounds( transcription_data['segments'], ) - self._save_separated_data( + + self.__save_separated_data( output_paths, transcription_data['episode_info'], dialogue_segments, sound_segments, ) - self._generate_additional_formats( + self.__generate_additional_formats( output_paths, dialogue_segments, sound_segments, ) + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_result_artifact(output_paths, input_data) - return self._create_result_artifact(output_paths, input_data) + def __separate_dialogue_from_sounds( + self, + segments: List[Dict[str, Any]], + ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + dialogue_segments = [] + sound_segments = [] - @property - def name(self) -> str: - return 'sound_separation' + for segment in segments: + classification = classify_segment(segment) + if classification == 'dialogue': + cleaned = self.__clean_segment_text(segment) + dialogue_segments.append(cleaned) + elif classification == 'sound_event': + cleaned = self.__clean_segment_text(segment) + cleaned['sound_type'] = 'sound' + sound_segments.append(cleaned) + elif classification == 'mixed': + dialogue_parts, sound_parts = self.__split_mixed_segment(segment) + dialogue_segments.extend(dialogue_parts) + sound_segments.extend(sound_parts) + + 
dialogue_segments = self.__renumber_segments(dialogue_segments) + sound_segments = self.__renumber_segments(sound_segments) + + return dialogue_segments, sound_segments + + def __split_mixed_segment( + self, + segment: Dict[str, Any], + ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + words = segment.get(WordKeys.WORDS, []) + dialogue_parts = [] + sound_parts = [] + current_type = None + current_words = [] + current_start = None + + for word in words: + word_type = 'sound' if is_sound_event(word) else 'dialogue' + if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: + if current_words: + current_words.append(word) + continue + + if word_type != current_type: + if current_words and current_type: + self.__finalize_sequence( + current_type, + current_words, + current_start, + dialogue_parts, + sound_parts, + ) + current_type = word_type + current_words = [word] + current_start = word.get(WordKeys.START) + else: + current_words.append(word) + + if current_words and current_type: + self.__finalize_sequence( + current_type, + current_words, + current_start, + dialogue_parts, + sound_parts, + ) + + return dialogue_parts, sound_parts + + def __generate_additional_formats( + self, + output_paths: Dict[str, Path], + dialogue_segments: List[Dict[str, Any]], + sound_segments: List[Dict[str, Any]], + ) -> None: + self.__generate_txt_file(output_paths['clean_json'], output_paths['clean_txt']) + self.__generate_txt_file(output_paths['sound_json'], output_paths['sound_txt']) + self.__generate_srt_file(dialogue_segments, output_paths['clean_srt']) + self.__generate_srt_file(sound_segments, output_paths['sound_srt']) @staticmethod - def _prepare_output_paths(input_data: TranscriptionData) -> Dict[str, Path]: + def __resolve_output_paths(input_data: TranscriptionData) -> Dict[str, Path]: base_name = input_data.path.stem.replace(FILE_SUFFIXES['segmented'], '') episode_dir = input_data.path.parent.parent clean_dir = episode_dir / 'clean' sound_dir = episode_dir / 
'sound_events' + clean_dir.mkdir(parents=True, exist_ok=True) sound_dir.mkdir(parents=True, exist_ok=True) @@ -84,9 +167,8 @@ def _prepare_output_paths(input_data: TranscriptionData) -> Dict[str, Path]: 'sound_srt': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}", } - @staticmethod - def _load_transcription_data(input_data: TranscriptionData) -> Dict[str, Any]: + def __load_transcription_payload(input_data: TranscriptionData) -> Dict[str, Any]: with open(input_data.path, 'r', encoding='utf-8') as f: data = json.load(f) return { @@ -94,38 +176,12 @@ def _load_transcription_data(input_data: TranscriptionData) -> Dict[str, Any]: 'segments': data.get('segments', []), } - def _separate_dialogue_from_sounds( - self, - segments: List[Dict[str, Any]], - ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - dialogue_segments = [] - sound_segments = [] - - for segment in segments: - classification = classify_segment(segment) - if classification == 'dialogue': - cleaned = self.__clean_segment_text(segment) - dialogue_segments.append(cleaned) - elif classification == 'sound_event': - cleaned = self.__clean_segment_text(segment) - cleaned['sound_type'] = 'sound' - sound_segments.append(cleaned) - elif classification == 'mixed': - dialogue_parts, sound_parts = self.__split_mixed_segment(segment) - dialogue_segments.extend(dialogue_parts) - sound_segments.extend(sound_parts) - - dialogue_segments = self.__renumber_segments(dialogue_segments) - sound_segments = self.__renumber_segments(sound_segments) - - return dialogue_segments, sound_segments - @staticmethod - def _save_separated_data( - output_paths: Dict[str, Path], - episode_info_dict: Dict[str, Any], - dialogue_segments: List[Dict[str, Any]], - sound_segments: List[Dict[str, Any]], + def __save_separated_data( + output_paths: Dict[str, Path], + episode_info_dict: Dict[str, Any], + dialogue_segments: List[Dict[str, Any]], + sound_segments: List[Dict[str, Any]], ) -> None: clean_data = 
{'episode_info': episode_info_dict, 'segments': dialogue_segments} sound_data = {'episode_info': episode_info_dict, 'segments': sound_segments} @@ -135,21 +191,10 @@ def _save_separated_data( FileOperations.atomic_write_json(output_paths['clean_segmented'], clean_data) FileOperations.atomic_write_json(output_paths['sound_segmented'], sound_data) - def _generate_additional_formats( - self, - output_paths: Dict[str, Path], - dialogue_segments: List[Dict[str, Any]], - sound_segments: List[Dict[str, Any]], - ) -> None: - self.__generate_txt_file(output_paths['clean_json'], output_paths['clean_txt']) - self.__generate_txt_file(output_paths['sound_json'], output_paths['sound_txt']) - self.__generate_srt_file(dialogue_segments, output_paths['clean_srt']) - self.__generate_srt_file(sound_segments, output_paths['sound_srt']) - @staticmethod - def _create_cached_result( - output_paths: Dict[str, Path], - input_data: TranscriptionData, + def __construct_cached_result( + output_paths: Dict[str, Path], + input_data: TranscriptionData, ) -> TranscriptionData: return TranscriptionData( path=output_paths['clean_json'], @@ -161,9 +206,9 @@ def _create_cached_result( ) @staticmethod - def _create_result_artifact( - output_paths: Dict[str, Path], - input_data: TranscriptionData, + def __construct_result_artifact( + output_paths: Dict[str, Path], + input_data: TranscriptionData, ) -> TranscriptionData: return TranscriptionData( path=output_paths['clean_json'], @@ -181,26 +226,30 @@ def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: text = re.sub('\\s+', ' ', text) cleaned['text'] = text.strip() words = cleaned.get(WordKeys.WORDS, []) + if words: non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] if non_spacing: cleaned[WordKeys.START] = min((w.get(WordKeys.START, 0) for w in non_spacing)) cleaned[WordKeys.END] = max((w.get(WordKeys.END, 0) for w in non_spacing)) + return cleaned @staticmethod def __finalize_sequence( - seq_type: str, - 
words: List[Dict[str, Any]], - start: float, - dialogue_parts: List[Dict[str, Any]], - sound_parts: List[Dict[str, Any]], + seq_type: str, + words: List[Dict[str, Any]], + start: float, + dialogue_parts: List[Dict[str, Any]], + sound_parts: List[Dict[str, Any]], ) -> None: non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] if not non_spacing: return + text = ''.join((w.get(WordKeys.TEXT, '') for w in words)) end = words[-1].get(WordKeys.END, start) + new_segment = { 'id': 0, 'text': text, @@ -208,6 +257,7 @@ def __finalize_sequence( WordKeys.END: end, WordKeys.WORDS: words, } + if seq_type == 'sound': new_segment['sound_type'] = 'sound' sound_parts.append(new_segment) @@ -229,8 +279,10 @@ def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: start = seg.get('start', 0) end = seg.get('end', 0) text = seg.get('text', '').strip() + start_time = SoundSeparationStep.__format_srt_time(start) end_time = SoundSeparationStep.__format_srt_time(end) + f.write(f'{idx}\n') f.write(f'{start_time} --> {end_time}\n') f.write(f'{text}\n\n') @@ -239,14 +291,17 @@ def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: def __generate_txt_file(json_path: Path, txt_path: Path) -> None: with open(json_path, 'r', encoding='utf-8') as f: data = json.load(f) + segments = data.get('segments', []) text_lines = [] + for seg in segments: text = seg.get('text', '').strip() text = re.sub('\\([^)]*\\)', '', text) text = re.sub('\\s+', ' ', text).strip() if text: text_lines.append(text) + with open(txt_path, 'w', encoding='utf-8') as f: f.write(' '.join(text_lines)) @@ -259,43 +314,3 @@ def __renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: for i, seg in enumerate(segments): seg['id'] = i return segments - - def __split_mixed_segment( - self, - segment: Dict[str, Any], - ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: - words = segment.get(WordKeys.WORDS, []) - dialogue_parts = [] 
- sound_parts = [] - current_type = None - current_words = [] - current_start = None - for word in words: - word_type = 'sound' if is_sound_event(word) else 'dialogue' - if word.get(WordKeys.TYPE) == WordTypeValues.SPACING: - if current_words: - current_words.append(word) - continue - if word_type != current_type: - if current_words and current_type: - self.__finalize_sequence( - current_type, - current_words, - current_start, - dialogue_parts, - sound_parts, - ) - current_type = word_type - current_words = [word] - current_start = word.get(WordKeys.START) - else: - current_words.append(word) - if current_words and current_type: - self.__finalize_sequence( - current_type, - current_words, - current_start, - dialogue_parts, - sound_parts, - ) - return dialogue_parts, sound_parts diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index 7a50438fb..66116088b 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -10,31 +10,35 @@ class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig]): + @property + def name(self) -> str: + return 'archive_generation' - def execute(self, input_data: ProcessedEpisode, context: ExecutionContext) -> ArchiveArtifact: - output_path = self._get_output_path(input_data, context) + def execute( + self, input_data: ProcessedEpisode, context: ExecutionContext, + ) -> ArchiveArtifact: + output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached archive'): - return self._create_archive_artifact(input_data, output_path) + return self.__construct_archive_artifact(input_data, output_path) context.logger.info(f'Generating archive for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - context.mark_step_completed(self.name, input_data.episode_id) - - return 
self._create_archive_artifact(input_data, output_path) - @property - def name(self) -> str: - return 'archive_generation' + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_archive_artifact(input_data, output_path) @staticmethod - def _get_output_path(input_data: ProcessedEpisode, context: ExecutionContext) -> Path: + def __resolve_output_path( + input_data: ProcessedEpisode, context: ExecutionContext, + ) -> Path: output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' return context.get_output_path(input_data.episode_info, 'archives', output_filename) - @staticmethod - def _create_archive_artifact(input_data: ProcessedEpisode, output_path: Path) -> ArchiveArtifact: + def __construct_archive_artifact( + input_data: ProcessedEpisode, output_path: Path, + ) -> ArchiveArtifact: return ArchiveArtifact( episode_id=input_data.episode_id, episode_info=input_data.episode_info, diff --git a/preprocessor/steps/scraping/character_scraper_step.py b/preprocessor/steps/scraping/character_scraper_step.py index c0275bda3..2f90caeef 100644 --- a/preprocessor/steps/scraping/character_scraper_step.py +++ b/preprocessor/steps/scraping/character_scraper_step.py @@ -6,13 +6,12 @@ class CharacterScraperStep(BaseScraperStep[CharacterScraperConfig]): + @property + def name(self) -> str: + return "scrape_characters" def _get_scraper_class(self) -> Type[CharacterScraper]: return CharacterScraper def _get_metadata_type_name(self) -> str: return "Characters" - - @property - def name(self) -> str: - return "scrape_characters" diff --git a/preprocessor/steps/scraping/episode_scraper_step.py b/preprocessor/steps/scraping/episode_scraper_step.py index 4de7f883a..d1aa3dae1 100644 --- a/preprocessor/steps/scraping/episode_scraper_step.py +++ b/preprocessor/steps/scraping/episode_scraper_step.py @@ -12,6 +12,9 @@ class EpisodeScraperStep(BaseScraperStep[EpisodeScraperConfig]): + @property + def name(self) -> str: + 
return "scrape_episodes" def _get_scraper_class(self) -> Type[EpisodeScraper]: return EpisodeScraper @@ -23,7 +26,3 @@ def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> D args = super()._build_scraper_args(output_path, context) args["merge_sources"] = self.config.merge_sources return args - - @property - def name(self) -> str: - return "scrape_episodes" diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index b17b53be2..0ec3069b7 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -1,5 +1,8 @@ from pathlib import Path -from typing import Optional +from typing import ( + Optional, + Tuple, +) from preprocessor.config.step_configs import CharacterReferenceConfig from preprocessor.core.artifacts import SourceVideo @@ -11,43 +14,34 @@ class CharacterReferenceStep( PipelineStep[SourceVideo, SourceVideo, CharacterReferenceConfig], ): + @property + def name(self) -> str: + return "process_character_references" + + @property + def is_global(self) -> bool: + return True + def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> Optional[SourceVideo]: - characters_path, output_dir = self._get_paths() - self._validate_characters_file(characters_path) + characters_path, output_dir = self.__resolve_paths() + self.__validate_characters_file(characters_path) - if output_dir.exists() and any(output_dir.iterdir()) and not context.force_rerun: + if self.__should_skip_processing(output_dir, context): context.logger.info(f"Character references already exist in: {output_dir}") return input_data - self._process_character_references(characters_path, output_dir, context) + self.__download_character_references(characters_path, output_dir, context) return input_data - @property - def name(self) -> str: - return "process_character_references" - - @property - def is_global(self) -> bool: - 
return True - - def _get_paths(self) -> tuple[Path, Path]: + def __resolve_paths(self) -> Tuple[Path, Path]: characters_path = Path(self.config.characters_file) output_dir = Path(self.config.output_dir) return characters_path, output_dir - @staticmethod - def _validate_characters_file(characters_path: Path) -> None: - if not characters_path.exists(): - raise FileNotFoundError( - f"Characters file not found: {characters_path}. " - f"Run scrape_characters first.", - ) - - - def _process_character_references( + def __download_character_references( self, characters_path: Path, output_dir: Path, @@ -73,3 +67,17 @@ def _process_character_references( ) context.logger.info(f"Character references saved to: {output_dir}") + + @staticmethod + def __should_skip_processing(output_dir: Path, context: ExecutionContext) -> bool: + if context.force_rerun: + return False + return output_dir.exists() and any(output_dir.iterdir()) + + @staticmethod + def __validate_characters_file(characters_path: Path) -> None: + if not characters_path.exists(): + raise FileNotFoundError( + f"Characters file not found: {characters_path}. 
" + f"Run scrape_characters first.", + ) diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 2897b5092..6615465cf 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -3,6 +3,8 @@ from typing import ( Any, Dict, + List, + Tuple, ) from preprocessor.config.step_configs import DocumentGenerationConfig @@ -16,103 +18,149 @@ class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): + @property + def name(self) -> str: + return 'document_generation' - def execute(self, input_data: Artifact, context: ExecutionContext) -> ElasticDocuments: - episode_info, episode_id = self._extract_episode_info(input_data) - output_dir = context.get_output_path(episode_info, 'elastic_documents', '') + def execute( + self, input_data: Artifact, context: ExecutionContext, + ) -> ElasticDocuments: + episode_info, episode_id = self.__extract_episode_info(input_data) + output_dir = self.__resolve_output_dir(episode_info, context) if self._check_cache_validity(output_dir, context, episode_id, 'cached'): - return self._create_empty_result(episode_id, episode_info, output_dir) + return self.__construct_elastic_documents(episode_id, episode_info, output_dir, 0) context.logger.info(f'Generating Elasticsearch documents for {episode_id}') context.mark_step_started(self.name, episode_id) data = self.__gather_input_data(episode_info, context) - total_docs = self._generate_documents(data, episode_info, context) + total_docs = self.__generate_documents(data, episode_info, context) context.mark_step_completed(self.name, episode_id) - return ElasticDocuments( - episode_id=episode_id, - episode_info=episode_info, - path=output_dir, - document_count=total_docs, + return self.__construct_elastic_documents(episode_id, episode_info, output_dir, total_docs) + + def __generate_documents( + self, + data: Dict[str, Any], + 
episode_info: Any, + context: ExecutionContext, + ) -> int: + total_docs = 0 + if self.config.generate_segments and 'transcription' in data: + _, count = self.__generate_segments_jsonl(data, episode_info, context) + total_docs += count + return total_docs + + def __generate_segments_jsonl( + self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext, + ) -> Tuple[Path, int]: + output_path = self.__resolve_segments_output_path(episode_info, context) + segments = data['transcription'].get('segments', []) + episode_metadata = self.__build_episode_metadata(episode_info, context) + video_bot_path = self.__build_video_bot_path(episode_info, context) + + count = self.__write_segments_to_jsonl( + segments, output_path, episode_info, episode_metadata, video_bot_path, ) + return output_path, count - @property - def name(self) -> str: - return 'document_generation' + @staticmethod + def __write_segments_to_jsonl( + segments: List[Dict[str, Any]], + output_path: Path, + episode_info: Any, + episode_metadata: Dict[str, Any], + video_bot_path: str, + ) -> int: + count = 0 + with open(output_path, 'w', encoding='utf-8') as f: + for i, segment in enumerate(segments): + doc = { + 'episode_id': episode_info.episode_code(), + 'episode_metadata': episode_metadata, + 'segment_id': i, + 'text': segment.get('text', '').strip(), + 'start_time': segment.get('start', 0.0), + 'end_time': segment.get('end', 0.0), + 'speaker': segment.get('speaker', 'unknown'), + 'video_path': video_bot_path, + } + f.write(json.dumps(doc, ensure_ascii=False) + '\n') + count += 1 + return count @staticmethod - def _extract_episode_info(input_data: Artifact) -> tuple[Any, str]: + def __extract_episode_info(input_data: Artifact) -> Tuple[Any, str]: if not hasattr(input_data, 'episode_info'): raise ValueError('Input artifact must have episode_info') + episode_info = getattr(input_data, 'episode_info') episode_id = getattr(input_data, 'episode_id') return episode_info, episode_id + @staticmethod + def 
__resolve_output_dir(episode_info: Any, context: ExecutionContext) -> Path: + return context.get_output_path(episode_info, 'elastic_documents', '') @staticmethod - def _create_empty_result(episode_id: str, episode_info: Any, output_dir: Path) -> ElasticDocuments: + def __resolve_segments_output_path(episode_info: Any, context: ExecutionContext) -> Path: + output_filename = f'{context.series_name}_{episode_info.episode_code()}_text_segments.jsonl' + return context.get_output_path( + episode_info, 'elastic_documents/text_segments', output_filename, + ) + + @staticmethod + def __build_video_bot_path(episode_info: Any, context: ExecutionContext) -> str: + filename = f'{context.series_name}_{episode_info.episode_code()}.mp4' + return f'bot/{context.series_name.upper()}-WIDEO/{episode_info.season_code()}/{filename}' + + @staticmethod + def __construct_elastic_documents( + episode_id: str, episode_info: Any, output_dir: Path, document_count: int, + ) -> ElasticDocuments: return ElasticDocuments( episode_id=episode_id, episode_info=episode_info, path=output_dir, - document_count=0, + document_count=document_count, ) - def _generate_documents( - self, - data: Dict[str, Any], - episode_info: Any, - context: ExecutionContext, - ) -> int: - total_docs = 0 - if self.config.generate_segments and 'transcription' in data: - _, count = self.__generate_segments_jsonl(data, episode_info, context) - total_docs += count - return total_docs - @staticmethod def __build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: - return {'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name} + return { + 'season': episode_info.season, + 'episode_number': episode_info.relative_episode, + 'series_name': context.series_name, + } @staticmethod def __gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: - data = {} - clean_filename = 
f'{context.series_name}_{episode_info.episode_code()}_clean_transcription.json' - clean_path = context.get_output_path(episode_info, 'transcriptions/clean', clean_filename) + data: Dict[str, Any] = {} + + clean_path = DocumentGeneratorStep.__resolve_input_path( + episode_info, context, 'transcriptions/clean', '_clean_transcription.json', + ) if clean_path.exists(): data['transcription'] = FileOperations.load_json(clean_path) - text_emb_filename = f'{context.series_name}_{episode_info.episode_code()}_embeddings_text.json' - text_emb_path = context.get_output_path(episode_info, 'embeddings', text_emb_filename) + + text_emb_path = DocumentGeneratorStep.__resolve_input_path( + episode_info, context, 'embeddings', '_embeddings_text.json', + ) if text_emb_path.exists(): data['text_embeddings'] = FileOperations.load_json(text_emb_path) - scene_filename = f'{context.series_name}_{episode_info.episode_code()}_scenes.json' - scene_path = context.get_output_path(episode_info, 'scene_timestamps', scene_filename) + + scene_path = DocumentGeneratorStep.__resolve_input_path( + episode_info, context, 'scene_timestamps', '_scenes.json', + ) if scene_path.exists(): data['scenes'] = FileOperations.load_json(scene_path) + return data - def __generate_segments_jsonl(self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext) -> tuple[Path, int]: - output_filename = f'{context.series_name}_{episode_info.episode_code()}_text_segments.jsonl' - output_path = context.get_output_path(episode_info, 'elastic_documents/text_segments', output_filename) - segments = data['transcription'].get('segments', []) - episode_metadata = self.__build_episode_metadata(episode_info, context) - filename = f'{context.series_name}_{episode_info.episode_code()}.mp4' - video_bot_path = f'bot/{context.series_name.upper()}-WIDEO/{episode_info.season_code()}/{filename}' - count = 0 - with open(output_path, 'w', encoding='utf-8') as f: - for i, segment in enumerate(segments): - doc = { - 'episode_id': 
episode_info.episode_code(), - 'episode_metadata': episode_metadata, - 'segment_id': i, - 'text': segment.get('text', '').strip(), - 'start_time': segment.get('start', 0.0), - 'end_time': segment.get('end', 0.0), - 'speaker': segment.get('speaker', 'unknown'), - 'video_path': video_bot_path, - } - f.write(json.dumps(doc, ensure_ascii=False) + '\n') - count += 1 - return output_path, count + @staticmethod + def __resolve_input_path( + episode_info: Any, context: ExecutionContext, folder: str, suffix: str, + ) -> Path: + filename = f'{context.series_name}_{episode_info.episode_code()}{suffix}' + return context.get_output_path(episode_info, folder, filename) diff --git a/preprocessor/steps/search/indexing_step.py b/preprocessor/steps/search/indexing_step.py index 3cd6707fb..d140af358 100644 --- a/preprocessor/steps/search/indexing_step.py +++ b/preprocessor/steps/search/indexing_step.py @@ -19,59 +19,38 @@ class ElasticsearchIndexerStep(PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig]): - def __init__(self, config: ElasticsearchConfig) -> None: super().__init__(config) - self._es: Optional[ElasticsearchWrapper] = None - - def cleanup(self) -> None: - if self._es: - asyncio.run(self._es.close()) - self._es = None - - def execute(self, input_data: List[ElasticDocuments], context: ExecutionContext) -> IndexingResult: - return asyncio.run(self._execute_async(input_data, context)) + self.__es: Optional[ElasticsearchWrapper] = None @property def name(self) -> str: return 'elasticsearch_indexing' - async def _execute_async( + def cleanup(self) -> None: + if self.__es: + asyncio.run(self.__es.close()) + self.__es = None + + def execute( + self, input_data: List[ElasticDocuments], context: ExecutionContext, + ) -> IndexingResult: + return asyncio.run(self.__execute_async(input_data, context)) + + async def __execute_async( self, input_data: List[ElasticDocuments], context: ExecutionContext, ) -> IndexingResult: if not input_data: - return await 
self._create_empty_result(context) + return self.__construct_empty_result(context) - docs_by_type = self._group_documents_by_type(input_data) - total_indexed = await self._index_all_document_types(docs_by_type, context) + docs_by_type = self.__group_documents_by_type(input_data) + total_indexed = await self.__process_all_document_types(docs_by_type, context) - return IndexingResult( - index_name=self.config.index_name, - document_count=total_indexed, - success=True, - ) + return self.__construct_indexing_result(total_indexed) - async def _create_empty_result(self, context: ExecutionContext) -> IndexingResult: - context.logger.warning('No documents to index.') - return IndexingResult( - index_name=self.config.index_name, - document_count=0, - success=True, - ) - - @staticmethod - def _group_documents_by_type(input_data: List[ElasticDocuments]) -> Dict[str, List[Path]]: - docs_by_type: Dict[str, List[Path]] = {} - for doc_artifact in input_data: - doc_type: str = doc_artifact.path.parent.name - if doc_type not in docs_by_type: - docs_by_type[doc_type] = [] - docs_by_type[doc_type].append(doc_artifact.path) - return docs_by_type - - async def _index_all_document_types( + async def __process_all_document_types( self, docs_by_type: Dict[str, List[Path]], context: ExecutionContext, @@ -79,14 +58,14 @@ async def _index_all_document_types( total_indexed: int = 0 for doc_type, paths in docs_by_type.items(): try: - indexed_count = await self._index_document_type(doc_type, paths, context) + indexed_count = await self.__process_document_type(doc_type, paths, context) total_indexed += indexed_count except Exception as e: context.logger.error(f'Elasticsearch indexing failed for {doc_type}: {e}') raise return total_indexed - async def _index_document_type( + async def __process_document_type( self, doc_type: str, paths: List[Path], @@ -95,41 +74,31 @@ async def _index_document_type( index_name: str = f'{self.config.index_name}_{doc_type}' context.logger.info(f'Indexing 
{len(paths)} files into {index_name}') - await self._ensure_elasticsearch_wrapper(index_name) - await self._prepare_index(doc_type) + await self.__prepare_elasticsearch_client(index_name) + await self.__setup_index(doc_type) - documents = self._load_documents_from_paths(paths) - return await self._bulk_index_documents(documents, index_name, context) + documents = self.__load_documents_from_paths(paths) + return await self.__execute_bulk_indexing(documents, index_name, context) - async def _ensure_elasticsearch_wrapper(self, index_name: str) -> None: - if self._es is None or self._es.index_name != index_name: - if self._es is not None: - await self._es.close() - self._es = ElasticsearchWrapper( + async def __prepare_elasticsearch_client(self, index_name: str) -> None: + if self.__es is None or self.__es.index_name != index_name: + if self.__es is not None: + await self.__es.close() + self.__es = ElasticsearchWrapper( index_name=index_name, host=self.config.host, dry_run=self.config.dry_run, ) - async def _prepare_index(self, doc_type: str) -> None: + async def __setup_index(self, doc_type: str) -> None: if not self.config.append: - await self._es.delete_index() + await self.__es.delete_index() mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type(doc_type) if mapping: - await self._es.create_index(mapping) + await self.__es.create_index(mapping) - @staticmethod - def _load_documents_from_paths(paths: List[Path]) -> List[Dict[str, Any]]: - documents: List[Dict[str, Any]] = [] - for path in paths: - with open(path, 'r', encoding='utf-8') as f: - for line in f: - if line.strip(): - documents.append(json.loads(line)) - return documents - - async def _bulk_index_documents( + async def __execute_bulk_indexing( self, documents: List[Dict[str, Any]], index_name: str, @@ -139,7 +108,7 @@ async def _bulk_index_documents( return 0 if not self.config.dry_run: - await self._es.bulk_index(documents) + await self.__es.bulk_index(documents) return len(documents) 
context.logger.info( @@ -147,6 +116,37 @@ async def _bulk_index_documents( ) return 0 + def __construct_indexing_result(self, document_count: int) -> IndexingResult: + return IndexingResult( + index_name=self.config.index_name, + document_count=document_count, + success=True, + ) + + def __construct_empty_result(self, context: ExecutionContext) -> IndexingResult: + context.logger.warning('No documents to index.') + return self.__construct_indexing_result(0) + + @staticmethod + def __group_documents_by_type(input_data: List[ElasticDocuments]) -> Dict[str, List[Path]]: + docs_by_type: Dict[str, List[Path]] = {} + for doc_artifact in input_data: + doc_type: str = doc_artifact.path.parent.name + if doc_type not in docs_by_type: + docs_by_type[doc_type] = [] + docs_by_type[doc_type].append(doc_artifact.path) + return docs_by_type + + @staticmethod + def __load_documents_from_paths(paths: List[Path]) -> List[Dict[str, Any]]: + documents: List[Dict[str, Any]] = [] + for path in paths: + with open(path, 'r', encoding='utf-8') as f: + for line in f: + if line.strip(): + documents.append(json.loads(line)) + return documents + @staticmethod def __get_mapping_for_type( doc_type: str, # pylint: disable=unused-argument diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py index 820a8193b..51ce5f10d 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -17,74 +17,88 @@ class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): + @property + def name(self) -> str: + return 'text_analysis' - def execute(self, input_data: TranscriptionData, context: ExecutionContext) -> TextAnalysisResults: - output_path = self._get_output_path(input_data) + def execute( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> TextAnalysisResults: + output_path = self.__resolve_output_path(input_data) if self._check_cache_validity(output_path, context, 
input_data.episode_id, 'cached'): - return self._load_cached_result(output_path, input_data) + return self.__load_cached_result(output_path, input_data) context.logger.info(f'Analyzing text for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - txt_path = self._get_text_file_path(input_data) - stats = self._analyze_text_statistics(txt_path) - result_data = self._build_result_data(stats, txt_path, input_data) + txt_path = self.__resolve_text_file_path(input_data) + stats = self.__analyze_text_statistics(txt_path) + result_data = self.__build_result_payload(stats, txt_path, input_data) - FileOperations.atomic_write_json(output_path, result_data) + self.__save_analysis_results(output_path, result_data) context.mark_step_completed(self.name, input_data.episode_id) - return TextAnalysisResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - statistics=result_data, - ) + return self.__construct_analysis_results(input_data, output_path, result_data) - @property - def name(self) -> str: - return 'text_analysis' + def __analyze_text_statistics(self, txt_path: Path) -> TextStatistics: + return TextStatistics.from_file(txt_path, language=self.config.language) + + def __build_result_payload( + self, + stats: TextStatistics, + txt_path: Path, + input_data: TranscriptionData, + ) -> Dict[str, Any]: + return { + 'metadata': { + 'episode_id': input_data.episode_id, + 'language': self.config.language, + 'source_file': txt_path.name, + 'analyzed_at': datetime.now().isoformat(), + }, + **stats.to_dict(), + } @staticmethod - def _get_output_path(input_data: TranscriptionData) -> Path: + def __resolve_output_path(input_data: TranscriptionData) -> Path: output_filename = input_data.path.stem + '_text_stats.json' return input_data.path.parent / output_filename - - @staticmethod - def _load_cached_result(output_path: Path, input_data: TranscriptionData) -> TextAnalysisResults: - stats_data = 
FileOperations.load_json(output_path) - return TextAnalysisResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - statistics=stats_data, - ) - @staticmethod - def _get_text_file_path(input_data: TranscriptionData) -> Path: + def __resolve_text_file_path(input_data: TranscriptionData) -> Path: txt_path = input_data.path if input_data.format != 'txt': txt_path = input_data.path.with_suffix('.txt') + if not txt_path.exists(): raise FileNotFoundError(f'Transcription text file not found: {txt_path}') + return txt_path - def _analyze_text_statistics(self, txt_path: Path) -> TextStatistics: - return TextStatistics.from_file(txt_path, language=self.config.language) + @staticmethod + def __load_cached_result( + output_path: Path, + input_data: TranscriptionData, + ) -> TextAnalysisResults: + stats_data = FileOperations.load_json(output_path) + return TextAnalysisStep.__construct_analysis_results( + input_data, output_path, stats_data, + ) - def _build_result_data( - self, - stats: TextStatistics, - txt_path: Path, - input_data: TranscriptionData, - ) -> Dict[str, Any]: - return { - 'metadata': { - 'episode_id': input_data.episode_id, - 'language': self.config.language, - 'source_file': txt_path.name, - 'analyzed_at': datetime.now().isoformat(), - }, - **stats.to_dict(), - } + @staticmethod + def __save_analysis_results(output_path: Path, result_data: Dict[str, Any]) -> None: + FileOperations.atomic_write_json(output_path, result_data) + + @staticmethod + def __construct_analysis_results( + input_data: TranscriptionData, + output_path: Path, + result_data: Dict[str, Any], + ) -> TextAnalysisResults: + return TextAnalysisResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + statistics=result_data, + ) diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index 7a11abc8b..e7c782678 100644 --- 
a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -5,6 +5,7 @@ Dict, List, Optional, + Tuple, ) from preprocessor.config.step_configs import TextEmbeddingConfig @@ -20,104 +21,61 @@ class TextEmbeddingStep(PipelineStep[TranscriptionData, EmbeddingCollection, TextEmbeddingConfig]): - def __init__(self, config: TextEmbeddingConfig) -> None: super().__init__(config) - self._model: Optional[EmbeddingModelWrapper] = None + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def name(self) -> str: + return 'text_embedding' def cleanup(self) -> None: - if self._model: - self._model = None + if self.__model: + self.__model = None def execute( self, input_data: TranscriptionData, context: ExecutionContext, ) -> EmbeddingCollection: - output_path = self._get_output_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): - return self._load_cached_result(output_path, input_data) + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached text embeddings'): + return self.__load_cached_result(output_path, input_data) - segments = self._load_and_validate_segments(input_data, context) + segments = self.__extract_valid_segments(input_data, context) if not segments: - return self._create_embedding_collection(input_data, output_path, 0) + return self.__construct_embedding_collection(input_data, output_path, 0) - self._ensure_model_loaded() + self.__prepare_embedding_model() context.logger.info(f'Generating text embeddings for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - results = self._generate_embeddings(segments) - self._save_results(results, output_path, input_data) + results = self.__process_text_embeddings(segments) + self.__save_embedding_results(results, output_path, input_data) context.mark_step_completed(self.name, input_data.episode_id) - 
return self._create_embedding_collection(input_data, output_path, len(results)) - - @property - def name(self) -> str: - return 'text_embedding' - - @staticmethod - def _get_output_path(input_data: TranscriptionData, context: ExecutionContext) -> Path: - episode_code = input_data.episode_info.episode_code() - output_filename: str = f'{context.series_name}_{episode_code}_embeddings_text.json' - return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) - - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: TranscriptionData, - ) -> bool: - return self._check_cache_validity( - output_path, - context, - input_data.episode_id, - 'cached text embeddings', - ) + return self.__construct_embedding_collection(input_data, output_path, len(results)) - def _load_cached_result( # pylint: disable=duplicate-code - self, - output_path: Path, - input_data: TranscriptionData, - ) -> EmbeddingCollection: - emb_data: Dict[str, Any] = FileOperations.load_json(output_path) - return self._create_embedding_collection( - input_data, - output_path, - len(emb_data.get('results', [])), - ) - - def _load_and_validate_segments( - self, - input_data: TranscriptionData, - context: ExecutionContext, - ) -> List[Dict[str, Any]]: - transcription: Dict[str, Any] = self.__load_clean_transcription(input_data, context) - segments: List[Dict[str, Any]] = transcription.get('segments', []) - if not segments: - context.logger.warning(f'No text segments for embedding in {input_data.episode_id}') - return segments - - def _ensure_model_loaded(self) -> None: - if self._model is None: - self._model = EmbeddingModelWrapper( + def __prepare_embedding_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( self.config.model_name, self.config.device, self.config.batch_size, ) - def _generate_embeddings(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + def __process_text_embeddings(self, 
segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: full_text: str = ' '.join([seg.get('text', '') for seg in segments]) sentences: List[str] = self.__split_into_sentences(full_text) - text_chunks, chunk_metadata = self._create_text_chunks(sentences, segments) - return self._batch_encode_chunks(text_chunks, chunk_metadata) + text_chunks, chunk_metadata = self.__create_text_chunks(sentences, segments) + return self.__batch_encode_chunks(text_chunks, chunk_metadata) - def _create_text_chunks( + def __create_text_chunks( self, sentences: List[str], segments: List[Dict[str, Any]], - ) -> tuple[List[str], List[Dict[str, Any]]]: + ) -> Tuple[List[str], List[Dict[str, Any]]]: text_chunks: List[str] = [] chunk_metadata: List[Dict[str, Any]] = [] step: int = self.config.text_sentences_per_chunk - self.config.text_chunk_overlap @@ -144,7 +102,7 @@ def _create_text_chunks( return text_chunks, chunk_metadata - def _batch_encode_chunks( + def __batch_encode_chunks( self, text_chunks: List[str], chunk_metadata: List[Dict[str, Any]], @@ -154,14 +112,14 @@ def _batch_encode_chunks( for i in range(0, len(text_chunks), self.config.batch_size): batch_texts: List[str] = text_chunks[i:i + self.config.batch_size] batch_meta: List[Dict[str, Any]] = chunk_metadata[i:i + self.config.batch_size] - batch_embeddings: List[List[float]] = self._model.encode_text(batch_texts) + batch_embeddings: List[List[float]] = self.__model.encode_text(batch_texts) for meta, embedding in zip(batch_meta, batch_embeddings): results.append({**meta, 'embedding': embedding}) return results - def _save_results( + def __save_embedding_results( self, results: List[Dict[str, Any]], output_path: Path, @@ -179,7 +137,7 @@ def _save_results( ) FileOperations.atomic_write_json(output_path, output_data) - def _create_embedding_collection( # pylint: disable=duplicate-code + def __construct_embedding_collection( self, input_data: TranscriptionData, output_path: Path, @@ -195,20 +153,39 @@ def 
_create_embedding_collection( # pylint: disable=duplicate-code ) @staticmethod - def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: - cumulative_length: int = 0 - for idx, seg in enumerate(segments): - seg_length: int = len(seg.get('text', '')) + 1 - if cumulative_length <= char_pos < cumulative_length + seg_length: - return idx - cumulative_length += seg_length - return len(segments) - 1 if segments else 0 + def __resolve_output_path(input_data: TranscriptionData, context: ExecutionContext) -> Path: + episode_code = input_data.episode_info.episode_code() + output_filename: str = f'{context.series_name}_{episode_code}_embeddings_text.json' + return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) + + @staticmethod + def __load_cached_result( + output_path: Path, + input_data: TranscriptionData, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = FileOperations.load_json(output_path) + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name="cached_model", + embedding_count=len(emb_data.get('text_embeddings', [])), + embedding_type='text', + ) @staticmethod - def __load_clean_transcription( + def __extract_valid_segments( input_data: TranscriptionData, - context: ExecutionContext, # pylint: disable=unused-argument - ) -> Dict[str, Any]: + context: ExecutionContext, + ) -> List[Dict[str, Any]]: + transcription: Dict[str, Any] = TextEmbeddingStep.__load_clean_transcription(input_data) + segments: List[Dict[str, Any]] = transcription.get('segments', []) + if not segments: + context.logger.warning(f'No text segments for embedding in {input_data.episode_id}') + return segments + + @staticmethod + def __load_clean_transcription(input_data: TranscriptionData) -> Dict[str, Any]: raw_path: Path = input_data.path clean_path: Path = ( raw_path.parent.parent / 'clean' / @@ -218,6 +195,16 @@ def 
__load_clean_transcription( return FileOperations.load_json(clean_path) return FileOperations.load_json(raw_path) + @staticmethod + def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: + cumulative_length: int = 0 + for idx, seg in enumerate(segments): + seg_length: int = len(seg.get('text', '')) + 1 + if cumulative_length <= char_pos < cumulative_length + seg_length: + return idx + cumulative_length += seg_length + return len(segments) - 1 if segments else 0 + @staticmethod def __split_into_sentences(text: str) -> List[str]: normalized_text: str = re.sub('\\.{2,}', '.', text) diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 2871b1bb8..e8467da95 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -18,76 +18,53 @@ class TranscriptionStep(PipelineStep[AudioArtifact, TranscriptionData, WhisperTranscriptionConfig]): - def __init__(self, config: WhisperTranscriptionConfig) -> None: super().__init__(config) - self._whisper: Optional[Whisper] = None + self.__whisper: Optional[Whisper] = None + + @property + def name(self) -> str: + return 'transcription' def cleanup(self) -> None: - if self._whisper: - self._whisper.cleanup() - self._whisper = None + if self.__whisper: + self.__whisper.cleanup() + self.__whisper = None def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData: - output_path = self._get_output_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): - return self._create_cached_result(output_path, input_data) + return self.__construct_cached_result(output_path, input_data) - self._ensure_whisper_loaded() + self.__prepare_whisper_model() context.logger.info( f'Transcribing {input_data.episode_id} using Whisper {self.config.model}', ) 
context.mark_step_started(self.name, input_data.episode_id) - result = self._transcribe_audio(input_data, output_path, context) - context.mark_step_completed(self.name, input_data.episode_id) - - return self._create_result_artifact(output_path, input_data, result) - - @property - def name(self) -> str: - return 'transcription' - - @staticmethod - def _get_output_path(input_data: AudioArtifact, context: ExecutionContext) -> Path: - output_filename: str = ( - f'{context.series_name}_{input_data.episode_info.episode_code()}.json' - ) - return context.get_output_path( - input_data.episode_info, - 'transcriptions', - f'raw/{output_filename}', - ) - + result = self.__process_audio_transcription(input_data, output_path, context) - def _create_cached_result(self, output_path: Path, input_data: AudioArtifact) -> TranscriptionData: - return TranscriptionData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - language=self.config.language, - model=self.config.model, - format='json', - ) + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_result_artifact(output_path, input_data, result) - def _ensure_whisper_loaded(self) -> None: - if self._whisper is None: - self._whisper = Whisper( + def __prepare_whisper_model(self) -> None: + if self.__whisper is None: + self.__whisper = Whisper( model=self.config.model, language=self.config.language, device=self.config.device, beam_size=self.config.beam_size, ) - def _transcribe_audio( - self, - input_data: AudioArtifact, - output_path: Path, - context: ExecutionContext, + def __process_audio_transcription( + self, + input_data: AudioArtifact, + output_path: Path, + context: ExecutionContext, ) -> Dict[str, Any]: try: - result: Dict[str, Any] = self._whisper.transcribe(input_data.path) + result: Dict[str, Any] = self.__whisper.transcribe(input_data.path) result['episode_info'] = EpisodeManager.get_metadata(input_data.episode_info) 
FileOperations.atomic_write_json(output_path, result) return result @@ -99,11 +76,23 @@ def _transcribe_audio( output_path.unlink() raise - def _create_result_artifact( - self, - output_path: Path, - input_data: AudioArtifact, - result: Dict[str, Any], + def __construct_cached_result( + self, output_path: Path, input_data: AudioArtifact, + ) -> TranscriptionData: + return TranscriptionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + language=self.config.language, + model=self.config.model, + format='json', + ) + + def __construct_result_artifact( + self, + output_path: Path, + input_data: AudioArtifact, + result: Dict[str, Any], ) -> TranscriptionData: return TranscriptionData( episode_id=input_data.episode_id, @@ -113,3 +102,14 @@ def _create_result_artifact( model=self.config.model, format='json', ) + + @staticmethod + def __resolve_output_path(input_data: AudioArtifact, context: ExecutionContext) -> Path: + output_filename: str = ( + f'{context.series_name}_{input_data.episode_info.episode_code()}.json' + ) + return context.get_output_path( + input_data.episode_info, + 'transcriptions', + f'raw/{output_filename}', + ) diff --git a/preprocessor/steps/validation/validator_step.py b/preprocessor/steps/validation/validator_step.py index cfdbc6eca..114ec0867 100644 --- a/preprocessor/steps/validation/validator_step.py +++ b/preprocessor/steps/validation/validator_step.py @@ -9,6 +9,9 @@ class ValidationStep(PipelineStep[ElasticDocuments, ValidationResult, ValidationConfig]): + @property + def name(self) -> str: + return "validate" def execute( self, @@ -17,21 +20,14 @@ def execute( ) -> ValidationResult: context.logger.info(f"Starting validation for season {context.season}") - validator = self._create_validator(context) - self._run_validation(validator) + validator = self.__create_validator(context) + self.__run_validation(validator) context.logger.info("Validation completed successfully") - return 
ValidationResult( - season=context.season, - validation_report_dir=validator.validation_reports_dir, - ) + return self.__construct_validation_result(context, validator) - @property - def name(self) -> str: - return "validate" - - def _create_validator(self, context: ExecutionContext) -> Validator: + def __create_validator(self, context: ExecutionContext) -> Validator: return Validator( season=context.season, series_name=context.series_name, @@ -41,7 +37,17 @@ def _create_validator(self, context: ExecutionContext) -> Validator: ) @staticmethod - def _run_validation(validator: Validator) -> None: + def __run_validation(validator: Validator) -> None: exit_code = validator.validate() if exit_code != 0: raise RuntimeError(f"Validation failed with exit code {exit_code}") + + @staticmethod + def __construct_validation_result( + context: ExecutionContext, + validator: Validator, + ) -> ValidationResult: + return ValidationResult( + season=context.season, + validation_report_dir=validator.validation_reports_dir, + ) diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 984821ef4..031eb868d 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -26,28 +26,37 @@ class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExportConfig]): - def __init__(self, config: FrameExportConfig) -> None: super().__init__(config) decord.bridge.set_bridge('native') - self.strategy = KeyframeStrategyFactory.create(self.config.keyframe_strategy, self.config.frames_per_scene) + self.__strategy = KeyframeStrategyFactory.create( + self.config.keyframe_strategy, self.config.frames_per_scene, + ) - def execute(self, input_data: SceneCollection, context: ExecutionContext) -> FrameCollection: - episode_dir, metadata_file = self._prepare_output_paths(input_data, context) + @property + def name(self) -> str: + return 'frame_export' + + def execute( + self, input_data: 
SceneCollection, context: ExecutionContext, + ) -> FrameCollection: + episode_dir, metadata_file = self.__resolve_output_paths(input_data, context) if self._check_cache_validity(metadata_file, context, input_data.episode_id, 'cached'): - return self._load_cached_result(metadata_file, episode_dir, input_data) + return self.__load_cached_result(metadata_file, episode_dir, input_data) - self._prepare_episode_directory(episode_dir, context) - frame_requests = self._extract_frame_requests(input_data) + self.__prepare_episode_directory(episode_dir, context) + frame_requests = self.__extract_frame_requests(input_data) if not frame_requests: - return self._create_empty_result(episode_dir, metadata_file, input_data, context) + return self.__construct_empty_result(episode_dir, metadata_file, input_data, context) - context.logger.info(f'Extracting {len(frame_requests)} keyframes from {input_data.video_path.name}') + context.logger.info( + f'Extracting {len(frame_requests)} keyframes from {input_data.video_path.name}', + ) context.mark_step_started(self.name, input_data.episode_id) - self._process_frame_extraction( + self.__process_frame_extraction( input_data.video_path, frame_requests, episode_dir, @@ -65,158 +74,86 @@ def execute(self, input_data: SceneCollection, context: ExecutionContext) -> Fra metadata_path=metadata_file, ) - @property - def name(self) -> str: - return 'frame_export' - - @staticmethod - def _prepare_output_paths( - input_data: SceneCollection, - context: ExecutionContext, - ) -> Tuple[Path, Path]: - episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') - metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' - metadata_file = episode_dir / metadata_filename - return episode_dir, metadata_file - - @staticmethod - def _load_cached_result( - metadata_file: Path, - episode_dir: Path, - input_data: SceneCollection, - ) -> FrameCollection: - with open(metadata_file, 'r', 
encoding='utf-8') as f: - metadata = json.load(f) - return FrameCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - directory=episode_dir, - frame_count=metadata['statistics']['total_frames'], - metadata_path=metadata_file, - ) - - @staticmethod - def _prepare_episode_directory(episode_dir: Path, context: ExecutionContext) -> None: - if episode_dir.exists(): - context.logger.info(f'Cleaning incomplete frames from previous run: {episode_dir}') - shutil.rmtree(episode_dir, ignore_errors=True) - episode_dir.mkdir(parents=True, exist_ok=True) - - def _extract_frame_requests(self, input_data: SceneCollection) -> List[FrameRequest]: + def __extract_frame_requests(self, input_data: SceneCollection) -> List[FrameRequest]: video_path = input_data.video_path if not video_path.exists(): raise FileNotFoundError(f'Video file not found for frame export: {video_path}') data = {'scene_timestamps': {'scenes': input_data.scenes}} - return self.strategy.extract_frame_requests(video_path, data) - - @staticmethod - def _create_empty_result( - episode_dir: Path, - metadata_file: Path, - input_data: SceneCollection, - context: ExecutionContext, - ) -> FrameCollection: - context.logger.warning(f'No frames to extract for {input_data.episode_id}') - return FrameCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - directory=episode_dir, - frame_count=0, - metadata_path=metadata_file, - ) + return self.__strategy.extract_frame_requests(video_path, data) - def _process_frame_extraction( - self, - video_path: Path, - frame_requests: List[FrameRequest], - episode_dir: Path, - input_data: SceneCollection, - metadata_file: Path, - context: ExecutionContext, + def __process_frame_extraction( + self, + video_path: Path, + frame_requests: List[FrameRequest], + episode_dir: Path, + input_data: SceneCollection, + metadata_file: Path, + context: ExecutionContext, ) -> None: try: - self.__extract_frames(video_path, frame_requests, 
episode_dir, input_data.episode_info, context) - self.__write_metadata(frame_requests, input_data.episode_info, video_path, context, metadata_file) + self.__extract_frames( + video_path, frame_requests, episode_dir, input_data.episode_info, context, + ) + self.__write_metadata( + frame_requests, input_data.episode_info, video_path, context, metadata_file, + ) except Exception as e: context.logger.error(f'Failed to extract frames from {video_path}: {e}') shutil.rmtree(episode_dir, ignore_errors=True) raise - @staticmethod - def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: - width = metadata.get('width', 0) - height = metadata.get('height', 0) - if width == 0 or height == 0: - raise ValueError('Invalid video dimensions') - sar_str = metadata.get('sample_aspect_ratio', '1:1') - if sar_str == 'N/A' or not sar_str: - sar_str = '1:1' - try: - sar_num, sar_denom = [int(x) for x in sar_str.split(':')] - sar = sar_num / sar_denom if sar_denom != 0 else 1.0 - except (ValueError, ZeroDivisionError): - sar = 1.0 - return width / height * sar + def __extract_frames( + self, + video_file: Path, + frame_requests: List[FrameRequest], + episode_dir: Path, + episode_info, + context: ExecutionContext, + ) -> None: + video_metadata = self.__fetch_video_metadata(video_file) + dar = self.__calculate_display_aspect_ratio(video_metadata) + vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) + + for req in frame_requests: + frame_num = req['frame_number'] + self.__extract_and_save_frame( + vr, frame_num, episode_dir, episode_info, dar, context.series_name, + ) + + del vr def __extract_and_save_frame( - self, - vr, - frame_num: int, - episode_dir: Path, - episode_info, - dar: float, - series_name: str, + self, + vr: decord.VideoReader, + frame_num: int, + episode_dir: Path, + episode_info, + dar: float, + series_name: str, ) -> None: frame_np = vr[frame_num].asnumpy() frame_pil = Image.fromarray(frame_np) resized = self.__resize_frame(frame_pil, dar) + 
base_filename = f'{series_name}_{episode_info.episode_code()}' filename = f'{base_filename}_frame_{frame_num:06d}.jpg' resized.save(episode_dir / filename, quality=90) - def __extract_frames( - self, - video_file: Path, - frame_requests: List[FrameRequest], - episode_dir: Path, - episode_info, - context: ExecutionContext, - ) -> None: - video_metadata = self.__get_video_metadata(video_file) - dar = self.__calculate_display_aspect_ratio(video_metadata) - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) - for req in frame_requests: - frame_num = req['frame_number'] - self.__extract_and_save_frame(vr, frame_num, episode_dir, episode_info, dar, context.series_name) - del vr - - @staticmethod - def __get_video_metadata(video_path: Path) -> Dict[str, Any]: - cmd = [ - 'ffprobe', '-v', 'error', '-select_streams', 'v:0', - '-show_entries', 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', - '-of', 'json', str(video_path), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) - streams: List[Dict[str, Any]] = probe_data.get('streams', []) - if not streams: - raise ValueError(f'No video streams found in {video_path}') - return streams[0] - def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: target_width = self.config.resolution.width target_height = self.config.resolution.height target_aspect = target_width / target_height + if abs(display_aspect_ratio - target_aspect) < 0.01: return frame.resize((target_width, target_height), Image.Resampling.LANCZOS) + if display_aspect_ratio > target_aspect: new_height = target_height new_width = int(target_height * display_aspect_ratio) resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) x_crop = (new_width - target_width) // 2 - cropped = resized.crop((x_crop, 0, x_crop + target_width, target_height)) - return cropped + return resized.crop((x_crop, 0, x_crop + 
target_width, target_height)) + new_width = target_width new_height = int(target_width / display_aspect_ratio) resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) @@ -226,28 +163,32 @@ def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Ima return result def __write_metadata( - self, - frame_requests: List[FrameRequest], - episode_info, - source_video: Path, - context: ExecutionContext, - metadata_file: Path, + self, + frame_requests: List[FrameRequest], + episode_info, + source_video: Path, + context: ExecutionContext, + metadata_file: Path, ) -> None: - frame_types_count = {} - frames_with_paths = [] + frame_types_count: Dict[str, int] = {} + frames_with_paths: List[Dict[str, Any]] = [] base_filename = f'{context.series_name}_{episode_info.episode_code()}' + for frame in frame_requests: frame_type = frame.get('type', 'unknown') frame_types_count[frame_type] = frame_types_count.get(frame_type, 0) + 1 + frame_with_path = frame.copy() frame_num = frame['frame_number'] frame_with_path['frame_path'] = f'{base_filename}_frame_{frame_num:06d}.jpg' frames_with_paths.append(frame_with_path) + scene_numbers = { f.get('scene_number', -1) for f in frame_requests if f.get('scene_number', -1) != -1 } + metadata = { 'generated_at': datetime.now().isoformat(), 'episode_info': { @@ -274,3 +215,87 @@ def __write_metadata( 'frames': frames_with_paths, } FileOperations.atomic_write_json(metadata_file, metadata, indent=2) + + @staticmethod + def __resolve_output_paths( + input_data: SceneCollection, + context: ExecutionContext, + ) -> Tuple[Path, Path]: + episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') + metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' + metadata_file = episode_dir / metadata_filename + return episode_dir, metadata_file + + @staticmethod + def __load_cached_result( + metadata_file: Path, + episode_dir: Path, + input_data: 
SceneCollection, + ) -> FrameCollection: + with open(metadata_file, 'r', encoding='utf-8') as f: + metadata = json.load(f) + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=metadata['statistics']['total_frames'], + metadata_path=metadata_file, + ) + + @staticmethod + def __prepare_episode_directory(episode_dir: Path, context: ExecutionContext) -> None: + if episode_dir.exists(): + context.logger.info(f'Cleaning incomplete frames from previous run: {episode_dir}') + shutil.rmtree(episode_dir, ignore_errors=True) + episode_dir.mkdir(parents=True, exist_ok=True) + + @staticmethod + def __construct_empty_result( + episode_dir: Path, + metadata_file: Path, + input_data: SceneCollection, + context: ExecutionContext, + ) -> FrameCollection: + context.logger.warning(f'No frames to extract for {input_data.episode_id}') + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=0, + metadata_path=metadata_file, + ) + + @staticmethod + def __fetch_video_metadata(video_path: Path) -> Dict[str, Any]: + cmd = [ + 'ffprobe', '-v', 'error', '-select_streams', 'v:0', + '-show_entries', 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', + '-of', 'json', str(video_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + probe_data: Dict[str, Any] = json.loads(result.stdout) + streams: List[Dict[str, Any]] = probe_data.get('streams', []) + + if not streams: + raise ValueError(f'No video streams found in {video_path}') + return streams[0] + + @staticmethod + def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: + width = metadata.get('width', 0) + height = metadata.get('height', 0) + + if width == 0 or height == 0: + raise ValueError('Invalid video dimensions') + + sar_str = metadata.get('sample_aspect_ratio', '1:1') + if sar_str == 'N/A' or not sar_str: 
+ sar_str = '1:1' + + try: + sar_num, sar_denom = [int(x) for x in sar_str.split(':')] + sar = sar_num / sar_denom if sar_denom != 0 else 1.0 + except (ValueError, ZeroDivisionError): + sar = 1.0 + + return width / height * sar diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index e359e12e1..faf2772a4 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -17,70 +17,68 @@ class SceneDetectorStep(PipelineStep[TranscodedVideo, SceneCollection, SceneDetectionConfig]): - def __init__(self, config: SceneDetectionConfig) -> None: super().__init__(config) - self.transnet = TransNetWrapper() - self._model_loaded = False + self.__transnet = TransNetWrapper() + self.__model_loaded = False + + @property + def name(self) -> str: + return 'scene_detection' def cleanup(self) -> None: - if self._model_loaded: - self.transnet.cleanup() - self._model_loaded = False + if self.__model_loaded: + self.__transnet.cleanup() + self.__model_loaded = False - def execute(self, input_data: TranscodedVideo, context: ExecutionContext) -> SceneCollection: - output_path = self._get_output_path(input_data, context) + def execute( + self, input_data: TranscodedVideo, context: ExecutionContext, + ) -> SceneCollection: + output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): - return self._load_cached_result(output_path, input_data) + return self.__load_cached_result(output_path, input_data) + + self.__prepare_detection_environment(context) - self._ensure_model_loaded(context) context.logger.info(f'Detecting scenes in {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - scenes = self._detect_scenes(input_data.path) - self._save_results(scenes, input_data.path, output_path) + scenes = self.__detect_scenes(input_data.path) + 
self.__save_detection_results(scenes, input_data.path, output_path) context.mark_step_completed(self.name, input_data.episode_id) - return self._create_scene_collection(output_path, input_data, scenes) - - @property - def name(self) -> str: - return 'scene_detection' - - @staticmethod - def _get_output_path(input_data: TranscodedVideo, context: ExecutionContext) -> Path: - output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' - return context.get_output_path(input_data.episode_info, 'scene_timestamps', output_filename) - - def _load_cached_result(self, output_path: Path, input_data: TranscodedVideo) -> SceneCollection: - scenes_data = FileOperations.load_json(output_path) - return SceneCollection( - path=output_path, - video_path=input_data.path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - scenes=scenes_data.get('scenes', []), - threshold=self.config.threshold, - min_scene_len=self.config.min_scene_len, - ) + return self.__construct_scene_collection(output_path, input_data, scenes) - def _ensure_model_loaded(self, context: ExecutionContext) -> None: - if not self._model_loaded: + def __prepare_detection_environment(self, context: ExecutionContext) -> None: + if not self.__model_loaded: context.logger.info('Loading TransNetV2 model...') - self.transnet.load_model() - self._model_loaded = True + self.__transnet.load_model() + self.__model_loaded = True - def _detect_scenes(self, video_path: Path) -> List[Dict[str, Any]]: - return self.transnet.detect_scenes( + def __detect_scenes(self, video_path: Path) -> List[Dict[str, Any]]: + return self.__transnet.detect_scenes( video_path, threshold=self.config.threshold, min_scene_len=self.config.min_scene_len, ) - def _save_results(self, scenes: List[Dict[str, Any]], video_path: Path, output_path: Path) -> None: - video_info = self.transnet._TransNetWrapper__get_video_info(video_path) - output_data = { + def __save_detection_results( + self, + scenes: 
List[Dict[str, Any]], + video_path: Path, + output_path: Path, + ) -> None: + video_info = self.__transnet._TransNetWrapper__get_video_info(video_path) + output_data = self.__build_results_payload(scenes, video_info) + FileOperations.atomic_write_json(output_path, output_data) + + def __build_results_payload( + self, + scenes: List[Dict[str, Any]], + video_info: Dict[str, Any], + ) -> Dict[str, Any]: + return { 'total_scenes': len(scenes), 'video_info': video_info, 'detection_settings': { @@ -90,13 +88,28 @@ def _save_results(self, scenes: List[Dict[str, Any]], video_path: Path, output_p }, 'scenes': scenes, } - FileOperations.atomic_write_json(output_path, output_data) - def _create_scene_collection( - self, - output_path: Path, - input_data: TranscodedVideo, - scenes: List[Dict[str, Any]], + def __load_cached_result( + self, + output_path: Path, + input_data: TranscodedVideo, + ) -> SceneCollection: + scenes_data = FileOperations.load_json(output_path) + return SceneCollection( + path=output_path, + video_path=input_data.path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + scenes=scenes_data.get('scenes', []), + threshold=self.config.threshold, + min_scene_len=self.config.min_scene_len, + ) + + def __construct_scene_collection( + self, + output_path: Path, + input_data: TranscodedVideo, + scenes: List[Dict[str, Any]], ) -> SceneCollection: return SceneCollection( path=output_path, @@ -107,3 +120,13 @@ def _create_scene_collection( threshold=self.config.threshold, min_scene_len=self.config.min_scene_len, ) + + @staticmethod + def __resolve_output_path( + input_data: TranscodedVideo, + context: ExecutionContext, + ) -> Path: + output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' + return context.get_output_path( + input_data.episode_info, 'scene_timestamps', output_filename, + ) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 
953b9873e..c81c68bc7 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -2,6 +2,7 @@ from typing import ( Any, Dict, + Tuple, ) from preprocessor.config.step_configs import TranscodeConfig @@ -16,106 +17,69 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): - _command_logged = False + __command_logged = False - def execute( # pylint: disable=too-many-locals - self, input_data: SourceVideo, context: ExecutionContext, + @property + def name(self) -> str: + return 'video_transcode' + + def execute( + self, input_data: SourceVideo, context: ExecutionContext, ) -> TranscodedVideo: - output_path = self._get_output_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'output exists'): - return self._create_result_artifact(output_path, input_data) + return self.__construct_result_artifact(output_path, input_data) probe_data = FFmpegWrapper.probe_video(input_data.path) - target_fps = self._calculate_target_fps(probe_data, context) - is_upscaling, source_pixels, target_pixels = self._detect_upscaling(probe_data) - - source_width, source_height = FFmpegWrapper.get_resolution(probe_data) - sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) - effective_width = int(source_width * sar_num / sar_denom) + params = self.__create_transcode_params(input_data, output_path, probe_data, context) - if is_upscaling: - context.logger.info( - f'{input_data.episode_id}: Source {effective_width}x{source_height} ' - f'({source_pixels:,} px) → Target {self.config.resolution.width}x{self.config.resolution.height} ' - f'({target_pixels:,} px) - UPSCALING DETECTED', - ) - else: - context.logger.info( - f'{input_data.episode_id}: Source {effective_width}x{source_height} ' - f'({source_pixels:,} px) → Target {self.config.resolution.width}x{self.config.resolution.height} ' 
- f'({target_pixels:,} px) - No upscaling', - ) + self.__log_transcode_details(context, input_data, params, probe_data) + self.__execute_ffmpeg_process(context, params, input_data.episode_id) - video_bitrate, minrate, maxrate, bufsize = self._adjust_video_bitrate( + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_result_artifact(output_path, input_data) + + def __create_transcode_params( + self, + input_data: SourceVideo, + output_path: Path, + probe_data: Dict[str, Any], + context: ExecutionContext, + ) -> TranscodeParams: + target_fps = self.__resolve_target_framerate(probe_data, context) + is_upscaling, source_pixels, target_pixels = self.__analyze_resolution_scaling(probe_data) + + v_bitrate, v_min, v_max, v_buf = self.__compute_video_bitrate_settings( probe_data, context, is_upscaling, source_pixels, target_pixels, ) - audio_bitrate = self._adjust_audio_bitrate(probe_data, context) - deinterlace = self._determine_deinterlace(input_data, context, probe_data) - context.logger.info( - 'Video: SAR 1:1 (square pixels), timebase 1/90000, ' - 'colorspace bt709, color_range tv, closed GOP=12 frames (0.5s) with IDR keyframes ' - '(forced for frame-accurate cutting & concat)', - ) - context.logger.info( - f'Audio: AAC {audio_bitrate} kbps, 2 channels (stereo), 48 kHz sample rate (forced)', - ) - context.logger.info(f'Transcoding {input_data.episode_id}') - self._perform_transcode( - input_data.path, - output_path, - video_bitrate, - minrate, - maxrate, - bufsize, - audio_bitrate, - target_fps, - deinterlace, - is_upscaling, - context, - input_data, - ) + audio_bitrate = self.__compute_audio_bitrate(probe_data, context) + deinterlace = self.__resolve_deinterlacing_strategy(input_data, context, probe_data) + log_cmd = self.__should_log_command() - context.mark_step_completed(self.name, input_data.episode_id) - return self._create_result_artifact(output_path, input_data) - - @property - def name(self) -> str: - return 
'video_transcode' - - @staticmethod - def _get_output_path(input_data: SourceVideo, context: ExecutionContext) -> Path: - output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' - return context.get_season_output_path(input_data.episode_info, 'transcoded_videos', output_filename) - - - def _create_result_artifact(self, output_path: Path, input_data: SourceVideo) -> TranscodedVideo: - resolution_str = f'{self.config.resolution.width}x{self.config.resolution.height}' - return TranscodedVideo( - path=output_path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - resolution=resolution_str, + return TranscodeParams( + input_path=input_data.path, + output_path=output_path, codec=self.config.codec, + preset=self.config.preset, + resolution=f'{self.config.resolution.width}:{self.config.resolution.height}', + video_bitrate=f'{v_bitrate}M', + minrate=f'{v_min}M', + maxrate=f'{v_max}M', + bufsize=f'{v_buf}M', + audio_bitrate=f'{audio_bitrate}k', + gop_size=int(target_fps * self.config.keyframe_interval_seconds), + target_fps=target_fps, + deinterlace=deinterlace, + is_upscaling=is_upscaling, + log_command=log_cmd, ) - @staticmethod - def _calculate_target_fps( - probe_data: Dict[str, Any], - context: ExecutionContext, - ) -> float: - input_fps = FFmpegWrapper.get_framerate(probe_data) - target_fps = 24.0 - - if input_fps != target_fps: - context.logger.info( - f'Input FPS ({input_fps:.2f}) → forcing {target_fps} FPS for consistency and cinematic quality.', - ) - - return target_fps - - def _detect_upscaling(self, probe_data: Dict[str, Any]) -> tuple[bool, int, int]: + def __analyze_resolution_scaling( + self, + probe_data: Dict[str, Any], + ) -> Tuple[bool, int, int]: source_width, source_height = FFmpegWrapper.get_resolution(probe_data) sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) effective_width = int(source_width * sar_num / sar_denom) @@ -125,214 +89,237 @@ def _detect_upscaling(self, 
probe_data: Dict[str, Any]) -> tuple[bool, int, int] return source_pixels < target_pixels, source_pixels, target_pixels - def _adjust_video_bitrate( - self, - probe_data: Dict[str, Any], - context: ExecutionContext, - is_upscaling: bool, - source_pixels: int, - target_pixels: int, - ) -> tuple[float, float, float, float]: - if is_upscaling: - return self._calculate_upscale_bitrate( - probe_data, source_pixels, target_pixels, context, - ) + def __compute_video_bitrate_settings( + self, + probe_data: Dict[str, Any], + context: ExecutionContext, + is_upscaling: bool, + source_pixels: int, + target_pixels: int, + ) -> Tuple[float, float, float, float]: + return self.__compute_scaled_bitrate( + probe_data, source_pixels, target_pixels, context, is_upscaling, + ) - input_video_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) - video_bitrate = self.config.video_bitrate_mbps - minrate = self.config.minrate_mbps - maxrate = self.config.maxrate_mbps - bufsize = self.config.bufsize_mbps - - if input_video_bitrate and input_video_bitrate < video_bitrate: - adjusted_bitrate = min(input_video_bitrate * 1.05, video_bitrate) - ratio = adjusted_bitrate / video_bitrate - video_bitrate = adjusted_bitrate - minrate = round(minrate * ratio, 2) - maxrate = round(maxrate * ratio, 2) - bufsize = round(bufsize * ratio, 2) - context.logger.info( - f'Input video bitrate ({input_video_bitrate} Mbps) < ' - f'target ({self.config.video_bitrate_mbps} Mbps). 
' - f'Adjusted to {video_bitrate} Mbps to avoid quality loss.', + def __compute_scaled_bitrate( + self, + probe_data: Dict[str, Any], + source_pixels: int, + target_pixels: int, + context: ExecutionContext, + is_upscaling: bool, + ) -> Tuple[float, float, float, float]: + source_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) + target_bitrate = self.config.calculate_video_bitrate_mbps() + minrate = self.config.calculate_minrate_mbps() + maxrate = self.config.calculate_maxrate_mbps() + bufsize = self.config.calculate_bufsize_mbps() + + if not source_bitrate: + context.logger.warning( + f'Cannot detect source bitrate. Using target bitrate ({target_bitrate} Mbps).', ) + return target_bitrate, minrate, maxrate, bufsize - return video_bitrate, minrate, maxrate, bufsize - - def _calculate_upscale_bitrate( - self, - probe_data: Dict[str, Any], - source_pixels: int, - target_pixels: int, - context: ExecutionContext, - ) -> tuple[float, float, float, float]: - __MIN_BITRATE_FOR_RESOLUTION: Dict[tuple[int, int], float] = { - (7680, 4320): 35.0, - (3840, 2160): 15.0, - (2560, 1440): 8.0, - (1920, 1080): 3.5, - (1280, 720): 2.0, - (854, 480): 1.2, - (640, 360): 0.8, - (426, 240): 0.5, - (256, 144): 0.3, - } - - target_res = (self.config.resolution.width, self.config.resolution.height) - min_required = __MIN_BITRATE_FOR_RESOLUTION.get(target_res, 2.0) pixel_ratio = target_pixels / source_pixels + scaled_bitrate = source_bitrate * (pixel_ratio ** 0.7) - if pixel_ratio > 1.4: - min_required *= 1.25 - elif pixel_ratio > 1.2: - min_required *= 1.15 - - source_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) - quality_boost = 1.2 + max(0.0, (pixel_ratio - 1.1) * 0.4) - - if source_bitrate: - calculated = source_bitrate * pixel_ratio * quality_boost - upscaled_bitrate = max(calculated, min_required) - else: - upscaled_bitrate = min_required * max(1.2, pixel_ratio * 0.9) - - max_allowed = self.config.video_bitrate_mbps * 1.1 - upscaled_bitrate = min(upscaled_bitrate, 
max_allowed) + final_bitrate = min(scaled_bitrate, target_bitrate) + ratio = final_bitrate / target_bitrate - ratio = upscaled_bitrate / self.config.video_bitrate_mbps - - context.logger.warning( - f'⚠ UPSCALING: {source_pixels:,} px → {target_pixels:,} px ' - f'(+{((target_pixels/source_pixels)-1)*100:.1f}%, quality_boost={quality_boost:.2f}). ' - f'Bitrate: {source_bitrate or "N/A"} → {upscaled_bitrate:.2f} Mbps ' - f'(min for {target_res[0]}x{target_res[1]}: {min_required} Mbps). ' - f'Using Spline36 scaler (flicker-free) + enhanced nvenc params.', + direction = 'upscaling' if is_upscaling else 'downscaling' if pixel_ratio < 1.0 else 'same resolution' + context.logger.info( + f'Bitrate calculation ({direction}): ' + f'source {source_bitrate:.2f} Mbps @ {source_pixels:,}px → ' + f'scaled {scaled_bitrate:.2f} Mbps @ {target_pixels:,}px ' + f'(pixel_ratio {pixel_ratio:.2f}, exponent 0.7) → ' + f'final {final_bitrate:.2f} Mbps (capped to target {target_bitrate} Mbps)', ) return ( - upscaled_bitrate, - round(self.config.minrate_mbps * ratio, 2), - round(self.config.maxrate_mbps * ratio, 2), - round(self.config.bufsize_mbps * ratio, 2), + final_bitrate, + round(minrate * ratio, 2), + round(maxrate * ratio, 2), + round(bufsize * ratio, 2), ) - def _adjust_audio_bitrate( - self, - probe_data: Dict[str, Any], - context: ExecutionContext, + def __compute_audio_bitrate( + self, + probe_data: Dict[str, Any], + context: ExecutionContext, ) -> int: - input_audio_bitrate = FFmpegWrapper.get_audio_bitrate(probe_data) - audio_bitrate = self.config.audio_bitrate_kbps + input_audio = FFmpegWrapper.get_audio_bitrate(probe_data) + target_audio = self.config.audio_bitrate_kbps - if input_audio_bitrate and input_audio_bitrate < audio_bitrate: - adjusted_audio_bitrate = min(int(input_audio_bitrate * 1.05), audio_bitrate) - audio_bitrate = adjusted_audio_bitrate + if input_audio and input_audio < target_audio: + adjusted = min(int(input_audio * 1.05), target_audio) context.logger.info( 
- f'Input audio bitrate ({input_audio_bitrate} kbps) < ' - f'target ({self.config.audio_bitrate_kbps} kbps). ' - f'Adjusted to {audio_bitrate} kbps to avoid quality loss.', + f'Input audio ({input_audio} kbps) < target. Adjusted to {adjusted} kbps.', ) - - return audio_bitrate - - def _determine_deinterlace( - self, input_data: SourceVideo, context: ExecutionContext, probe_data: Dict[str, Any], + return adjusted + return target_audio + + def __resolve_deinterlacing_strategy( + self, + input_data: SourceVideo, + context: ExecutionContext, + probe_data: Dict[str, Any], ) -> bool: - field_order = FFmpegWrapper.get_field_order(probe_data) - if self.config.force_deinterlace: - context.logger.info( - f"Force deinterlacing enabled for {input_data.episode_id} (field_order={field_order}) - " - f"skipping idet analysis and applying bwdif filter unconditionally", - ) + context.logger.info(f"Force deinterlacing enabled for {input_data.episode_id}") return True + return self.__detect_and_verify_interlacing(input_data, context, probe_data) + + def __log_execution_details( + self, + context: ExecutionContext, + input_data: SourceVideo, + params: TranscodeParams, + probe_data: Dict[str, Any], + ) -> None: + source_w, source_h = FFmpegWrapper.get_resolution(probe_data) + upscale_msg = "UPSCALING DETECTED" if params.is_upscaling else "No upscaling" + context.logger.info( - f"Detecting interlacing for {input_data.episode_id} " - f"(field_order={field_order}, analyzing first 60s)...", + f'{input_data.episode_id}: Source {source_w}x{source_h} → ' + f'Target {self.config.resolution.width}x{self.config.resolution.height} - {upscale_msg}', ) - has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) + self.__log_static_transcode_info(context, params.audio_bitrate) + context.logger.info(f'Transcoding {input_data.episode_id}') - if idet_stats: - metadata_says_progressive = field_order in {'progressive', 'unknown'} - idet_says_progressive = not has_interlacing + def 
__log_transcode_details( + self, + context: ExecutionContext, + input_data: SourceVideo, + params: TranscodeParams, + probe_data: Dict[str, Any], + ) -> None: + self.__log_execution_details(context, input_data, params, probe_data) - if metadata_says_progressive != idet_says_progressive: - context.logger.warning( - f"⚠ {input_data.episode_id}: field_order={field_order} but idet detected " - f"{'interlaced' if has_interlacing else 'progressive'} content! Using idet result.", - ) + def __execute_ffmpeg_process( + self, + context: ExecutionContext, + params: TranscodeParams, + episode_id: str, + ) -> None: + temp_path = params.output_path.with_suffix('.mp4.tmp') + final_path = params.output_path - if has_interlacing and idet_stats: - context.logger.info( - f"Interlacing detected for {input_data.episode_id} " - f"({idet_stats['ratio']*100:.1f}% interlaced frames: " - f"TFF={idet_stats['tff']}, BFF={idet_stats['bff']}, Progressive={idet_stats['progressive']}) - " - f"applying bwdif deinterlacing filter", - ) - elif idet_stats: + params.output_path = temp_path + context.mark_step_started(self.name, episode_id, [str(temp_path)]) + + try: + if params.log_command: + self.__log_ffmpeg_command_header(context) + + FFmpegWrapper.transcode(params) + temp_path.replace(final_path) + except BaseException: + if temp_path.exists(): + temp_path.unlink() + raise + finally: + params.output_path = final_path + + def __construct_result_artifact( + self, + output_path: Path, + input_data: SourceVideo, + ) -> TranscodedVideo: + return TranscodedVideo( + path=output_path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', + codec=self.config.codec, + ) + + @staticmethod + def __should_log_command() -> bool: + if not VideoTranscoderStep.__command_logged: + VideoTranscoderStep.__command_logged = True + return True + return False + + @staticmethod + def __resolve_output_path( + input_data: 
SourceVideo, + context: ExecutionContext, + ) -> Path: + filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' + return context.get_season_output_path( + input_data.episode_info, 'transcoded_videos', filename, + ) + + @staticmethod + def __resolve_target_framerate( + probe_data: Dict[str, Any], + context: ExecutionContext, + ) -> float: + input_fps = FFmpegWrapper.get_framerate(probe_data) + target_fps = 24.0 + + if input_fps != target_fps: context.logger.info( - f"Progressive content detected for {input_data.episode_id} " - f"({idet_stats['progressive']}/{idet_stats['progressive'] + idet_stats['tff'] + idet_stats['bff']} frames) - " - f"no deinterlacing needed", + f'Input FPS ({input_fps:.2f}) → forcing {target_fps} FPS for consistency.', ) - else: + return target_fps + + @staticmethod + def __detect_and_verify_interlacing( + input_data: SourceVideo, + context: ExecutionContext, + probe_data: Dict[str, Any], + ) -> bool: + context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) + field_order = FFmpegWrapper.get_field_order(probe_data) + + if not idet_stats: context.logger.error( - f"Failed to detect interlacing for {input_data.episode_id} - " - f"idet filter did not return valid statistics. " - f"This may indicate an ffmpeg error or incompatible video format. " - f"Proceeding without deinterlacing.", + f"Failed to detect interlacing for {input_data.episode_id}. 
Proceeding without deinterlace.", ) + return False + VideoTranscoderStep.__log_interlacing_diagnostics(context, has_interlacing, idet_stats, field_order) return has_interlacing - def _perform_transcode( # pylint: disable=too-many-arguments - self, - input_path: Path, - output_path: Path, - video_bitrate: float, - minrate: float, - maxrate: float, - bufsize: float, - audio_bitrate: int, - target_fps: float, - deinterlace: bool, - is_upscaling: bool, - context: ExecutionContext, - input_data: SourceVideo, + @staticmethod + def __log_interlacing_diagnostics( + context: ExecutionContext, + has_interlacing: bool, + idet_stats: Dict[str, Any], + field_order: str, ) -> None: - temp_path = output_path.with_suffix('.mp4.tmp') - context.mark_step_started(self.name, input_data.episode_id, [str(temp_path)]) + meta_progressive = field_order in {'progressive', 'unknown'} + idet_progressive = not has_interlacing - try: - log_command = not VideoTranscoderStep._command_logged - if log_command: - VideoTranscoderStep._command_logged = True - context.logger.info('=' * 80) - context.logger.info('FFmpeg command example (showing once):') - context.logger.info('=' * 80) - - FFmpegWrapper.transcode( - TranscodeParams( - input_path=input_path, - output_path=temp_path, - codec=self.config.codec, - preset=self.config.preset, - resolution=f'{self.config.resolution.width}:{self.config.resolution.height}', - video_bitrate=f'{video_bitrate}M', - minrate=f'{minrate}M', - maxrate=f'{maxrate}M', - bufsize=f'{bufsize}M', - audio_bitrate=f'{audio_bitrate}k', - gop_size=int(target_fps * 0.5), - target_fps=target_fps, - deinterlace=deinterlace, - is_upscaling=is_upscaling, - log_command=log_command, - ), + if meta_progressive != idet_progressive: + context.logger.warning( + f"⚠ Conflict: Metadata says {field_order}, idet says " + f"{'interlaced' if has_interlacing else 'progressive'}. 
Using idet result.", ) - temp_path.replace(output_path) - except BaseException: - if temp_path.exists(): - temp_path.unlink() - raise + + if has_interlacing: + context.logger.info( + f"Interlacing detected ({idet_stats['ratio'] * 100:.1f}%). Applying bwdif.", + ) + else: + context.logger.info("Progressive content detected. No deinterlacing needed.") + + @staticmethod + def __log_static_transcode_info(context: ExecutionContext, audio_bitrate: str) -> None: + context.logger.info( + 'Video: SAR 1:1, timebase 1/90000, colorspace bt709, ' + 'closed GOP=12 frames with IDR keyframes.', + ) + context.logger.info( + f'Audio: AAC {audio_bitrate}, 2 channels, 48 kHz (forced).', + ) + + @staticmethod + def __log_ffmpeg_command_header(context: ExecutionContext) -> None: + context.logger.info('=' * 80) + context.logger.info('FFmpeg command example (showing once):') + context.logger.info('=' * 80) diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index 11455c11a..dad92807f 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -19,34 +19,37 @@ class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): - def __init__(self, config: CharacterDetectionConfig) -> None: super().__init__(config) - self._face_app = None - self._character_vectors: Dict[str, np.ndarray] = {} + self.__face_app = None + self.__character_vectors: Dict[str, np.ndarray] = {} + + @property + def name(self) -> str: + return 'character_detection' def cleanup(self) -> None: - self._face_app = None - self._character_vectors = {} + self.__face_app = None + self.__character_vectors = {} def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> DetectionResults: - output_path = self._get_output_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) if 
self._check_cache_validity(output_path, context, input_data.episode_id, 'cached character detections'): - return self._load_cached_result(output_path, input_data) + return self.__load_cached_result(output_path, input_data) - self._ensure_model_loaded(context) + self.__prepare_detection_environment(context) context.logger.info(f'Detecting characters in {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - frame_files = self._get_frame_files(input_data) + frame_files = self.__extract_frame_files(input_data) if not frame_files: - return self._create_empty_result(output_path, input_data, context) + return self.__construct_empty_result(output_path, input_data, context) - results = self._detect_characters(frame_files) - self._save_results(results, output_path, input_data, context, frame_files) + results = self.__process_character_detection(frame_files) + self.__save_detection_results(results, output_path, input_data, context, frame_files) context.mark_step_completed(self.name, input_data.episode_id) return DetectionResults( @@ -57,85 +60,39 @@ def execute( detection_count=len(results), ) - @property - def name(self) -> str: - return 'character_detection' - - @staticmethod - def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: - filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename}_character_detections.json' - return context.get_output_path( - input_data.episode_info, 'character_detections', output_filename, - ) - - - @staticmethod - def _load_cached_result(output_path: Path, input_data: FrameCollection) -> DetectionResults: - detection_data: Dict[str, Any] = FileOperations.load_json(output_path) - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - detection_type='character', - detection_count=len(detection_data.get('detections', [])), - ) - - def _ensure_model_loaded(self, 
context: ExecutionContext) -> None: - if self._face_app is None: + def __prepare_detection_environment(self, context: ExecutionContext) -> None: + if self.__face_app is None: context.logger.info('Initializing face detection model...') - self._face_app = FaceDetector.init() - self._load_character_references(context) + self.__face_app = FaceDetector.init() + self.__load_character_references(context) - def _load_character_references(self, context: ExecutionContext) -> None: + def __load_character_references(self, context: ExecutionContext) -> None: characters_dir: Path = Path('preprocessor/output_data') / context.series_name / 'characters' if not characters_dir.exists(): characters_dir = Path('preprocessor/input_data') / context.series_name / 'characters' if characters_dir.exists(): context.logger.info(f'Loading character references from {characters_dir}') - self._character_vectors = FaceDetector.load_character_references( - characters_dir, self._face_app, + self.__character_vectors = FaceDetector.load_character_references( + characters_dir, self.__face_app, ) else: context.logger.warning(f'Characters directory not found: {characters_dir}') - @staticmethod - def _get_frame_files(input_data: FrameCollection) -> List[Path]: - return sorted([ - f for f in input_data.directory.glob('*.jpg') - if f.is_file() and 'frame_' in f.name - ]) - - @staticmethod - def _create_empty_result( - output_path: Path, - input_data: FrameCollection, - context: ExecutionContext, - ) -> DetectionResults: - context.logger.warning(f'No frame files found in {input_data.directory}') - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - detection_type='character', - detection_count=0, - ) - - def _detect_characters(self, frame_files: List[Path]) -> List[Dict[str, Any]]: + def __process_character_detection(self, frame_files: List[Path]) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] for frame_path in frame_files: 
detections: List[Dict[str, Any]] = FaceDetector.detect_characters_in_frame( frame_path, - self._face_app, - self._character_vectors, + self.__face_app, + self.__character_vectors, self.config.threshold, ) if detections: results.append({'frame': frame_path.name, 'faces': detections}) return results - def _save_results( + def __save_detection_results( self, results: List[Dict[str, Any]], output_path: Path, @@ -156,6 +113,47 @@ def _save_results( } FileOperations.atomic_write_json(output_path, output_data) + @staticmethod + def __resolve_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: + filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' + output_filename: str = f'{filename}_character_detections.json' + return context.get_output_path( + input_data.episode_info, 'character_detections', output_filename, + ) + + @staticmethod + def __load_cached_result(output_path: Path, input_data: FrameCollection) -> DetectionResults: + detection_data: Dict[str, Any] = FileOperations.load_json(output_path) + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=len(detection_data.get('detections', [])), + ) + + @staticmethod + def __extract_frame_files(input_data: FrameCollection) -> List[Path]: + return sorted([ + f for f in input_data.directory.glob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) + + @staticmethod + def __construct_empty_result( + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + ) -> DetectionResults: + context.logger.warning(f'No frame files found in {input_data.directory}') + return DetectionResults( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=0, + ) + @staticmethod def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: counts: Dict[str, int] = 
{} diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index 55c9b7bff..fe72e061c 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -21,97 +21,55 @@ class VideoEmbeddingStep(PipelineStep[FrameCollection, EmbeddingCollection, VideoEmbeddingConfig]): - def __init__(self, config: VideoEmbeddingConfig) -> None: super().__init__(config) - self._model: Optional[EmbeddingModelWrapper] = None + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def name(self) -> str: + return 'video_embedding' def cleanup(self) -> None: - if self._model: - self._model.cleanup() # pylint: disable=no-member - self._model = None + if self.__model: + self.__model.cleanup() + self.__model = None def execute( - self, input_data: FrameCollection, context: ExecutionContext, + self, input_data: FrameCollection, context: ExecutionContext, ) -> EmbeddingCollection: - output_path = self._get_output_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) - if self._should_skip_processing(output_path, context, input_data): - return self._load_cached_result(output_path, input_data) + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached video embeddings'): + return self.__load_cached_result(output_path, input_data) - frame_requests = self._load_frame_requests(input_data, context) + frame_requests = self.__extract_frame_requests(input_data, context) if not frame_requests: - return self._create_embedding_collection(input_data, output_path, 0) + return self.__construct_embedding_collection(input_data, output_path, 0) - self._ensure_model_loaded() + self.__prepare_embedding_model(context) context.logger.info( f'Generating video embeddings for {len(frame_requests)} frames in {input_data.episode_id}', ) context.mark_step_started(self.name, input_data.episode_id) - image_hashes = self.__load_image_hashes(input_data, 
context) - results = self._generate_embeddings(frame_requests, input_data, image_hashes) - self._save_results(results, output_path, input_data, image_hashes) + image_hashes = self.__fetch_image_hashes(input_data, context) + results = self.__generate_embeddings(frame_requests, input_data, image_hashes) + self.__save_embedding_results(results, output_path, input_data, image_hashes) context.mark_step_completed(self.name, input_data.episode_id) - return self._create_embedding_collection(input_data, output_path, len(results)) - - @property - def name(self) -> str: - return 'video_embedding' - - @staticmethod - def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: - filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename_base}_embeddings_video.json' - return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) - - def _should_skip_processing( - self, - output_path: Path, - context: ExecutionContext, - input_data: FrameCollection, - ) -> bool: - return self._check_cache_validity( - output_path, - context, - input_data.episode_id, - 'cached video embeddings', - ) - - def _load_cached_result( # pylint: disable=duplicate-code - self, - output_path: Path, - input_data: FrameCollection, - ) -> EmbeddingCollection: - emb_data: Dict[str, Any] = FileOperations.load_json(output_path) - return self._create_embedding_collection( - input_data, - output_path, - len(emb_data.get('video_embeddings', [])), - ) - - @staticmethod - def _load_frame_requests( - input_data: FrameCollection, - context: ExecutionContext, - ) -> List[Dict[str, Any]]: - frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) - frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) - if not frame_requests: - context.logger.warning(f'No frames for embedding in {input_data.episode_id}') - return frame_requests - - def _ensure_model_loaded(self) -> 
None: - if self._model is None: - self._model = EmbeddingModelWrapper(self.config.model_name, self.config.device) - self._model.load_model() # pylint: disable=no-member - - def _generate_embeddings( - self, - frame_requests: List[Dict[str, Any]], - input_data: FrameCollection, - image_hashes: Dict[int, str], + return self.__construct_embedding_collection(input_data, output_path, len(results)) + + def __prepare_embedding_model(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info('Initializing embedding model...') + self.__model = EmbeddingModelWrapper(self.config.model_name, self.config.device) + self.__model.load_model() + + def __generate_embeddings( + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, + image_hashes: Dict[int, str], ) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] batch_size: int = self.config.batch_size @@ -119,7 +77,7 @@ def _generate_embeddings( for i in range(0, len(frame_requests), batch_size): batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] image_paths: List[str] = [str(input_data.directory / f['frame_path']) for f in batch] - batch_embeddings: List[np.ndarray] = self._model.encode_images(image_paths) # pylint: disable=no-member + batch_embeddings: List[np.ndarray] = self.__model.encode_images(image_paths) for request, embedding in zip(batch, batch_embeddings): res: Dict[str, Any] = {**request, 'embedding': embedding.tolist()} @@ -130,12 +88,39 @@ def _generate_embeddings( return results + def __load_cached_result( + self, + output_path: Path, + input_data: FrameCollection, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = FileOperations.load_json(output_path) + return self.__construct_embedding_collection( + input_data, + output_path, + len(emb_data.get('video_embeddings', [])), + ) + + def __construct_embedding_collection( # pylint: disable=duplicate-code + self, + input_data: FrameCollection, + output_path: Path, + embedding_count: int, + ) 
-> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='video', + ) + @staticmethod - def _save_results( - results: List[Dict[str, Any]], - output_path: Path, - input_data: FrameCollection, - image_hashes: Dict[int, str], + def __save_embedding_results( + results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + image_hashes: Dict[int, str], ) -> None: statistics = { 'total_embeddings': len(results), @@ -151,30 +136,34 @@ def _save_results( ) FileOperations.atomic_write_json(output_path, output_data) - def _create_embedding_collection( # pylint: disable=duplicate-code - self, - input_data: FrameCollection, - output_path: Path, - embedding_count: int, - ) -> EmbeddingCollection: - return MetadataBuilder.create_embedding_collection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - model_name=self.config.model_name, - embedding_count=embedding_count, - embedding_type='video', - ) + @staticmethod + def __resolve_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: + filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' + output_filename: str = f'{filename_base}_embeddings_video.json' + return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) + + @staticmethod + def __extract_frame_requests( + input_data: FrameCollection, + context: ExecutionContext, + ) -> List[Dict[str, Any]]: + frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) + frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) + if not frame_requests: + context.logger.warning(f'No frames for embedding in {input_data.episode_id}') + return frame_requests @staticmethod - def __load_image_hashes( - 
input_data: FrameCollection, context: ExecutionContext, + def __fetch_image_hashes( + input_data: FrameCollection, context: ExecutionContext, ) -> Dict[int, str]: filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' hash_filename: str = f'{filename_base}_image_hashes.json' hash_path: Path = context.get_output_path(input_data.episode_info, 'image_hashes', hash_filename) + if not hash_path.exists(): return {} + try: data: Dict[str, Any] = FileOperations.load_json(hash_path) return {h['frame_number']: h['perceptual_hash'] for h in data.get('hashes', [])} diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index d1db207d4..05370d7a4 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -23,69 +23,46 @@ class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]): - def __init__(self, config: EmotionDetectionConfig) -> None: super().__init__(config) - self._model: Optional[HSEmotionRecognizer] = None + self.__model: Optional[HSEmotionRecognizer] = None + + @property + def name(self) -> str: + return 'emotion_detection' def cleanup(self) -> None: - self._model = None + self.__model = None def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData: - detections_path = self._get_character_detections_path(input_data, context) + detections_path = self.__resolve_detections_path(input_data, context) if self._check_cache_validity(detections_path, context, input_data.episode_id, 'cached emotion detection'): - return EmotionData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=detections_path, - ) + return self.__construct_emotion_data(input_data, detections_path) if not detections_path.exists(): context.logger.warning( f'No character detections found for emotion analysis: {detections_path}', ) - return EmotionData( - 
episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=detections_path, - ) + return self.__construct_emotion_data(input_data, detections_path) context.logger.info(f'Detecting emotions for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - self._ensure_model_loaded(context) + self.__prepare_emotion_model(context) detections_data = FileOperations.load_json(detections_path) - self._process_emotions(detections_data, input_data, context) + self.__process_and_update_emotions(detections_data, input_data, context) FileOperations.atomic_write_json(detections_path, detections_data) context.mark_step_completed(self.name, input_data.episode_id) - return EmotionData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=detections_path, - ) + return self.__construct_emotion_data(input_data, detections_path) - @property - def name(self) -> str: - return 'emotion_detection' + def __prepare_emotion_model(self, context: ExecutionContext) -> None: + if self.__model is None: + self.__model = EmotionDetector._init_model(context.logger) - @staticmethod - def _get_character_detections_path( - input_data: FrameCollection, context: ExecutionContext, - ) -> Path: - filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename}_character_detections.json' - return context.get_output_path( - input_data.episode_info, 'character_detections', output_filename, - ) - - def _ensure_model_loaded(self, context: ExecutionContext) -> None: - if self._model is None: - self._model = EmotionDetector._init_model(context.logger) - - def _process_emotions( + def __process_and_update_emotions( self, detections_data: Dict[str, Any], input_data: FrameCollection, @@ -93,7 +70,7 @@ def _process_emotions( ) -> None: detections: List[Dict[str, Any]] = detections_data.get('detections', []) - face_crops, face_metadata = self._collect_face_crops( + face_crops, face_metadata = 
self.__collect_face_crops( detections, input_data.directory, context, ) @@ -103,13 +80,33 @@ def _process_emotions( context.logger.info(f'Processing {len(face_crops)} faces with HSEmotion model') emotion_results = EmotionDetector._detect_batch( - face_crops, self._model, batch_size=32, logger=context.logger, + face_crops, self.__model, batch_size=32, logger=context.logger, + ) + + self.__apply_emotion_results(detections, emotion_results, face_metadata, context) + + @staticmethod + def __resolve_detections_path( + input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' + output_filename: str = f'{filename}_character_detections.json' + return context.get_output_path( + input_data.episode_info, 'character_detections', output_filename, ) - self._apply_emotion_results(detections, emotion_results, face_metadata, context) + @staticmethod + def __construct_emotion_data( + input_data: FrameCollection, detections_path: Path, + ) -> EmotionData: + return EmotionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=detections_path, + ) @staticmethod - def _collect_face_crops( + def __collect_face_crops( detections: List[Dict[str, Any]], frames_dir: Path, context: ExecutionContext, @@ -152,7 +149,7 @@ def _collect_face_crops( return face_crops, face_metadata @staticmethod - def _apply_emotion_results( + def __apply_emotion_results( detections: List[Dict[str, Any]], emotion_results: List[Optional[Tuple[str, float, Dict[str, float]]]], face_metadata: List[Dict[str, int]], diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py index 1a55a11e5..304abafb4 100644 --- a/preprocessor/steps/vision/face_clustering_step.py +++ b/preprocessor/steps/vision/face_clustering_step.py @@ -10,19 +10,49 @@ class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): + @property + def 
name(self) -> str: + return 'face_clustering' + + def execute( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> ClusterData: + output_path = self.__resolve_output_path(input_data, context) + + if self.__is_execution_cached(output_path, input_data.episode_id, context): + context.logger.info(f'Skipping {input_data.episode_id} (cached face clustering)') + return self.__construct_cluster_data(input_data, output_path) - def execute(self, input_data: FrameCollection, context: ExecutionContext) -> ClusterData: - output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_clusters.json' - output_path: Path = context.get_output_path(input_data.episode_info, 'face_clusters', output_filename) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached face clustering)') - return ClusterData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) context.logger.info(f'Clustering faces for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) + context.mark_step_completed(self.name, input_data.episode_id) - return ClusterData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + return self.__construct_cluster_data(input_data, output_path) - @property - def name(self) -> str: - return 'face_clustering' + def __is_execution_cached( + self, output_path: Path, episode_id: str, context: ExecutionContext, + ) -> bool: + if not output_path.exists(): + return False + if context.force_rerun: + return False + return context.is_step_completed(self.name, episode_id) + + @staticmethod + def __resolve_output_path( + input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_clusters.json' + return 
context.get_output_path( + input_data.episode_info, 'face_clusters', output_filename, + ) + + @staticmethod + def __construct_cluster_data( + input_data: FrameCollection, output_path: Path, + ) -> ClusterData: + return ClusterData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index 0b83d63e0..326b52459 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -1,4 +1,4 @@ -# pylint: disable=cyclic-import # False positive - config uses import-outside-toplevel +# pylint: disable=cyclic-import import gc from pathlib import Path from typing import ( @@ -6,6 +6,7 @@ Dict, List, Optional, + Tuple, ) import torch @@ -23,35 +24,39 @@ class ImageHashStep(PipelineStep[FrameCollection, ImageHashCollection, ImageHashConfig]): - def __init__(self, config: ImageHashConfig) -> None: super().__init__(config) - self._hasher: Optional[PerceptualHasher] = None + self.__hasher: Optional[PerceptualHasher] = None + + @property + def name(self) -> str: + return 'image_hashing' def cleanup(self) -> None: - self._hasher = None + self.__hasher = None self.__cleanup_memory() def execute( - self, input_data: FrameCollection, context: ExecutionContext, + self, input_data: FrameCollection, context: ExecutionContext, ) -> ImageHashCollection: - output_path = self._get_output_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): - return self._load_cached_result(output_path, input_data) + return self.__load_cached_result(output_path, input_data) - frame_metadata, frame_requests = self._load_frame_metadata(input_data, context) + frame_metadata, frame_requests = self.__load_frame_metadata(input_data, context) if not frame_requests: - return 
self._create_empty_result(output_path, input_data) + return self.__construct_empty_result(output_path, input_data) + + self.__prepare_hasher(context) - self._ensure_hasher_loaded(context) context.logger.info( f'Computing hashes for {len(frame_requests)} frames in {input_data.episode_id}', ) context.mark_step_started(self.name, input_data.episode_id) - hash_results = self._compute_hashes(frame_requests, input_data) - self._save_results(hash_results, output_path, input_data, context, frame_metadata) + hash_results = self.__compute_hashes(frame_requests, input_data) + self.__save_hash_results(hash_results, output_path, input_data, context, frame_metadata) context.mark_step_completed(self.name, input_data.episode_id) self.__cleanup_memory() @@ -63,19 +68,43 @@ def execute( hash_count=len(hash_results), ) - @property - def name(self) -> str: - return 'image_hashing' + def __prepare_hasher(self, context: ExecutionContext) -> None: + if self.__hasher is None: + context.logger.info(f'Loading image hasher on {self.config.device}...') + self.__hasher = PerceptualHasher() + + def __compute_hashes( + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, + ) -> List[Dict[str, Any]]: + hash_results: List[Dict[str, Any]] = [] + batch_size: int = self.config.batch_size + + for i in range(0, len(frame_requests), batch_size): + batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] + pil_images = FrameLoader.load_from_requests(input_data.directory, batch) + phashes: List[str] = self.__hasher.compute_phash_batch(pil_images) + + for request, phash in zip(batch, phashes): + result: Dict[str, Any] = request.copy() + result['perceptual_hash'] = phash + hash_results.append(result) + + del pil_images + if i % (batch_size * 5) == 0: + self.__cleanup_memory() + + return hash_results @staticmethod - def _get_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: + def __resolve_output_path(input_data: FrameCollection, context: 
ExecutionContext) -> Path: filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' output_filename: str = f'{filename_base}_image_hashes.json' return context.get_output_path(input_data.episode_info, 'image_hashes', output_filename) - @staticmethod - def _load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: + def __load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: hash_data: Dict[str, Any] = FileOperations.load_json(output_path) return ImageHashCollection( episode_id=input_data.episode_id, @@ -85,20 +114,22 @@ def _load_cached_result(output_path: Path, input_data: FrameCollection) -> Image ) @staticmethod - def _load_frame_metadata( - input_data: FrameCollection, - context: ExecutionContext, - ) -> tuple[Dict[str, Any], List[Dict[str, Any]]]: + def __load_frame_metadata( + input_data: FrameCollection, + context: ExecutionContext, + ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) + if not frame_requests: context.logger.warning(f'No frames to hash for {input_data.episode_id}') + return frame_metadata, frame_requests @staticmethod - def _create_empty_result( - output_path: Path, - input_data: FrameCollection, + def __construct_empty_result( + output_path: Path, + input_data: FrameCollection, ) -> ImageHashCollection: return ImageHashCollection( episode_id=input_data.episode_id, @@ -107,42 +138,13 @@ def _create_empty_result( hash_count=0, ) - def _ensure_hasher_loaded(self, context: ExecutionContext) -> None: - if self._hasher is None: - context.logger.info(f'Loading image hasher on {self.config.device}...') - self._hasher = PerceptualHasher() - - def _compute_hashes( - self, - frame_requests: List[Dict[str, Any]], - input_data: FrameCollection, - ) -> List[Dict[str, Any]]: - hash_results: List[Dict[str, Any]] 
= [] - batch_size: int = self.config.batch_size - - for i in range(0, len(frame_requests), batch_size): - batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] - pil_images = FrameLoader.load_from_requests(input_data.directory, batch) - phashes: List[str] = self._hasher.compute_phash_batch(pil_images) # pylint: disable=no-member - - for request, phash in zip(batch, phashes): - result: Dict[str, Any] = request.copy() - result['perceptual_hash'] = phash - hash_results.append(result) - - del pil_images - if i % (batch_size * 5) == 0: - self.__cleanup_memory() - - return hash_results - @staticmethod - def _save_results( - hash_results: List[Dict[str, Any]], - output_path: Path, - input_data: FrameCollection, - context: ExecutionContext, - frame_metadata: Dict[str, Any], + def __save_hash_results( + hash_results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_metadata: Dict[str, Any], ) -> None: output_data: Dict[str, Any] = { 'episode_id': input_data.episode_id, diff --git a/preprocessor/steps/vision/object_detection_step.py b/preprocessor/steps/vision/object_detection_step.py index 66cce8bb5..2f974cb97 100644 --- a/preprocessor/steps/vision/object_detection_step.py +++ b/preprocessor/steps/vision/object_detection_step.py @@ -10,19 +10,49 @@ class ObjectDetectionStep(PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig]): + @property + def name(self) -> str: + return 'object_detection' + + def execute( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> ObjectDetectionData: + output_path = self.__resolve_output_path(input_data, context) + + if self.__is_execution_cached(output_path, input_data.episode_id, context): + context.logger.info(f'Skipping {input_data.episode_id} (cached object detection)') + return self.__construct_object_data(input_data, output_path) - def execute(self, input_data: FrameCollection, context: ExecutionContext) -> ObjectDetectionData: 
- output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_objects.json' - output_path: Path = context.get_output_path(input_data.episode_info, 'object_detections', output_filename) - if output_path.exists() and (not context.force_rerun): - if context.is_step_completed(self.name, input_data.episode_id): - context.logger.info(f'Skipping {input_data.episode_id} (cached object detection)') - return ObjectDetectionData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) context.logger.info(f'Detecting objects for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) + context.mark_step_completed(self.name, input_data.episode_id) - return ObjectDetectionData(episode_id=input_data.episode_id, episode_info=input_data.episode_info, path=output_path) + return self.__construct_object_data(input_data, output_path) - @property - def name(self) -> str: - return 'object_detection' + def __is_execution_cached( + self, output_path: Path, episode_id: str, context: ExecutionContext, + ) -> bool: + if not output_path.exists(): + return False + if context.force_rerun: + return False + return context.is_step_completed(self.name, episode_id) + + @staticmethod + def __resolve_output_path( + input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_objects.json' + return context.get_output_path( + input_data.episode_info, 'object_detections', output_filename, + ) + + @staticmethod + def __construct_object_data( + input_data: FrameCollection, output_path: Path, + ) -> ObjectDetectionData: + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) From 8663d2b164754f21e1e1152cf82f140c4f988ec2 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 11:20:14 +0100 Subject: [PATCH 
29/89] Refactor: static methods, typing and renames Large refactor across the preprocessor package: convert many instance helpers to @staticmethods, tighten typing, and rename parameters/fields for clarity. Key changes: replace Settings._from_env with Settings.from_env and update SettingsFactory; introduce/adjust FrameRequest fields (frame_number, timestamp) and propagate that type through keyframe strategies; make numerous helper methods static (console, validators, generators, FFmpeg parsers, embedding computation, etc.); normalize method names in emotion and scene detection (init_model/detect_batch/crop_face, get_video_info); make PathService.suffix optional; avoid mutating TranscodeParams by using dataclasses.replace in transcoding step; simplify calls (safe_resize rename and call sites); format search CLI output into a single joinable list. Also includes small typing fixes, minor logic cleanups, and other consistency improvements. --- preprocessor/app/pipeline_builder.py | 4 +-- preprocessor/cli/cli_main.py | 8 +++--- preprocessor/cli/search_handler.py | 11 ++++---- preprocessor/config/config.py | 26 +++++++++---------- preprocessor/config/settings_factory.py | 2 +- preprocessor/config/types/frame.py | 6 ++--- preprocessor/services/ai/clients.py | 2 +- preprocessor/services/io/path_service.py | 7 +++-- preprocessor/services/media/ffmpeg.py | 13 +++++----- .../services/media/scene_detection.py | 4 +-- .../services/scraping/base_scraper.py | 26 ++++++++++++++++++- .../services/scraping/grid_visualizer.py | 10 ++++--- .../services/scraping/reference_processor.py | 9 +++---- .../search/clients/elasticsearch_queries.py | 6 +++-- .../search/clients/embedding_service.py | 3 ++- .../generators/base_generator.py | 3 ++- .../generators/multi_format_generator.py | 8 ++++-- .../processors/audio_normalizer.py | 3 ++- .../processors/normalized_audio_processor.py | 6 ++--- preprocessor/services/ui/console.py | 9 ++++--- .../services/validation/report_generator.py | 3 ++- 
preprocessor/services/validation/validator.py | 9 ++++--- .../validators/elastic_validator.py | 3 ++- .../validators/face_cluster_validator.py | 4 +-- .../validation/validators/object_validator.py | 6 +++-- .../validation/validators/scene_validator.py | 6 +++-- .../validators/transcription_validator.py | 9 ++++--- .../validation/validators/video_validator.py | 6 +++-- preprocessor/services/video/emotion_utils.py | 6 ++--- .../video/strategies/base_strategy.py | 4 ++- .../strategies/scene_changes_strategy.py | 17 ++++++------ .../video/strategies/strategy_factory.py | 3 ++- .../analysis/resolution_analysis_step.py | 11 +++++--- preprocessor/steps/search/indexing_step.py | 2 +- preprocessor/steps/text/embeddings_step.py | 2 +- .../steps/video/scene_detection_step.py | 2 +- preprocessor/steps/video/transcoding_step.py | 9 +++---- .../steps/vision/character_detection_step.py | 2 +- .../steps/vision/emotion_detection_step.py | 6 ++--- 39 files changed, 168 insertions(+), 108 deletions(-) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index d4eb8b773..d51a9330e 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -43,8 +43,8 @@ def execute_step( self.__context.logger.info(f"Step: {step_id}") self.__context.logger.info(f"{step_def.description}") - StepClass = step_def.load_class() - instance = StepClass(step_def.config) + step_class = step_def.load_class() + instance = step_class(step_def.config) runner = PipelineExecutor(self.__context) runner.add_step(instance) diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 190825e23..f5e83171c 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -1,12 +1,10 @@ import asyncio from pathlib import Path import sys -from typing import ( - Callable, - Tuple, -) +from typing import Tuple import click +from click import Command from elasticsearch import AsyncElasticsearch from 
preprocessor.app.pipeline_builder import PipelineExecutor @@ -82,7 +80,7 @@ def __run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: setup.logger.finalize() -def __create_step_command(step_id: str, step_description: str) -> Callable: +def __create_step_command(step_id: str, step_description: str) -> Command: @click.command(name=step_id.replace("_", "-"), help=f"{step_description}") @click.option("--series", required=True, help="Series name (e.g., ranczo)") @click.option("--force-rerun", is_flag=True, help="Force rerun even if cached") diff --git a/preprocessor/cli/search_handler.py b/preprocessor/cli/search_handler.py index 14e82c8a4..797552275 100644 --- a/preprocessor/cli/search_handler.py +++ b/preprocessor/cli/search_handler.py @@ -66,11 +66,12 @@ async def handle_stats(self) -> str: if self.__json_output: return json.dumps(result, indent=2) - output = ["\nStatystyki:"] - output.append(f" Segments: {result['segments']:,}") - output.append(f" Text Embeddings: {result['text_embeddings']:,}") - output.append(f" Video Embeddings: {result['video_embeddings']:,}") - output.append(f" Episode Names: {result['episode_names']:,}") + output = [ + "\nStatystyki:", f" Segments: {result['segments']:,}", + f" Text Embeddings: {result['text_embeddings']:,}", + f" Video Embeddings: {result['video_embeddings']:,}", + f" Episode Names: {result['episode_names']:,}", + ] return "\n".join(output) async def handle_list_characters(self) -> str: diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index e0ded6ed1..176a33d1c 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -114,7 +114,7 @@ class WhisperSettings: model: str = 'large-v3-turbo' @classmethod - def _from_env(cls) -> 'WhisperSettings': + def from_env(cls) -> 'WhisperSettings': return cls(model=os.getenv('WHISPER_MODEL', 'large-v3-turbo')) @@ -134,7 +134,7 @@ class ElevenLabsSettings(BaseAPISettings): polling_interval: int = 20 @classmethod - def 
_from_env(cls) -> 'ElevenLabsSettings': + def from_env(cls) -> 'ElevenLabsSettings': api_key = None if os.getenv('ELEVEN_API_KEY'): api_key = SecretStr(os.getenv('ELEVEN_API_KEY', '')) @@ -186,7 +186,7 @@ class EmotionDetectionSettings: model_name: str = 'enet_b2_8' @classmethod - def _from_env(cls) -> 'EmotionDetectionSettings': + def from_env(cls) -> 'EmotionDetectionSettings': model_name = os.getenv('EMOTION_MODEL_NAME', 'enet_b2_8') return cls(model_name=model_name) @@ -231,7 +231,7 @@ def serpapi_key(self) -> Optional[str]: return self.api_key @classmethod - def _from_env(cls) -> 'ImageScraperSettings': + def from_env(cls) -> 'ImageScraperSettings': api_key = None if os.getenv('SERPAPI_API_KEY'): api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) @@ -250,7 +250,7 @@ class ElasticsearchSettings: user: str = '' @classmethod - def _from_env(cls) -> 'ElasticsearchSettings': + def from_env(cls) -> 'ElasticsearchSettings': return cls( host=os.getenv('ES_HOST', ''), user=os.getenv('ES_USER', ''), @@ -261,7 +261,7 @@ def _from_env(cls) -> 'ElasticsearchSettings': @dataclass(frozen=True) class GeminiSettings(BaseAPISettings): @classmethod - def _from_env(cls) -> 'GeminiSettings': + def from_env(cls) -> 'GeminiSettings': api_key = None if os.getenv('GEMINI_API_KEY'): api_key = SecretStr(os.getenv('GEMINI_API_KEY', '')) @@ -293,10 +293,10 @@ class Settings: # pylint: disable=too-many-instance-attributes whisper: WhisperSettings @classmethod - def _from_env(cls) -> 'Settings': + def from_env(cls) -> 'Settings': return cls( output_subdirs=OutputSubdirs(), - whisper=WhisperSettings._from_env(), + whisper=WhisperSettings.from_env(), text_chunking=TextChunkingSettings(), embedding_model=EmbeddingModelSettings(), embedding=EmbeddingSettings(), @@ -309,11 +309,11 @@ def _from_env(cls) -> 'Settings': object_detection=ObjectDetectionSettings(), face_recognition=FaceRecognitionSettings(), face_clustering=FaceClusteringSettings(), - 
emotion_detection=EmotionDetectionSettings._from_env(), - image_scraper=ImageScraperSettings._from_env(), - elevenlabs=ElevenLabsSettings._from_env(), - elasticsearch=ElasticsearchSettings._from_env(), - gemini=GeminiSettings._from_env(), + emotion_detection=EmotionDetectionSettings.from_env(), + image_scraper=ImageScraperSettings.from_env(), + elevenlabs=ElevenLabsSettings.from_env(), + elasticsearch=ElasticsearchSettings.from_env(), + gemini=GeminiSettings.from_env(), transcode=TranscodeSettings(), transcription=TranscriptionSettings(), ) diff --git a/preprocessor/config/settings_factory.py b/preprocessor/config/settings_factory.py index beb52eb5a..915b5a9fc 100644 --- a/preprocessor/config/settings_factory.py +++ b/preprocessor/config/settings_factory.py @@ -9,7 +9,7 @@ class SettingsFactory: @classmethod def get_settings(cls) -> Settings: if cls.__instance is None: - cls.__instance = Settings._from_env() + cls.__instance = Settings.from_env() return cls.__instance @classmethod diff --git a/preprocessor/config/types/frame.py b/preprocessor/config/types/frame.py index 94832464e..70ffedef9 100644 --- a/preprocessor/config/types/frame.py +++ b/preprocessor/config/types/frame.py @@ -5,7 +5,7 @@ class FrameRequest(TypedDict): - frame: int - scene_number: NotRequired[int] - time: float + frame_number: int + timestamp: float type: str + scene_number: NotRequired[int] diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index ac0dd02ec..9e785c0d7 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -80,7 +80,7 @@ def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> s response = self.__client.chat.completions.create( model=self.__GEMINI_MODEL_NAME, - messages=messages, + messages=messages, # type: ignore[arg-type] ) return response.choices[0].message.content.strip() diff --git a/preprocessor/services/io/path_service.py b/preprocessor/services/io/path_service.py index 
1f0e03ef8..06d2f76ed 100644 --- a/preprocessor/services/io/path_service.py +++ b/preprocessor/services/io/path_service.py @@ -1,5 +1,8 @@ from pathlib import Path -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + Optional, +) from preprocessor.config.output_paths import get_base_output_dir from preprocessor.services.core.environment import Environment @@ -16,7 +19,7 @@ def build_filename( self, episode_info: 'EpisodeInfo', extension: str = 'json', - suffix: str = '', + suffix: Optional[str] = None, ) -> str: base = f'{self.__series_name}_{episode_info.episode_code()}' suffix_str = f'_{suffix}' if suffix else '' diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index ed3b61e25..6b1830ab7 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -8,6 +8,7 @@ List, Optional, Tuple, + Union, ) from preprocessor.services.media.transcode_params import TranscodeParams @@ -30,7 +31,7 @@ def detect_interlacing( video_path: Path, analysis_time: Optional[int] = 60, threshold: float = 0.15, - ) -> Tuple[bool, Optional[Dict[str, Any]]]: + ) -> Tuple[bool, Optional[Dict[str, Union[int, float]]]]: cmd = ['ffmpeg'] if analysis_time is not None: @@ -126,17 +127,17 @@ def get_resolution(probe_data: Dict[str, Any]) -> Tuple[int, int]: def get_sample_aspect_ratio(probe_data: Dict[str, Any]) -> Tuple[int, int]: stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') if not stream: - return (1, 1) + return 1, 1 sar = stream.get('sample_aspect_ratio', '1:1') if sar == '0:1' or not sar: - return (1, 1) + return 1, 1 try: num, denom = [int(x) for x in sar.split(':')] - return (num, denom) + return num, denom except (ValueError, AttributeError): - return (1, 1) + return 1, 1 @staticmethod def get_field_order(probe_data: Dict[str, Any]) -> str: @@ -304,7 +305,7 @@ def __get_stream_by_type(probe_data: Dict[str, Any], codec_type: str) -> Optiona return streams[0] if streams else 
None @staticmethod - def __parse_idet_output(stderr: str) -> Optional[Dict[str, int]]: + def __parse_idet_output(stderr: str) -> Optional[Dict[str, Union[int, float]]]: matches = re.findall( r'Multi frame detection:\s+TFF:\s*(\d+)\s+BFF:\s*(\d+)\s+Progressive:\s*(\d+)', stderr, diff --git a/preprocessor/services/media/scene_detection.py b/preprocessor/services/media/scene_detection.py index dc34753e2..91f17b669 100644 --- a/preprocessor/services/media/scene_detection.py +++ b/preprocessor/services/media/scene_detection.py @@ -40,7 +40,7 @@ def detect_scenes( if self.__model is None: raise RuntimeError('Model not loaded. Call load_model() first.') - video_info = self.__get_video_info(video_path) + video_info = self.get_video_info(video_path) if not video_info: raise RuntimeError(f'Failed to get video info for {video_path}') @@ -112,7 +112,7 @@ def __frame_to_timecode(frame: int, fps: float) -> str: return f'{hours:02d}:{minutes:02d}:{secs:02d}:{frames:02d}' @staticmethod - def __get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: + def get_video_info(video_file: Path) -> Optional[Dict[str, Any]]: try: vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) fps = vr.get_avg_fps() diff --git a/preprocessor/services/scraping/base_scraper.py b/preprocessor/services/scraping/base_scraper.py index bd74aa88f..2d25e0524 100644 --- a/preprocessor/services/scraping/base_scraper.py +++ b/preprocessor/services/scraping/base_scraper.py @@ -15,7 +15,11 @@ ) from preprocessor.config.settings_instance import settings from preprocessor.services.ai import LLMProvider -from preprocessor.services.core.base_processor import BaseProcessor +from preprocessor.services.core.base_processor import ( + BaseProcessor, + OutputSpec, + ProcessingItem, +) from preprocessor.services.scraping.clipboard import ScraperClipboard from preprocessor.services.scraping.crawl4ai import ScraperCrawl4AI from preprocessor.services.ui.console import console @@ -98,3 +102,23 @@ def 
__run_scraper(self, url: str) -> Optional[str]: ) return None + + def get_output_subdir(self) -> str: + return 'scraper' + + def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: + return [] + + def _get_processing_items(self) -> List[ProcessingItem]: + return [] + + def _process_item( + self, item: ProcessingItem, missing_outputs: List[OutputSpec], + ) -> None: + pass + + def _validate_args(self, args: Dict[str, Any]) -> None: + if 'urls' not in args: + raise ValueError("Missing required argument: 'urls'") + if 'output_file' not in args: + raise ValueError("Missing required argument: 'output_file'") diff --git a/preprocessor/services/scraping/grid_visualizer.py b/preprocessor/services/scraping/grid_visualizer.py index 8f73ccec5..8b53b44e5 100644 --- a/preprocessor/services/scraping/grid_visualizer.py +++ b/preprocessor/services/scraping/grid_visualizer.py @@ -84,7 +84,8 @@ def generate_grid( 'avg_similarity': avg_similarity, } - def __empty_result(self) -> Dict[str, Any]: + @staticmethod + def __empty_result() -> Dict[str, Any]: return { 'width': 0, 'height': 0, @@ -92,7 +93,8 @@ def __empty_result(self) -> Dict[str, Any]: 'avg_similarity': 0.0, } - def __get_processed_characters(self, dir_path: Path) -> List[Path]: + @staticmethod + def __get_processed_characters(dir_path: Path) -> List[Path]: return sorted([d for d in dir_path.iterdir() if d.is_dir()]) def __create_canvas(self, num_chars: int) -> np.ndarray: @@ -238,7 +240,7 @@ def __render_character_faces(self, canvas: np.ndarray, char_dir: Path, y_offset: if face_img is None: continue - face_resized = CharacterGridVisualizer._safe_resize( + face_resized = CharacterGridVisualizer.safe_resize( face_img, (self.__dims.face_size, self.__dims.face_size), ) @@ -317,7 +319,7 @@ def __calculate_avg_similarity(metadata_all: List[Dict[str, Any]]) -> float: return float(np.mean([m.get('average_similarity', 0) for m in metadata_all])) @staticmethod - def _safe_resize(img: np.ndarray, target_size: 
Tuple[int, int]) -> Optional[np.ndarray]: + def safe_resize(img: np.ndarray, target_size: Tuple[int, int]) -> Optional[np.ndarray]: if img is None or img.size == 0: return None if img.shape[0] == 0 or img.shape[1] == 0: diff --git a/preprocessor/services/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py index 2757264f6..ceb4b6895 100644 --- a/preprocessor/services/scraping/reference_processor.py +++ b/preprocessor/services/scraping/reference_processor.py @@ -104,7 +104,7 @@ def _process_item(self, item: ProcessingItem, _missing_outputs: List[OutputSpec] console.print(f'[yellow]Skipping {char_name}: no faces detected[/yellow]') return - selected_faces = self.__find_common_face(all_faces, char_name, ref_images) + selected_faces = self.__find_common_face(all_faces) if not selected_faces: console.print(f'[yellow]Skipping {char_name}: could not identify common face[/yellow]') return @@ -154,8 +154,6 @@ def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[Fac def __find_common_face( self, all_faces: List[List[FaceData]], - char_name: str, # pylint: disable=unused-argument - ref_images: List[Path], # pylint: disable=unused-argument ) -> Optional[List[FaceData]]: first_faces = all_faces[0] candidates = self.__find_face_candidates(first_faces, all_faces[1:], all_faces) @@ -190,7 +188,8 @@ def __find_face_candidates( return candidates - def __get_best_match(self, ref_face: FaceData, candidates: List[FaceData]) -> Tuple[Optional[FaceData], float]: + @staticmethod + def __get_best_match(ref_face: FaceData, candidates: List[FaceData]) -> Tuple[Optional[FaceData], float]: best_match, best_sim = None, -1.0 for cand in candidates: sim = float(np.dot(ref_face.face_vector, cand.face_vector)) @@ -207,7 +206,7 @@ def __save_processed_references( face_vectors = [] for idx, face_data in enumerate(selected_faces): - norm_face = CharacterGridVisualizer._safe_resize( + norm_face = CharacterGridVisualizer.safe_resize( 
face_data.face_img, settings.character.normalized_face_size, ) diff --git a/preprocessor/services/search/clients/elasticsearch_queries.py b/preprocessor/services/search/clients/elasticsearch_queries.py index 0ed25794f..7901709bb 100644 --- a/preprocessor/services/search/clients/elasticsearch_queries.py +++ b/preprocessor/services/search/clients/elasticsearch_queries.py @@ -102,7 +102,8 @@ async def __execute_knn_query( } return await es_client.search(index=index, knn=knn, size=limit) - def __build_episode_filters(self, season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: + @staticmethod + def __build_episode_filters(season: Optional[int], episode: Optional[int]) -> List[Dict[str, Any]]: filters = [] if season is not None: filters.append({'term': {'episode_metadata.season': season}}) @@ -123,7 +124,8 @@ def __build_nested_filter(emotion: str, character: Optional[str]) -> Dict[str, A }, } - async def __list_nested_terms(self, es_client: AsyncElasticsearch, index: str, path: str, field: str) -> List[ + @staticmethod + async def __list_nested_terms(es_client: AsyncElasticsearch, index: str, path: str, field: str) -> List[ Tuple[str, int] ]: result = await es_client.search( diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index 0eedc3b1c..22313bf75 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -61,7 +61,8 @@ def get_text_embedding(self, text: str) -> List[float]: ).to(device) return self.__compute_normalized_embedding(model, {'input_ids': inputs}) - def __compute_normalized_embedding(self, model: Any, inputs: Dict[str, Any]) -> List[float]: + @staticmethod + def __compute_normalized_embedding(model: Any, inputs: Dict[str, Any]) -> List[float]: with torch.no_grad(): output = model(**inputs, output_hidden_states=True) embedding = output.hidden_states[-1][:, -1, :].squeeze(0) diff 
--git a/preprocessor/services/transcription/generators/base_generator.py b/preprocessor/services/transcription/generators/base_generator.py index f5cec09c7..46e376e34 100644 --- a/preprocessor/services/transcription/generators/base_generator.py +++ b/preprocessor/services/transcription/generators/base_generator.py @@ -28,7 +28,8 @@ def generate(self) -> None: except Exception as e: self._logger.error(f'Failed to generate output for {json_file}: {e}') - def __load_json(self, file_path: Path) -> Dict[str, Any]: + @staticmethod + def __load_json(file_path: Path) -> Dict[str, Any]: with open(file_path, 'r', encoding='utf-8') as f: return json.load(f) diff --git a/preprocessor/services/transcription/generators/multi_format_generator.py b/preprocessor/services/transcription/generators/multi_format_generator.py index b8e8605cc..657824e64 100644 --- a/preprocessor/services/transcription/generators/multi_format_generator.py +++ b/preprocessor/services/transcription/generators/multi_format_generator.py @@ -3,6 +3,7 @@ from typing import ( Any, Dict, + Literal, Optional, ) @@ -20,7 +21,7 @@ def __init__( self, jsons_dir: Path, episodes_info_json: Path, - output_base_path: Path, # pylint: disable=unused-argument + _output_base_path: Path, logger: ErrorHandlingLogger, series_name: str = '', ) -> None: @@ -65,7 +66,10 @@ def __generate_all_formats(self, transcription: Dict[str, Any], episode_info: An self.__save_srt(transcription, episode_info, base_dir) self.__save_txt(transcription, episode_info, base_dir) - def __save_json(self, data: Dict[str, Any], ep_info: Any, out_dir: Path, fmt: str) -> None: + def __save_json( + self, data: Dict[str, Any], ep_info: Any, out_dir: Path, + fmt: Literal['full', 'simple', 'segmented'], + ) -> None: gen = JsonGenerator(fmt, Path(''), out_dir, self.__logger) filename = self.__episode_manager.path_manager.build_filename( ep_info, extension='json', suffix=fmt if fmt != 'full' else None, diff --git 
a/preprocessor/services/transcription/processors/audio_normalizer.py b/preprocessor/services/transcription/processors/audio_normalizer.py index 879ed0438..bc84b2593 100644 --- a/preprocessor/services/transcription/processors/audio_normalizer.py +++ b/preprocessor/services/transcription/processors/audio_normalizer.py @@ -76,7 +76,8 @@ def __execute_normalization_pipeline(self, video: Path, audio_idx: int, output: tmp_output.replace(output) self.__logger.info(f'Normalization complete: {output.name}') - def __extract_audio(self, video: Path, audio_idx: int, output: Path) -> None: + @staticmethod + def __extract_audio(video: Path, audio_idx: int, output: Path) -> None: cmd = [ 'ffmpeg', '-y', '-i', str(video), '-map', f'0:{audio_idx}', '-acodec', 'pcm_s16le', '-ar', '48000', '-ac', '1', str(output), diff --git a/preprocessor/services/transcription/processors/normalized_audio_processor.py b/preprocessor/services/transcription/processors/normalized_audio_processor.py index 6b031d0c1..813cc8c50 100644 --- a/preprocessor/services/transcription/processors/normalized_audio_processor.py +++ b/preprocessor/services/transcription/processors/normalized_audio_processor.py @@ -47,8 +47,7 @@ def __init__( def cleanup(self) -> None: self.__logger.info('Purging GPU memory and unloading Whisper model...') - if hasattr(self, '_NormalizedAudioProcessor__whisper_model'): - del self.__whisper_model + del self.__whisper_model gc.collect() if torch.cuda.is_available(): @@ -87,7 +86,8 @@ def __transcribe_file(self, audio_path: Path) -> None: except Exception as e: self.__logger.error(f'Whisper error on {audio_path.name}: {e}') - def __save_results(self, result: dict, path: Path) -> None: + @staticmethod + def __save_results(result: dict, path: Path) -> None: for segment in result.get('segments', []): segment['temperature'] = 0.0 diff --git a/preprocessor/services/ui/console.py b/preprocessor/services/ui/console.py index 41c1aeeae..4269068e3 100644 --- a/preprocessor/services/ui/console.py 
+++ b/preprocessor/services/ui/console.py @@ -72,7 +72,8 @@ def advance(self, task_id: int, step: int = 1) -> None: self.__render_progress(task_id) task['last_print_time'] = current_time - def __should_render(self, task: Dict[str, Any], current_time: float) -> bool: + @staticmethod + def __should_render(task: Dict[str, Any], current_time: float) -> bool: is_finished = task['completed'] >= task['total'] is_second_passed = (current_time - task['last_print_time']) >= 1.0 return is_finished or is_second_passed @@ -95,7 +96,8 @@ def __render_progress(self, task_id: int) -> None: highlight=False, ) - def __compute_task_eta(self, task: Dict[str, Any]) -> str: + @staticmethod + def __compute_task_eta(task: Dict[str, Any]) -> str: completed = task['completed'] total = task['total'] @@ -108,7 +110,8 @@ def __compute_task_eta(self, task: Dict[str, Any]) -> str: eta_seconds = (elapsed / completed) * (total - completed) return TimeFormatter.format_hms(eta_seconds) - def __build_visual_bar(self, completed: int, total: int, width: int = 30) -> str: + @staticmethod + def __build_visual_bar(completed: int, total: int, width: int = 30) -> str: if total <= 0: return '-' * width diff --git a/preprocessor/services/validation/report_generator.py b/preprocessor/services/validation/report_generator.py index b413f9049..c4a21321b 100644 --- a/preprocessor/services/validation/report_generator.py +++ b/preprocessor/services/validation/report_generator.py @@ -33,7 +33,8 @@ def generate_report( self.__write_to_disk(report, output_path) return report - def __write_to_disk(self, data: Dict[str, Any], path: Path) -> None: + @staticmethod + def __write_to_disk(data: Dict[str, Any], path: Path) -> None: path.parent.mkdir(parents=True, exist_ok=True) with open(path, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index 3495e4efa..c08419c47 100644 --- 
a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -112,7 +112,8 @@ def __print_execution_summary(self, stats: Dict[str, EpisodeStats], comparison: self.__print_anomalies(comparison) self.__print_issues(stats) - def __build_episode_report_payload(self, stats: EpisodeStats) -> Dict[str, Any]: + @staticmethod + def __build_episode_report_payload(stats: EpisodeStats) -> Dict[str, Any]: return { 'validation_timestamp': datetime.now().isoformat(), 'episode_id': stats.episode_info.episode_code(), @@ -123,7 +124,8 @@ def __build_episode_report_payload(self, stats: EpisodeStats) -> Dict[str, Any]: 'stats': stats.to_dict()['stats'], } - def __print_status_counts(self, stats: Dict[str, EpisodeStats]) -> None: + @staticmethod + def __print_status_counts(stats: Dict[str, EpisodeStats]) -> None: counts = {'PASS': 0, 'WARNING': 0, 'FAIL': 0} for s in stats.values(): counts[s.status] += 1 @@ -131,7 +133,8 @@ def __print_status_counts(self, stats: Dict[str, EpisodeStats]) -> None: console.print(f' [yellow]WARNING:[/yellow] {counts["WARNING"]}') console.print(f' [red]FAIL:[/red] {counts["FAIL"]}') - def __print_anomalies(self, comparison: SeasonComparison) -> None: + @staticmethod + def __print_anomalies(comparison: SeasonComparison) -> None: if not comparison.anomalies: return console.print(f'\n[bold yellow]Anomalies detected: {len(comparison.anomalies)}[/bold yellow]') diff --git a/preprocessor/services/validation/validators/elastic_validator.py b/preprocessor/services/validation/validators/elastic_validator.py index ea601edd3..0d44c794b 100644 --- a/preprocessor/services/validation/validators/elastic_validator.py +++ b/preprocessor/services/validation/validators/elastic_validator.py @@ -120,5 +120,6 @@ def __check_doc_dimension( if actual != expected: self._add_error(stats, f'{fname} line {lnum}: {field} has {actual} dim, expected {expected}') - def __get_dir(self, stats: 'EpisodeStats', subdir: str) -> Path: + @staticmethod + 
def __get_dir(stats: 'EpisodeStats', subdir: str) -> Path: return PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py index bc63844a9..8e011655c 100644 --- a/preprocessor/services/validation/validators/face_cluster_validator.py +++ b/preprocessor/services/validation/validators/face_cluster_validator.py @@ -35,13 +35,13 @@ def validate(self, stats: 'EpisodeStats') -> None: if data: self.__parse_cluster_stats(stats, data) - def __get_metadata_file(self, clusters_dir: Path) -> Optional[Path]: + @staticmethod + def __get_metadata_file(clusters_dir: Path) -> Optional[Path]: files = list(clusters_dir.glob('*_face_clusters.json')) return files[0] if files else None def __parse_cluster_stats(self, stats: 'EpisodeStats', data: Dict[str, Any]) -> None: clusters = data.get('clusters', {}) - total_faces = 0 if isinstance(clusters, (dict, list)): stats.face_clusters_count = len(clusters) diff --git a/preprocessor/services/validation/validators/object_validator.py b/preprocessor/services/validation/validators/object_validator.py index 0d7c37cce..a4dd9240a 100644 --- a/preprocessor/services/validation/validators/object_validator.py +++ b/preprocessor/services/validation/validators/object_validator.py @@ -16,7 +16,8 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__validate_object_detections(stats) self.__validate_object_visualizations(stats) - def __validate_object_detections(self, stats: 'EpisodeStats') -> None: + @staticmethod + def __validate_object_detections(stats: 'EpisodeStats') -> None: JsonDirectoryValidationHelper.validate_json_directory( stats, settings.output_subdirs.object_detections, @@ -25,7 +26,8 @@ def __validate_object_detections(self, stats: 'EpisodeStats') -> None: exclude_pattern='visualizations', ) - def __validate_object_visualizations(self, stats: 'EpisodeStats') -> 
None: + @staticmethod + def __validate_object_visualizations(stats: 'EpisodeStats') -> None: VisualizationValidationHelper.validate_visualizations( stats, settings.output_subdirs.object_visualizations, diff --git a/preprocessor/services/validation/validators/scene_validator.py b/preprocessor/services/validation/validators/scene_validator.py index a81aae5e6..e97340294 100644 --- a/preprocessor/services/validation/validators/scene_validator.py +++ b/preprocessor/services/validation/validators/scene_validator.py @@ -30,7 +30,8 @@ def validate(self, stats: 'EpisodeStats') -> None: if data: self.__extract_scene_stats(stats, data) - def __resolve_scenes_file(self, stats: 'EpisodeStats') -> Path: + @staticmethod + def __resolve_scenes_file(stats: 'EpisodeStats') -> Path: scenes_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.scenes, ) @@ -44,7 +45,8 @@ def __validate_json_integrity(self, stats: 'EpisodeStats', file_path: Path) -> b return False return True - def __extract_scene_stats(self, stats: 'EpisodeStats', data: Dict[str, Any]) -> None: + @staticmethod + def __extract_scene_stats(stats: 'EpisodeStats', data: Dict[str, Any]) -> None: stats.scenes_count = data.get('total_scenes', 0) scenes: List[Dict[str, Any]] = data.get('scenes', []) diff --git a/preprocessor/services/validation/validators/transcription_validator.py b/preprocessor/services/validation/validators/transcription_validator.py index bd6f7ae0e..21cc3dbe2 100644 --- a/preprocessor/services/validation/validators/transcription_validator.py +++ b/preprocessor/services/validation/validators/transcription_validator.py @@ -51,14 +51,16 @@ def __extract_transcription_metrics(self, stats: 'EpisodeStats', raw_path: Path) stats.transcription_words = len(text.split()) stats.transcription_duration = self.__determine_duration(data) - def __get_full_text(self, data: Dict[str, Any]) -> str: + @staticmethod + def __get_full_text(data: Dict[str, Any]) -> str: text = 
data.get('text', '') if not text: segments: List[Dict[str, Any]] = data.get('segments', []) text = ' '.join(s.get('text', '') for s in segments) return text - def __determine_duration(self, data: Dict[str, Any]) -> float: + @staticmethod + def __determine_duration(data: Dict[str, Any]) -> float: words: List[Dict[str, Any]] = data.get('words', []) if words: return words[-1].get('end', 0.0) @@ -86,7 +88,8 @@ def __validate_sound_events(self, stats: 'EpisodeStats', file_path: Path) -> Non invalid_msg_prefix='Invalid sound events JSON', ) - def __resolve_file_map(self, stats: 'EpisodeStats') -> Dict[str, Path]: + @staticmethod + def __resolve_file_map(stats: 'EpisodeStats') -> Dict[str, Path]: path_svc = PathService(stats.series_name) trans_dir = path_svc.get_episode_dir(stats.episode_info, settings.output_subdirs.transcriptions) base = f'{stats.series_name}_{stats.episode_info.episode_code()}' diff --git a/preprocessor/services/validation/validators/video_validator.py b/preprocessor/services/validation/validators/video_validator.py index f20b336f3..d7e75c049 100644 --- a/preprocessor/services/validation/validators/video_validator.py +++ b/preprocessor/services/validation/validators/video_validator.py @@ -26,7 +26,8 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__populate_video_metrics(stats, result.metadata) - def __resolve_video_file_path(self, stats: 'EpisodeStats') -> Path: + @staticmethod + def __resolve_video_file_path(stats: 'EpisodeStats') -> Path: filename = f'{stats.series_name.lower()}_{stats.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' season_dir = ( get_base_output_dir(stats.series_name) / @@ -35,7 +36,8 @@ def __resolve_video_file_path(self, stats: 'EpisodeStats') -> Path: ) return season_dir / filename - def __populate_video_metrics(self, stats: 'EpisodeStats', metadata: dict) -> None: + @staticmethod + def __populate_video_metrics(stats: 'EpisodeStats', metadata: dict) -> None: stats.video_size_mb = metadata['size_mb'] 
stats.video_duration = metadata['duration'] stats.video_codec = metadata['codec'] diff --git a/preprocessor/services/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py index f31d30114..a733aa813 100644 --- a/preprocessor/services/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -27,7 +27,7 @@ def detect( raise RuntimeError(f'Emotion detection failed: {e}') from e @staticmethod - def _init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecognizer: + def init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecognizer: model_name = settings.emotion_detection.model_name if logger: logger.info(f'Loading HSEmotion model: {model_name}...') @@ -41,7 +41,7 @@ def _init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecogn raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e @staticmethod - def _detect_batch( + def detect_batch( face_images: List[np.ndarray], model: HSEmotionRecognizer, batch_size: int = 32, @@ -76,7 +76,7 @@ def _detect_batch( return results @staticmethod - def _crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: + def crop_face(frame: np.ndarray, bbox: Dict[str, int]) -> Optional[np.ndarray]: try: x1, y1, x2, y2 = (bbox['x1'], bbox['y1'], bbox['x2'], bbox['y2']) height, width = frame.shape[:2] diff --git a/preprocessor/services/video/strategies/base_strategy.py b/preprocessor/services/video/strategies/base_strategy.py index f8e4c6d91..f04e03a2d 100644 --- a/preprocessor/services/video/strategies/base_strategy.py +++ b/preprocessor/services/video/strategies/base_strategy.py @@ -9,10 +9,12 @@ List, ) +from preprocessor.config.types import FrameRequest + class BaseKeyframeStrategy(ABC): @abstractmethod def extract_frame_requests( self, video_path: Path, data: Dict[str, Any], - ) -> List[Dict[str, Any]]: + ) -> List[FrameRequest]: pass diff --git a/preprocessor/services/video/strategies/scene_changes_strategy.py 
b/preprocessor/services/video/strategies/scene_changes_strategy.py index ba84bba9e..9f299de0d 100644 --- a/preprocessor/services/video/strategies/scene_changes_strategy.py +++ b/preprocessor/services/video/strategies/scene_changes_strategy.py @@ -7,6 +7,7 @@ ) from preprocessor.config.enums import FrameType +from preprocessor.config.types import FrameRequest from preprocessor.services.ui.console import console from preprocessor.services.video.strategies.base_strategy import BaseKeyframeStrategy @@ -17,7 +18,7 @@ def __init__(self, frames_per_scene: int) -> None: def extract_frame_requests( self, video_path: Path, data: Dict[str, Any], - ) -> List[Dict[str, Any]]: + ) -> List[FrameRequest]: scenes = self.__extract_scenes(data) if not scenes: console.print('[yellow]No scene timestamps found[/yellow]') @@ -28,15 +29,15 @@ def extract_frame_requests( def __process_all_scenes( self, scenes: List[Dict[str, Any]], fps: float, - ) -> List[Dict[str, Any]]: - frame_requests: List[Dict[str, Any]] = [] + ) -> List[FrameRequest]: + frame_requests: List[FrameRequest] = [] for i, scene in enumerate(scenes): frame_requests.extend(self.__process_single_scene(scene, i, fps)) return frame_requests def __process_single_scene( self, scene: Dict[str, Any], scene_index: int, fps: float, - ) -> List[Dict[str, Any]]: + ) -> List[FrameRequest]: start_frame = scene.get('start', {}).get('frame', 0) frame_count = scene.get('frame_count', 1) @@ -51,8 +52,8 @@ def __process_single_scene( def __generate_multi_frame_requests( self, start_frame: int, frame_count: int, scene_index: int, fps: float, - ) -> List[Dict[str, Any]]: - requests: List[Dict[str, Any]] = [] + ) -> List[FrameRequest]: + requests: List[FrameRequest] = [] for frame_idx in range(self.__frames_per_scene): frame_number = self.__calculate_frame_number( start_frame, frame_count, frame_idx, @@ -93,8 +94,8 @@ def __extract_fps(data: Dict[str, Any]) -> float: @staticmethod def __create_request( frame: int, fps: float, type_name: str, 
scene_num: Optional[int] = None, - ) -> Dict[str, Any]: - req: Dict[str, Any] = { + ) -> FrameRequest: + req: FrameRequest = { 'frame_number': int(frame), 'timestamp': float(frame / fps), 'type': type_name, diff --git a/preprocessor/services/video/strategies/strategy_factory.py b/preprocessor/services/video/strategies/strategy_factory.py index db56415d0..d902c03e0 100644 --- a/preprocessor/services/video/strategies/strategy_factory.py +++ b/preprocessor/services/video/strategies/strategy_factory.py @@ -10,4 +10,5 @@ def create( ) -> BaseKeyframeStrategy: if strategy_type == KeyframeStrategy.SCENE_CHANGES: return SceneChangesStrategy(frames_per_scene=frames_per_scene) - raise ValueError(f'Unknown keyframe strategy: {strategy_type}') + + raise ValueError(f"Unknown strategy type: {strategy_type}") diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index 80eb2ff6f..1e4f1bfbf 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -47,7 +47,8 @@ def execute( total_files=len(video_info), upscaling_percentage=upscaling_pct, ) - def __log_analysis_header(self, context: ExecutionContext) -> None: + @staticmethod + def __log_analysis_header(context: ExecutionContext) -> None: context.logger.info('=' * 80) context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') context.logger.info('=' * 80) @@ -121,7 +122,8 @@ def __log_resolution_distribution( f'({self.__get_resolution_label(target_width, target_height)})', ) - def __log_upscaling_warnings(self, context: ExecutionContext, upscaling_pct: float) -> None: + @staticmethod + def __log_upscaling_warnings(context: ExecutionContext, upscaling_pct: float) -> None: if upscaling_pct > 50: context.logger.warning('') context.logger.warning('⚠' * 30) @@ -139,8 +141,8 @@ def __log_upscaling_warnings(self, context: ExecutionContext, upscaling_pct: flo '(enhanced 
quality params will be used)', ) + @staticmethod def __log_interlacing_analysis( - self, context: ExecutionContext, progressive_count: int, needs_deinterlace_count: int, @@ -158,7 +160,8 @@ def __log_interlacing_analysis( f'({(needs_deinterlace_count / total_episodes) * 100:.1f}%)', ) - def __log_metadata_warnings(self, context: ExecutionContext, mismatch_count: int) -> None: + @staticmethod + def __log_metadata_warnings(context: ExecutionContext, mismatch_count: int) -> None: if mismatch_count > 0: context.logger.warning('') context.logger.warning( diff --git a/preprocessor/steps/search/indexing_step.py b/preprocessor/steps/search/indexing_step.py index d140af358..07ca4dd3c 100644 --- a/preprocessor/steps/search/indexing_step.py +++ b/preprocessor/steps/search/indexing_step.py @@ -149,6 +149,6 @@ def __load_documents_from_paths(paths: List[Path]) -> List[Dict[str, Any]]: @staticmethod def __get_mapping_for_type( - doc_type: str, # pylint: disable=unused-argument + _doc_type: str, ) -> Optional[Dict[str, Any]]: return None diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index e7c782678..94e881996 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -127,7 +127,7 @@ def __save_embedding_results( ) -> None: output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( episode_info=input_data.episode_info, - processing_params=self.config.dict(), + processing_params=self.config.model_dump(), statistics={ 'total_embeddings': len(results), 'embedding_dimension': len(results[0]['embedding']) if results else 0, diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index faf2772a4..f660040d7 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -69,7 +69,7 @@ def __save_detection_results( video_path: Path, output_path: Path, ) -> 
None: - video_info = self.__transnet._TransNetWrapper__get_video_info(video_path) + video_info = self.__transnet.get_video_info(video_path) output_data = self.__build_results_payload(scenes, video_info) FileOperations.atomic_write_json(output_path, output_data) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index c81c68bc7..8b72ee05d 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -1,3 +1,4 @@ +from dataclasses import replace from pathlib import Path from typing import ( Any, @@ -206,21 +207,19 @@ def __execute_ffmpeg_process( temp_path = params.output_path.with_suffix('.mp4.tmp') final_path = params.output_path - params.output_path = temp_path + temp_params = replace(params, output_path=temp_path) context.mark_step_started(self.name, episode_id, [str(temp_path)]) try: - if params.log_command: + if temp_params.log_command: self.__log_ffmpeg_command_header(context) - FFmpegWrapper.transcode(params) + FFmpegWrapper.transcode(temp_params) temp_path.replace(final_path) except BaseException: if temp_path.exists(): temp_path.unlink() raise - finally: - params.output_path = final_path def __construct_result_artifact( self, diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index dad92807f..8338da024 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -103,7 +103,7 @@ def __save_detection_results( output_data: Dict[str, Any] = { 'episode_id': input_data.episode_id, 'series_name': context.series_name, - 'detection_settings': self.config.dict(), + 'detection_settings': self.config.model_dump(), 'statistics': { 'total_frames_processed': len(frame_files), 'frames_with_detections': len(results), diff --git a/preprocessor/steps/vision/emotion_detection_step.py 
b/preprocessor/steps/vision/emotion_detection_step.py index 05370d7a4..80999c2d3 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -60,7 +60,7 @@ def execute(self, input_data: FrameCollection, context: ExecutionContext) -> Emo def __prepare_emotion_model(self, context: ExecutionContext) -> None: if self.__model is None: - self.__model = EmotionDetector._init_model(context.logger) + self.__model = EmotionDetector.init_model(context.logger) def __process_and_update_emotions( self, @@ -79,7 +79,7 @@ def __process_and_update_emotions( return context.logger.info(f'Processing {len(face_crops)} faces with HSEmotion model') - emotion_results = EmotionDetector._detect_batch( + emotion_results = EmotionDetector.detect_batch( face_crops, self.__model, batch_size=32, logger=context.logger, ) @@ -136,7 +136,7 @@ def __collect_face_crops( if not bbox: continue - face_crop = EmotionDetector._crop_face(frame, bbox) + face_crop = EmotionDetector.crop_face(frame, bbox) if face_crop is None: continue From a163ee8fb1f4cfcd535c7c9bb375e080317484dc Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:18:01 +0100 Subject: [PATCH 30/89] Add batch processing and model pool support Introduce episode-level batch processing across the pipeline and a simple ModelPool. PipelineExecutor now decides between sequential or batch execution, marking episodes in-progress/completed and honoring a global disable_parallel flag. PipelineStep gained batch-related hooks (supports_batch_processing, setup_resources, execute_batch, teardown_resources) and helper executors using ThreadPoolExecutor. Many steps were updated to implement batch execution, resource lifecycle, and per-step max_parallel_episodes config. 
Added max_parallel_episodes fields to multiple step configs and added ModelPool to ExecutionContext for shared model management; Elasticsearch indexing was adapted as a global batched step. --- preprocessor/app/pipeline_builder.py | 70 +++++++++++++++++++ preprocessor/config/step_configs.py | 26 +++++-- preprocessor/core/base_step.py | 62 ++++++++++++++++ preprocessor/core/context.py | 12 ++++ preprocessor/core/model_pool.py | 49 +++++++++++++ preprocessor/steps/audio/separation_step.py | 11 +++ preprocessor/steps/packaging/archives_step.py | 12 ++++ .../steps/search/document_generation_step.py | 11 +++ preprocessor/steps/search/indexing_step.py | 35 ++++++++++ preprocessor/steps/text/analysis_step.py | 12 ++++ preprocessor/steps/text/embeddings_step.py | 44 ++++++++++++ preprocessor/steps/text/transcription_step.py | 46 ++++++++++++ .../steps/validation/validator_step.py | 13 ++++ preprocessor/steps/video/frame_export_step.py | 11 +++ .../steps/video/scene_detection_step.py | 40 +++++++++++ preprocessor/steps/video/transcoding_step.py | 12 ++++ .../steps/vision/character_detection_step.py | 50 +++++++++++++ preprocessor/steps/vision/embeddings_step.py | 45 ++++++++++++ .../steps/vision/emotion_detection_step.py | 45 ++++++++++++ .../steps/vision/face_clustering_step.py | 44 ++++++++++++ .../steps/vision/image_hashing_step.py | 11 +++ .../steps/vision/object_detection_step.py | 43 ++++++++++++ 22 files changed, 698 insertions(+), 6 deletions(-) create mode 100644 preprocessor/core/model_pool.py diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index d51a9330e..a7471adad 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -110,6 +110,34 @@ def __run_episode_step( self, step: PipelineStep, current_artifacts: List[Any], ) -> List[Any]: self.__context.logger.info(f"=== Running Step: {step.name} ===") + + if self.__should_use_batch_processing(step): + return 
self.__run_episode_step_batch(step, current_artifacts) + return self.__run_episode_step_sequential(step, current_artifacts) + + def __should_use_batch_processing(self, step: PipelineStep) -> bool: + + if self.__context.disable_parallel: + self.__context.logger.info( + f"Batch processing disabled globally for {step.name}", + ) + return False + + if hasattr(step.config, 'enable_parallel'): + if not step.config.enable_parallel: + self.__context.logger.info( + f"Batch processing disabled by config for {step.name}", + ) + return False + + if not step.supports_batch_processing(): + return False + + return True + + def __run_episode_step_sequential( + self, step: PipelineStep, current_artifacts: List[Any], + ) -> List[Any]: next_artifacts = [] for artifact in current_artifacts: @@ -139,6 +167,48 @@ def __run_episode_step( return next_artifacts + def __run_episode_step_batch( + self, step: PipelineStep, current_artifacts: List[Any], + ) -> List[Any]: + artifacts_to_process = [] + next_artifacts = [] + + for artifact in current_artifacts: + episode_id = artifact.episode_id + if self.__should_skip_step(step.name, episode_id): + self.__context.logger.info( + f"Skipping {step.name} for {episode_id} (already completed)", + ) + next_artifacts.append(artifact) + else: + artifacts_to_process.append(artifact) + + if not artifacts_to_process: + return next_artifacts + + self.__context.logger.info( + f"Processing {len(artifacts_to_process)} episodes with batch processing", + ) + + try: + if hasattr(step, 'setup_resources'): + step.setup_resources(self.__context) + + for artifact in artifacts_to_process: + self.__mark_step_in_progress(step.name, artifact.episode_id) + + results = step.execute_batch(artifacts_to_process, self.__context) + + for artifact, result in zip(artifacts_to_process, results): + self.__mark_step_completed(step.name, artifact.episode_id) + next_artifacts.append(result or artifact) + + return next_artifacts + + finally: + if hasattr(step, 'teardown_resources'): + 
step.teardown_resources(self.__context) + def __mark_step_completed(self, step_name: str, episode_id: str) -> None: if self.__context.state_manager is None: return diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 850d6e8e2..c77369bf0 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -21,6 +21,7 @@ class TranscodeConfig(BaseModel): bitrate_reference_seconds: float = Field(gt=0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) + max_parallel_episodes: int = Field(default=3, ge=1, le=10) resolution: Resolution = Field(default=Resolution.R720P) @property @@ -52,6 +53,7 @@ def calculate_bufsize_mbps(self, multiplier: float = 2.0) -> float: class SceneDetectionConfig(BaseModel): + max_parallel_episodes: int = Field(default=4, ge=1, le=8) min_scene_len: int = Field(default=10, ge=1) threshold: float = Field(default=0.5, ge=0, le=1) @@ -61,11 +63,13 @@ class FrameExportConfig(BaseModel): frames_per_scene: int = Field(default=3, ge=1) keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES + max_parallel_episodes: int = Field(default=4, ge=1, le=8) resolution: Resolution = Field(default=Resolution.R720P) class TranscriptionConfig(BaseModel): language: str = 'pl' + max_parallel_episodes: int = Field(default=2, ge=1, le=4) model: str = 'large-v3' output_formats: List[str] = ['json', 'srt', 'txt'] @@ -74,17 +78,20 @@ class WhisperTranscriptionConfig(BaseModel): beam_size: int = Field(default=10, ge=1) device: str = 'cuda' language: str = 'pl' + max_parallel_episodes: int = Field(default=2, ge=1, le=4) model: str = 'large-v3-turbo' temperature: float = Field(default=0.0, ge=0.0, le=1.0) class TextAnalysisConfig(BaseModel): language: str = 'pl' + max_parallel_episodes: int = Field(default=8, ge=1, le=16) class TextEmbeddingConfig(BaseModel): batch_size: int = Field(default=8, ge=1) device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, 
ge=1, le=2) model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' text_chunk_overlap: int = Field(default=1, ge=0) text_sentences_per_chunk: int = Field(default=5, ge=1) @@ -93,19 +100,22 @@ class TextEmbeddingConfig(BaseModel): class VideoEmbeddingConfig(BaseModel): batch_size: int = Field(default=8, ge=1) device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' class SoundSeparationConfig(BaseModel): - pass + max_parallel_episodes: int = Field(default=4, ge=1, le=8) class DocumentGenerationConfig(BaseModel): generate_segments: bool = True + max_parallel_episodes: int = Field(default=8, ge=1, le=16) class ImageHashConfig(BaseModel): batch_size: int = Field(default=32, ge=1) + max_parallel_episodes: int = Field(default=2, ge=1, le=4) class TranscriptionImportConfig(BaseModel): @@ -118,35 +128,38 @@ class ElasticsearchConfig(BaseModel): dry_run: bool = False host: str = 'localhost:9200' index_name: str + max_parallel_episodes: int = Field(default=4, ge=1, le=8) class AudioExtractionConfig(BaseModel): - pass + max_parallel_episodes: int = Field(default=4, ge=1, le=8) class CharacterDetectionConfig(BaseModel): + max_parallel_episodes: int = Field(default=2, ge=1, le=4) threshold: float = Field(default=0.7, ge=0.0, le=1.0) class EmotionDetectionConfig(BaseModel): - pass + max_parallel_episodes: int = Field(default=2, ge=1, le=4) class FaceClusteringConfig(BaseModel): - pass + max_parallel_episodes: int = Field(default=4, ge=1, le=8) class ObjectDetectionConfig(BaseModel): - pass + max_parallel_episodes: int = Field(default=2, ge=1, le=4) class ArchiveConfig(BaseModel): - pass + max_parallel_episodes: int = Field(default=4, ge=1, le=8) class ValidationConfig(BaseModel): anomaly_threshold: float = 20.0 episodes_info_json: Optional[Path] = None + max_parallel_episodes: int = Field(default=8, ge=1, le=16) class EpisodeScraperConfig(BaseModel): @@ -169,5 +182,6 @@ class CharacterScraperConfig(BaseModel): class 
CharacterReferenceConfig(BaseModel): characters_file: str images_per_character: int = Field(default=5, ge=1, le=20) + max_parallel_episodes: int = Field(default=4, ge=1, le=8) output_dir: str search_engine: str = "duckduckgo" diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 2828cf3c1..6e9f1744c 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -2,10 +2,16 @@ ABC, abstractmethod, ) +from concurrent.futures import ( + ThreadPoolExecutor, + as_completed, +) from pathlib import Path from typing import ( TYPE_CHECKING, + Callable, Generic, + List, TypeVar, ) @@ -40,6 +46,21 @@ def is_global(self) -> bool: def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: pass + @property + def supports_batch_processing(self) -> bool: + return False + + def setup_resources(self, context: "ExecutionContext") -> None: + pass + + def execute_batch( + self, input_data: List[InputT], context: "ExecutionContext", + ) -> List[OutputT]: + return [self.execute(item, context) for item in input_data] + + def teardown_resources(self, context: "ExecutionContext") -> None: + pass + def cleanup(self) -> None: pass @@ -55,3 +76,44 @@ def _check_cache_validity( context.logger.info(f'Skipping {episode_id} ({cache_description})') return True return False + + @staticmethod + def _execute_with_threadpool( + input_data: List[InputT], + context: "ExecutionContext", + max_workers: int, + executor_fn: Callable[[InputT, "ExecutionContext"], OutputT], + ) -> List[OutputT]: + context.logger.info( + f"Batch processing {len(input_data)} episodes with {max_workers} workers", + ) + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(executor_fn, artifact, context): artifact + for artifact in input_data + } + + results = [] + for future in as_completed(futures): + result = future.result() + results.append(result) + + return results + + @staticmethod + def _execute_sequential( + 
input_data: List[InputT], + context: "ExecutionContext", + executor_fn: Callable[[InputT, "ExecutionContext"], OutputT], + ) -> List[OutputT]: + context.logger.info( + f"Batch processing {len(input_data)} episodes sequentially", + ) + + results = [] + for artifact in input_data: + result = executor_fn(artifact, context) + results.append(result) + + return results diff --git a/preprocessor/core/context.py b/preprocessor/core/context.py index de44c53db..c6eb7a487 100644 --- a/preprocessor/core/context.py +++ b/preprocessor/core/context.py @@ -7,6 +7,7 @@ from preprocessor.config.config import Settings from preprocessor.config.settings_factory import SettingsFactory +from preprocessor.core.model_pool import ModelPool from preprocessor.services.core.logging import ErrorHandlingLogger if TYPE_CHECKING: @@ -22,14 +23,21 @@ def __init__( logger: ErrorHandlingLogger, state_manager: Optional['StateManager'] = None, force_rerun: bool = False, + disable_parallel: bool = False, settings_instance: Optional[Settings] = None, ) -> None: self.__series_name: str = series_name self.__base_output_dir: Path = base_output_dir / series_name self.__state_manager: Optional['StateManager'] = state_manager self.__force_rerun: bool = force_rerun + self.__disable_parallel: bool = disable_parallel self.__logger: ErrorHandlingLogger = logger self.__settings: Settings = settings_instance or SettingsFactory.get_settings() + self.__model_pool: ModelPool = ModelPool() + + @property + def disable_parallel(self) -> bool: + return self.__disable_parallel @property def force_rerun(self) -> bool: @@ -39,6 +47,10 @@ def force_rerun(self) -> bool: def logger(self) -> ErrorHandlingLogger: return self.__logger + @property + def model_pool(self) -> ModelPool: + return self.__model_pool + @property def series_name(self) -> str: return self.__series_name diff --git a/preprocessor/core/model_pool.py b/preprocessor/core/model_pool.py new file mode 100644 index 000000000..c7e6932cf --- /dev/null +++ 
b/preprocessor/core/model_pool.py @@ -0,0 +1,49 @@ +import threading +from typing import ( + Any, + Callable, + Dict, + Optional, +) + +from preprocessor.services.core.logging import ErrorHandlingLogger + + +class ModelPool: + def __init__(self) -> None: + self._models: Dict[str, Any] = {} + self._lock = threading.Lock() + self._ref_counts: Dict[str, int] = {} + + def get_or_load( + self, + model_id: str, + loader: Callable[[], Any], + logger: Optional[ErrorHandlingLogger] = None, + ) -> Any: + with self._lock: + if model_id not in self._models: + if logger: + logger.info(f"Loading model to pool: {model_id}") + self._models[model_id] = loader() + self._ref_counts[model_id] = 0 + + self._ref_counts[model_id] += 1 + return self._models[model_id] + + def release(self, model_id: str, logger: Optional[ErrorHandlingLogger] = None) -> None: + with self._lock: + if model_id in self._ref_counts: + self._ref_counts[model_id] -= 1 + if self._ref_counts[model_id] <= 0: + if logger: + logger.info(f"Removing model from pool: {model_id}") + del self._models[model_id] + del self._ref_counts[model_id] + + def cleanup_all(self, logger: Optional[ErrorHandlingLogger] = None) -> None: + with self._lock: + if logger and self._models: + logger.info(f"Cleaning up {len(self._models)} models from pool") + self._models.clear() + self._ref_counts.clear() diff --git a/preprocessor/steps/audio/separation_step.py b/preprocessor/steps/audio/separation_step.py index f98eed5d3..267e4215a 100644 --- a/preprocessor/steps/audio/separation_step.py +++ b/preprocessor/steps/audio/separation_step.py @@ -32,6 +32,17 @@ class SoundSeparationStep(PipelineStep[TranscriptionData, TranscriptionData, Sou def name(self) -> str: return 'sound_separation' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[TranscriptionData]: + return self._execute_with_threadpool( + input_data, context, 
self.config.max_parallel_episodes, self.execute, + ) + def execute( self, input_data: TranscriptionData, diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index 66116088b..cb9a7a85c 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import List from preprocessor.config.step_configs import ArchiveConfig from preprocessor.core.artifacts import ( @@ -14,6 +15,17 @@ class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, Arch def name(self) -> str: return 'archive_generation' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[ProcessedEpisode], context: ExecutionContext, + ) -> List[ArchiveArtifact]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + def execute( self, input_data: ProcessedEpisode, context: ExecutionContext, ) -> ArchiveArtifact: diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 6615465cf..5defd8475 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -22,6 +22,17 @@ class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGen def name(self) -> str: return 'document_generation' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[Artifact], context: ExecutionContext, + ) -> List[ElasticDocuments]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + def execute( self, input_data: Artifact, context: ExecutionContext, ) -> ElasticDocuments: diff --git a/preprocessor/steps/search/indexing_step.py 
b/preprocessor/steps/search/indexing_step.py index 07ca4dd3c..564b222c3 100644 --- a/preprocessor/steps/search/indexing_step.py +++ b/preprocessor/steps/search/indexing_step.py @@ -27,6 +27,41 @@ def __init__(self, config: ElasticsearchConfig) -> None: def name(self) -> str: return 'elasticsearch_indexing' + @property + def is_global(self) -> bool: + """Indexing is a global step - processes all episodes at once.""" + return True + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__es is None: + context.logger.info(f'Initializing Elasticsearch client: {self.config.host}') + self.__es = ElasticsearchWrapper( + host=self.config.host, + index_name=self.config.index_name, + ) + + def execute_batch( + self, input_data: List[List[ElasticDocuments]], context: ExecutionContext, + ) -> List[IndexingResult]: + context.logger.info(f"Batch indexing {len(input_data)} document collections") + + results = [] + for docs in input_data: + result = asyncio.run(self.__execute_async(docs, context)) + results.append(result) + + return results + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__es: + asyncio.run(self.__es.close()) + self.__es = None + context.logger.info('Elasticsearch client closed') + def cleanup(self) -> None: if self.__es: asyncio.run(self.__es.close()) diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py index 51ce5f10d..6d31c6655 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -3,6 +3,7 @@ from typing import ( Any, Dict, + List, ) from preprocessor.config.step_configs import TextAnalysisConfig @@ -21,6 +22,17 @@ class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, Text def name(self) -> str: return 'text_analysis' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, 
input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[TextAnalysisResults]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + def execute( self, input_data: TranscriptionData, context: ExecutionContext, ) -> TextAnalysisResults: diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index 94e881996..87d3af87f 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -29,6 +29,29 @@ def __init__(self, config: TextEmbeddingConfig) -> None: def name(self) -> str: return 'text_embedding' + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading VLLM embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + self.config.batch_size, + ) + + def execute_batch( + self, input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.__execute_single) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model = None + context.logger.info('VLLM embedding model unloaded') + def cleanup(self) -> None: if self.__model: self.__model = None @@ -57,6 +80,27 @@ def execute( context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_embedding_collection(input_data, output_path, len(results)) + def __execute_single( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self.__resolve_output_path(input_data, context) + + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached text embeddings'): + return self.__load_cached_result(output_path, 
input_data) + + segments = self.__extract_valid_segments(input_data, context) + if not segments: + return self.__construct_embedding_collection(input_data, output_path, 0) + + context.logger.info(f'Generating text embeddings for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + results = self.__process_text_embeddings(segments) + self.__save_embedding_results(results, output_path, input_data) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_embedding_collection(input_data, output_path, len(results)) + def __prepare_embedding_model(self) -> None: if self.__model is None: self.__model = EmbeddingModelWrapper( diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index e8467da95..335014493 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -2,6 +2,7 @@ from typing import ( Any, Dict, + List, Optional, ) @@ -26,6 +27,33 @@ def __init__(self, config: WhisperTranscriptionConfig) -> None: def name(self) -> str: return 'transcription' + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__whisper is None: + context.logger.info(f'Loading Whisper model: {self.config.model}') + self.__whisper = Whisper( + model=self.config.model, + language=self.config.language, + device=self.config.device, + beam_size=self.config.beam_size, + ) + + def execute_batch( + self, input_data: List[AudioArtifact], context: ExecutionContext, + ) -> List[TranscriptionData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.__execute_single, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__whisper: + self.__whisper.cleanup() + self.__whisper = None + context.logger.info('Whisper model unloaded') + def cleanup(self) -> 
None: if self.__whisper: self.__whisper.cleanup() @@ -48,6 +76,24 @@ def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> Trans context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_result_artifact(output_path, input_data, result) + def __execute_single( + self, input_data: AudioArtifact, context: ExecutionContext, + ) -> TranscriptionData: + output_path = self.__resolve_output_path(input_data, context) + + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): + return self.__construct_cached_result(output_path, input_data) + + context.logger.info( + f'Transcribing {input_data.episode_id} using Whisper {self.config.model}', + ) + context.mark_step_started(self.name, input_data.episode_id) + + result = self.__process_audio_transcription(input_data, output_path, context) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_result_artifact(output_path, input_data, result) + def __prepare_whisper_model(self) -> None: if self.__whisper is None: self.__whisper = Whisper( diff --git a/preprocessor/steps/validation/validator_step.py b/preprocessor/steps/validation/validator_step.py index 114ec0867..d909ae115 100644 --- a/preprocessor/steps/validation/validator_step.py +++ b/preprocessor/steps/validation/validator_step.py @@ -1,3 +1,5 @@ +from typing import List + from preprocessor.config.step_configs import ValidationConfig from preprocessor.core.artifacts import ( ElasticDocuments, @@ -13,6 +15,17 @@ class ValidationStep(PipelineStep[ElasticDocuments, ValidationResult, Validation def name(self) -> str: return "validate" + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[ElasticDocuments], context: ExecutionContext, + ) -> List[ValidationResult]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + def 
execute( self, input_data: ElasticDocuments, diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 031eb868d..12bae66e4 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -37,6 +37,17 @@ def __init__(self, config: FrameExportConfig) -> None: def name(self) -> str: return 'frame_export' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[SceneCollection], context: ExecutionContext, + ) -> List[FrameCollection]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + def execute( self, input_data: SceneCollection, context: ExecutionContext, ) -> FrameCollection: diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index f660040d7..e05864c5c 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -26,6 +26,29 @@ def __init__(self, config: SceneDetectionConfig) -> None: def name(self) -> str: return 'scene_detection' + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if not self.__model_loaded: + context.logger.info('Loading TransNetV2 model...') + self.__transnet.load_model() + self.__model_loaded = True + + def execute_batch( + self, input_data: List[TranscodedVideo], context: ExecutionContext, + ) -> List[SceneCollection]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.__execute_single, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model_loaded: + self.__transnet.cleanup() + self.__model_loaded = False + context.logger.info('TransNetV2 model unloaded') + def cleanup(self) -> None: if self.__model_loaded: 
self.__transnet.cleanup() @@ -50,6 +73,23 @@ def execute( context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_scene_collection(output_path, input_data, scenes) + def __execute_single( + self, input_data: TranscodedVideo, context: ExecutionContext, + ) -> SceneCollection: + output_path = self.__resolve_output_path(input_data, context) + + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): + return self.__load_cached_result(output_path, input_data) + + context.logger.info(f'Detecting scenes in {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + scenes = self.__detect_scenes(input_data.path) + self.__save_detection_results(scenes, input_data.path, output_path) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_scene_collection(output_path, input_data, scenes) + def __prepare_detection_environment(self, context: ExecutionContext) -> None: if not self.__model_loaded: context.logger.info('Loading TransNetV2 model...') diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 8b72ee05d..42b5c1820 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -3,6 +3,7 @@ from typing import ( Any, Dict, + List, Tuple, ) @@ -24,6 +25,17 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeCo def name(self) -> str: return 'video_transcode' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[SourceVideo], context: ExecutionContext, + ) -> List[TranscodedVideo]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> TranscodedVideo: diff --git a/preprocessor/steps/vision/character_detection_step.py 
b/preprocessor/steps/vision/character_detection_step.py index 8338da024..cb33252a5 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -32,6 +32,56 @@ def cleanup(self) -> None: self.__face_app = None self.__character_vectors = {} + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__face_app is None: + context.logger.info('Loading Face Detection model...') + self.__face_app = FaceDetector.init() + self.__load_character_references(context) + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[DetectionResults]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.__execute_single, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__face_app: + context.logger.info('Face Detection model unloaded') + self.__face_app = None + self.__character_vectors = {} + + def __execute_single( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> DetectionResults: + output_path = self.__resolve_output_path(input_data, context) + + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached character detections'): + return self.__load_cached_result(output_path, input_data) + + context.logger.info(f'Detecting characters in {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + frame_files = self.__extract_frame_files(input_data) + if not frame_files: + return self.__construct_empty_result(output_path, input_data, context) + + results = self.__process_character_detection(frame_files) + self.__save_detection_results(results, output_path, input_data, context, frame_files) + + context.mark_step_completed(self.name, input_data.episode_id) + return DetectionResults( + episode_id=input_data.episode_id, + 
episode_info=input_data.episode_info, + path=output_path, + detection_type='character', + detection_count=len(results), + ) + def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> DetectionResults: diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index fe72e061c..3b8809f03 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -29,6 +29,27 @@ def __init__(self, config: VideoEmbeddingConfig) -> None: def name(self) -> str: return 'video_embedding' + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading VLLM embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper(self.config.model_name, self.config.device) + self.__model.load_model() + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.__execute_single) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model.cleanup() + self.__model = None + context.logger.info('VLLM embedding model unloaded') + def cleanup(self) -> None: if self.__model: self.__model.cleanup() @@ -59,6 +80,30 @@ def execute( context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_embedding_collection(input_data, output_path, len(results)) + def __execute_single( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self.__resolve_output_path(input_data, context) + + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached video embeddings'): + return self.__load_cached_result(output_path, input_data) + + frame_requests = self.__extract_frame_requests(input_data, 
context) + if not frame_requests: + return self.__construct_embedding_collection(input_data, output_path, 0) + + context.logger.info( + f'Generating video embeddings for {len(frame_requests)} frames in {input_data.episode_id}', + ) + context.mark_step_started(self.name, input_data.episode_id) + + image_hashes = self.__fetch_image_hashes(input_data, context) + results = self.__generate_embeddings(frame_requests, input_data, image_hashes) + self.__save_embedding_results(results, output_path, input_data, image_hashes) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_embedding_collection(input_data, output_path, len(results)) + def __prepare_embedding_model(self, context: ExecutionContext) -> None: if self.__model is None: context.logger.info('Initializing embedding model...') diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index 80999c2d3..618829de4 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -34,6 +34,51 @@ def name(self) -> str: def cleanup(self) -> None: self.__model = None + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info('Loading HSEmotion model...') + self.__model = EmotionDetector.init_model(context.logger) + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[EmotionData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.__execute_single, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + context.logger.info('HSEmotion model unloaded') + self.__model = None + + def __execute_single( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> EmotionData: + detections_path = 
self.__resolve_detections_path(input_data, context) + + if self._check_cache_validity(detections_path, context, input_data.episode_id, 'cached emotion detection'): + return self.__construct_emotion_data(input_data, detections_path) + + if not detections_path.exists(): + context.logger.warning( + f'No character detections found for emotion analysis: {detections_path}', + ) + return self.__construct_emotion_data(input_data, detections_path) + + context.logger.info(f'Detecting emotions for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + detections_data = FileOperations.load_json(detections_path) + self.__process_and_update_emotions(detections_data, input_data, context) + FileOperations.atomic_write_json(detections_path, detections_data) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_emotion_data(input_data, detections_path) + def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData: detections_path = self.__resolve_detections_path(input_data, context) diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py index 304abafb4..1559957fd 100644 --- a/preprocessor/steps/vision/face_clustering_step.py +++ b/preprocessor/steps/vision/face_clustering_step.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import List from preprocessor.config.step_configs import FaceClusteringConfig from preprocessor.core.artifacts import ( @@ -10,10 +11,53 @@ class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): + def __init__(self, config: FaceClusteringConfig) -> None: + super().__init__(config) + self.__model = None + @property def name(self) -> str: return 'face_clustering' + def cleanup(self) -> None: + self.__model = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if 
self.__model is None: + context.logger.info('Loading Face Clustering model...') + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[ClusterData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.__execute_single, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + context.logger.info('Face Clustering model unloaded') + self.__model = None + + def __execute_single( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> ClusterData: + """Execute single episode (batch processing variant without lazy loading).""" + output_path = self.__resolve_output_path(input_data, context) + + if self.__is_execution_cached(output_path, input_data.episode_id, context): + context.logger.info(f'Skipping {input_data.episode_id} (cached face clustering)') + return self.__construct_cluster_data(input_data, output_path) + + context.logger.info(f'Clustering faces for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_cluster_data(input_data, output_path) + def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> ClusterData: diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index 326b52459..895733ebd 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -32,6 +32,17 @@ def __init__(self, config: ImageHashConfig) -> None: def name(self) -> str: return 'image_hashing' + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[ImageHashCollection]: + return self._execute_with_threadpool( + input_data, context, 
self.config.max_parallel_episodes, self.execute, + ) + def cleanup(self) -> None: self.__hasher = None self.__cleanup_memory() diff --git a/preprocessor/steps/vision/object_detection_step.py b/preprocessor/steps/vision/object_detection_step.py index 2f974cb97..b88d6c9a5 100644 --- a/preprocessor/steps/vision/object_detection_step.py +++ b/preprocessor/steps/vision/object_detection_step.py @@ -1,4 +1,5 @@ from pathlib import Path +from typing import List from preprocessor.config.step_configs import ObjectDetectionConfig from preprocessor.core.artifacts import ( @@ -10,10 +11,52 @@ class ObjectDetectionStep(PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig]): + def __init__(self, config: ObjectDetectionConfig) -> None: + super().__init__(config) + self.__model = None + @property def name(self) -> str: return 'object_detection' + def cleanup(self) -> None: + self.__model = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info('Loading Object Detection model...') + + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[ObjectDetectionData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.__execute_single, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + context.logger.info('Object Detection model unloaded') + self.__model = None + + def __execute_single( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> ObjectDetectionData: + output_path = self.__resolve_output_path(input_data, context) + + if self.__is_execution_cached(output_path, input_data.episode_id, context): + context.logger.info(f'Skipping {input_data.episode_id} (cached object detection)') + return self.__construct_object_data(input_data, output_path) + + 
context.logger.info(f'Detecting objects for {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_object_data(input_data, output_path) + def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> ObjectDetectionData: From 1860252fe1d07a2f6fb3c2569544b7d10c9d4830 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:22:36 +0100 Subject: [PATCH 31/89] Use attribute and fix DDGS import Access supports_batch_processing as an attribute (supports_batch_processing) in PipelineExecutor instead of calling it as a method, avoiding an incorrect function call. Also update the DuckDuckGo image search import to use the ddgs package (from ddgs import DDGS) to match the installed module name and prevent import errors. --- preprocessor/app/pipeline_builder.py | 2 +- .../services/characters/image_search/duckduckgo_image_search.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index a7471adad..65bd0472b 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -130,7 +130,7 @@ def __should_use_batch_processing(self, step: PipelineStep) -> bool: ) return False - if not step.supports_batch_processing(): + if not step.supports_batch_processing: return False return True diff --git a/preprocessor/services/characters/image_search/duckduckgo_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_image_search.py index c4e74f15e..e9363c915 100644 --- a/preprocessor/services/characters/image_search/duckduckgo_image_search.py +++ b/preprocessor/services/characters/image_search/duckduckgo_image_search.py @@ -3,7 +3,7 @@ List, ) -from duckduckgo_search import DDGS +from ddgs import DDGS from 
preprocessor.services.characters.image_search.image_search import BaseImageSearch From 5924d49cadf6126f2a7f5841be811c87af774238 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 15:24:10 +0100 Subject: [PATCH 32/89] Use config.video_bitrate_mbps property Replace call to calculate_video_bitrate_mbps() with direct access to config.video_bitrate_mbps in VideoTranscoderStep. This aligns with the updated config API (bitrate exposed as an attribute) while leaving minrate/maxrate/bufsize calculations unchanged. --- preprocessor/steps/video/transcoding_step.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 42b5c1820..c8d408ade 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -123,7 +123,7 @@ def __compute_scaled_bitrate( is_upscaling: bool, ) -> Tuple[float, float, float, float]: source_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) - target_bitrate = self.config.calculate_video_bitrate_mbps() + target_bitrate = self.config.video_bitrate_mbps minrate = self.config.calculate_minrate_mbps() maxrate = self.config.calculate_maxrate_mbps() bufsize = self.config.calculate_bufsize_mbps() From c677d6ecf4130bce5e84cb72cd8c3617d51fe071 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 16:19:21 +0100 Subject: [PATCH 33/89] Add thread-safety and multi in-progress state Make StateManager and ErrorHandlingLogger thread-safe by adding threading.Lock and guarding state mutations and logging calls. Change ProcessingState.in_progress from a single optional InProgressStep to a list of InProgressStep and update serialization/deserialization accordingly. 
Update mark_step_started to append new in-progress entries and mark_step_completed to move matching in-progress entries to completed_steps and remove them from the in-progress list. Adjust cleanup to acquire the lock before removing the state file. Modify logger methods to acquire a lock and update finalize to print error panel and return the configured error exit code when errors occurred. Add necessary threading imports. --- preprocessor/core/state_manager.py | 67 ++++++++++++++++----------- preprocessor/services/core/logging.py | 51 +++++++++++--------- 2 files changed, 68 insertions(+), 50 deletions(-) diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 9c2023f05..52373f50c 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -6,6 +6,7 @@ from datetime import datetime import json from pathlib import Path +import threading from typing import ( Any, Dict, @@ -37,7 +38,7 @@ class ProcessingState: series_name: str started_at: str completed_steps: List[StepCheckpoint] = field(default_factory=list) - in_progress: Optional[InProgressStep] = None + in_progress: List[InProgressStep] = field(default_factory=list) def to_dict(self) -> Dict[str, Any]: return { @@ -45,7 +46,7 @@ def to_dict(self) -> Dict[str, Any]: 'started_at': self.started_at, 'last_checkpoint': self.last_checkpoint, 'completed_steps': [asdict(step) for step in self.completed_steps], - 'in_progress': asdict(self.in_progress) if self.in_progress else None, + 'in_progress': [asdict(step) for step in self.in_progress], } @classmethod @@ -53,9 +54,11 @@ def from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': completed_steps = [ StepCheckpoint(**step) for step in data.get('completed_steps', []) ] - in_progress_data = data.get('in_progress') + in_progress_data = data.get('in_progress', []) in_progress = ( - InProgressStep(**in_progress_data) if in_progress_data else None + [InProgressStep(**step) for step in in_progress_data] + 
if isinstance(in_progress_data, list) + else [] ) return cls( @@ -69,6 +72,7 @@ def from_dict(cls, data: Dict[str, Any]) -> 'ProcessingState': class StateManager: __STATE_FILE_TEMPLATE: str = '.preprocessing_state_{series}.json' + __lock = threading.Lock() def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: self.__series_name = series_name @@ -78,9 +82,10 @@ def __init__(self, series_name: str, working_dir: Path = Path('.')) -> None: self.__state: Optional[ProcessingState] = None def cleanup(self) -> None: - if self.__state_file.exists(): - console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') - self.__state_file.unlink() + with self.__lock: + if self.__state_file.exists(): + console.print(f'[blue]Cleaning up state file: {self.__state_file}[/blue]') + self.__state_file.unlink() def is_step_completed(self, step: str, episode: str) -> bool: if self.__state is None: @@ -97,34 +102,40 @@ def load_or_create_state(self) -> ProcessingState: return self.__create_new_state() def mark_step_completed(self, step: str, episode: str) -> None: - self.__ensure_state_initialized() + with self.__lock: + self.__ensure_state_initialized() - checkpoint = StepCheckpoint( - step=step, - episode=episode, - completed_at=datetime.now().isoformat(), - ) + checkpoint = StepCheckpoint( + step=step, + episode=episode, + completed_at=datetime.now().isoformat(), + ) - self.__state.completed_steps.append(checkpoint) - self.__state.in_progress = None - self.__save_state() + self.__state.completed_steps.append(checkpoint) + self.__state.in_progress = [ + s for s in self.__state.in_progress + if not (s.step == step and s.episode == episode) + ] + self.__save_state() - console.print(f'[green]Completed: {step} for {episode}[/green]') + console.print(f'[green]Completed: {step} for {episode}[/green]') def mark_step_started( self, step: str, episode: str, temp_files: Optional[List[str]] = None, ) -> None: - self.__ensure_state_initialized() - - 
self.__state.in_progress = InProgressStep( - step=step, - episode=episode, - started_at=datetime.now().isoformat(), - temp_files=temp_files or [], - ) - self.__save_state() - - console.print(f'[cyan]Started: {step} for {episode}[/cyan]') + with self.__lock: + self.__ensure_state_initialized() + + in_progress_step = InProgressStep( + step=step, + episode=episode, + started_at=datetime.now().isoformat(), + temp_files=temp_files or [], + ) + self.__state.in_progress.append(in_progress_step) + self.__save_state() + + console.print(f'[cyan]Started: {step} for {episode}[/cyan]') def __load_existing_state(self) -> ProcessingState: console.print(f'[yellow]Found existing state file: {self.__state_file}[/yellow]') diff --git a/preprocessor/services/core/logging.py b/preprocessor/services/core/logging.py index 9eedd0bb1..98f53d112 100644 --- a/preprocessor/services/core/logging.py +++ b/preprocessor/services/core/logging.py @@ -1,4 +1,5 @@ import logging +import threading from typing import List from rich.logging import RichHandler @@ -18,6 +19,7 @@ class ErrorHandlingLogger: WARNING = 30 ERROR = 40 CRITICAL = 50 + __lock = threading.Lock() def __init__(self, class_name: str, loglevel: int, error_exit_code: int) -> None: self.__class_name = class_name @@ -38,41 +40,46 @@ def __del__(self) -> None: raise LoggerNotFinalizedException() def debug(self, message: str) -> None: - self.__logger.debug(message) + with self.__lock: + self.__logger.debug(message) def info(self, message: str) -> None: - self.__logger.info(message) + with self.__lock: + self.__logger.info(message) def warning(self, message: str) -> None: - self.__logger.warning(message) + with self.__lock: + self.__logger.warning(message) def error(self, message: str) -> None: - self.__logger.error(message) - self.__errors.append(message) + with self.__lock: + self.__logger.error(message) + self.__errors.append(message) def finalize(self) -> int: - self.__is_finalized = True + with self.__lock: + self.__is_finalized = True 
+ + if self.__errors: + console.print( + Panel( + f"[bold red]Processing for '{self.__class_name}' " + f"completed with {len(self.__errors)} error(s)[/bold red]", + title='Errors Occurred', + border_style='red', + ), + ) + return self.__error_exit_code - if self.__errors: console.print( Panel( - f"[bold red]Processing for '{self.__class_name}' " - f"completed with {len(self.__errors)} error(s)[/bold red]", - title='Errors Occurred', - border_style='red', + f"[bold green]Processing for '{self.__class_name}' " + "completed successfully[/bold green]", + title='Success', + border_style='green', ), ) - return self.__error_exit_code - - console.print( - Panel( - f"[bold green]Processing for '{self.__class_name}' " - "completed successfully[/bold green]", - title='Success', - border_style='green', - ), - ) - return 0 + return 0 def __setup_logger(self, level: int) -> logging.Logger: logging.basicConfig( From 98c71147800796aa6d3fabac9e5ff00eb82239fa Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:50:03 +0100 Subject: [PATCH 34/89] Refine bitrate scaling; lower parallel episodes Reduce TranscodeConfig.max_parallel_episodes default from 3 to 2. Replace fixed exponent bitrate scaling with a dynamic algorithm: import math and add helper methods (__calculate_scaling_exponent, __apply_bitrate_limits, __get_scaling_direction) to compute an exponent based on log10(pixel_ratio), handle upscaling/downscaling differently, clamp scaled bitrate to the target, and update ratio/log output accordingly. This makes bitrate decisions more adaptive to resolution changes while ensuring the final bitrate does not exceed the target. 
--- preprocessor/config/step_configs.py | 2 +- preprocessor/steps/video/transcoding_step.py | 40 ++++++++++++++++---- 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index c77369bf0..efa61bc4a 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -21,7 +21,7 @@ class TranscodeConfig(BaseModel): bitrate_reference_seconds: float = Field(gt=0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) - max_parallel_episodes: int = Field(default=3, ge=1, le=10) + max_parallel_episodes: int = Field(default=2, ge=1, le=10) resolution: Resolution = Field(default=Resolution.R720P) @property diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index c8d408ade..0ce407194 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -1,4 +1,5 @@ from dataclasses import replace +import math from pathlib import Path from typing import ( Any, @@ -135,27 +136,52 @@ def __compute_scaled_bitrate( return target_bitrate, minrate, maxrate, bufsize pixel_ratio = target_pixels / source_pixels - scaled_bitrate = source_bitrate * (pixel_ratio ** 0.7) + exponent = self.__calculate_scaling_exponent(pixel_ratio, is_upscaling) + scaled_bitrate = source_bitrate * (pixel_ratio ** exponent) + clamped_bitrate = self.__apply_bitrate_limits(scaled_bitrate, target_bitrate) - final_bitrate = min(scaled_bitrate, target_bitrate) - ratio = final_bitrate / target_bitrate + ratio = clamped_bitrate / target_bitrate + direction = self.__get_scaling_direction(pixel_ratio, is_upscaling) - direction = 'upscaling' if is_upscaling else 'downscaling' if pixel_ratio < 1.0 else 'same resolution' context.logger.info( f'Bitrate calculation ({direction}): ' f'source {source_bitrate:.2f} Mbps @ {source_pixels:,}px → ' f'scaled {scaled_bitrate:.2f} Mbps @ 
{target_pixels:,}px ' - f'(pixel_ratio {pixel_ratio:.2f}, exponent 0.7) → ' - f'final {final_bitrate:.2f} Mbps (capped to target {target_bitrate} Mbps)', + f'(pixel_ratio {pixel_ratio:.2f}, exponent {exponent:.1f}) → ' + f'final {clamped_bitrate:.2f} Mbps (capped to target {target_bitrate} Mbps)', ) return ( - final_bitrate, + clamped_bitrate, round(minrate * ratio, 2), round(maxrate * ratio, 2), round(bufsize * ratio, 2), ) + @staticmethod + def __calculate_scaling_exponent(pixel_ratio: float, is_upscaling: bool) -> float: + safe_ratio = max(pixel_ratio, 0.01) + log_ratio = math.log10(safe_ratio) + + if is_upscaling: + capped_log = min(log_ratio, 1.0) + return 0.8 + capped_log * 0.35 + + capped_log = max(log_ratio, -2.0) + return 0.8 + capped_log * 0.175 + + @staticmethod + def __apply_bitrate_limits(scaled_bitrate: float, target_bitrate: float) -> float: + return min(scaled_bitrate, target_bitrate) + + @staticmethod + def __get_scaling_direction(pixel_ratio: float, is_upscaling: bool) -> str: + if is_upscaling: + return 'upscaling' + if pixel_ratio < 1.0: + return 'downscaling' + return 'same resolution' + def __compute_audio_bitrate( self, probe_data: Dict[str, Any], From e02b0fd1e53a233409978d8088f4f9475cd63b9d Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 20:52:06 +0100 Subject: [PATCH 35/89] Increase default max_parallel_episodes to 3 Update TranscodeConfig in preprocessor/config/step_configs.py to raise the default max_parallel_episodes from 2 to 3, allowing one additional concurrent episode transcode while keeping the existing bounds (1-10). No other behavior was changed. 
--- preprocessor/config/step_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index efa61bc4a..c77369bf0 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -21,7 +21,7 @@ class TranscodeConfig(BaseModel): bitrate_reference_seconds: float = Field(gt=0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) - max_parallel_episodes: int = Field(default=2, ge=1, le=10) + max_parallel_episodes: int = Field(default=3, ge=1, le=10) resolution: Resolution = Field(default=Resolution.R720P) @property From 2f114ba88438fcee87234ba94594648bbc96b071 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Feb 2026 22:35:08 +0100 Subject: [PATCH 36/89] Parallelize video resolution scanning Convert __scan_resolutions into an instance method that dispatches per-file probing to a thread pool using self._execute_with_threadpool and self.config.max_parallel_episodes. Extract per-video logic into a new helper __scan_single_video that returns a dict or None on error, and filter out failures. Preserves logging, interlace detection, and metadata-vs-reality checks while isolating errors per-file and improving performance via parallel execution. 
--- .../analysis/resolution_analysis_step.py | 80 ++++++++++--------- 1 file changed, 41 insertions(+), 39 deletions(-) diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index 1e4f1bfbf..3e8dd59b7 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -302,53 +302,55 @@ def __find_video_files(context: ExecutionContext) -> List[Path]: return sorted(video_files) - @staticmethod def __scan_resolutions( - video_paths: List[Path], context: ExecutionContext, + self, video_paths: List[Path], context: ExecutionContext, ) -> List[Dict[str, Any]]: - video_info = [] + max_workers = self.config.max_parallel_episodes + results = self._execute_with_threadpool( + video_paths, context, max_workers, self.__scan_single_video, + ) + return [r for r in results if r is not None] - for video_path in video_paths: - try: - probe_data = FFmpegWrapper.probe_video(video_path) - width, height = FFmpegWrapper.get_resolution(probe_data) - sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) - field_order = FFmpegWrapper.get_field_order(probe_data) + @staticmethod + def __scan_single_video(video_path: Path, context: ExecutionContext) -> Optional[Dict[str, Any]]: + try: + probe_data = FFmpegWrapper.probe_video(video_path) + width, height = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + field_order = FFmpegWrapper.get_field_order(probe_data) - effective_width = int(width * sar_num / sar_denom) + effective_width = int(width * sar_num / sar_denom) - context.logger.info( - f'Analyzing interlacing for {video_path.name} ' - f'(field_order={field_order}, analyzing full video)...', - ) - has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing( - video_path, analysis_time=None, - ) + context.logger.info( + f'Analyzing interlacing for {video_path.name} ' + 
f'(field_order={field_order}, analyzing full video)...', + ) + has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing( + video_path, analysis_time=None, + ) + + metadata_vs_reality = ResolutionAnalysisStep.__validate_field_order( + field_order, has_interlacing, idet_stats, + ) - metadata_vs_reality = ResolutionAnalysisStep.__validate_field_order( - field_order, has_interlacing, idet_stats, + if metadata_vs_reality != 'match': + context.logger.warning( + f'⚠ {video_path.name}: field_order={field_order} but idet says {metadata_vs_reality}!', ) - if metadata_vs_reality != 'match': - context.logger.warning( - f'⚠ {video_path.name}: field_order={field_order} but idet says {metadata_vs_reality}!', - ) - - video_info.append({ - 'filename': video_path.name, - 'width': effective_width, - 'height': height, - 'field_order': field_order, - 'needs_deinterlace': has_interlacing, - 'idet_stats': idet_stats, - 'metadata_match': metadata_vs_reality, - }) - - except Exception as e: - context.logger.warning(f'Failed to probe {video_path.name}: {e}') - continue - - return video_info + return { + 'filename': video_path.name, + 'width': effective_width, + 'height': height, + 'field_order': field_order, + 'needs_deinterlace': has_interlacing, + 'idet_stats': idet_stats, + 'metadata_match': metadata_vs_reality, + } + + except Exception as e: + context.logger.warning(f'Failed to probe {video_path.name}: {e}') + return None @staticmethod def __validate_field_order( From 320a9d8a3c8da6dd620aed5c0789f3a61ffdbfa1 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sat, 14 Feb 2026 12:09:50 +0100 Subject: [PATCH 37/89] Refactor transcode bitrates & add resolution check Replace legacy bitrate_reference fields with max_bitrate_file_size_mb, max_bitrate_duration_seconds and min_upscale_bitrate_ratio in SeriesConfig, defaults, and step configs. Add a ResolutionAnalysisConfig and wire resolution analysis into the pipeline. 
Add FFmpegWrapper.get_video_codec. Major refactor of VideoTranscoderStep: introduce codec-efficiency multipliers, centralize bitrate calculations into __compute_all_bitrate_settings, enforce minimum upscale bitrate ratio, simplify interlacing detection/logging, and clean up threadpool, path/result construction, and logging behavior. --- preprocessor/app/pipeline_factory.py | 12 +- preprocessor/config/series_config.py | 10 +- preprocessor/config/step_configs.py | 14 +- preprocessor/config/step_defaults.py | 5 +- preprocessor/series_configs/defaults.json | 5 +- preprocessor/services/media/ffmpeg.py | 7 + .../analysis/resolution_analysis_step.py | 7 +- preprocessor/steps/video/transcoding_step.py | 415 +++++++----------- 8 files changed, 191 insertions(+), 284 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 992c48116..90dcd8444 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -20,6 +20,7 @@ FrameExportConfig, ImageHashConfig, ObjectDetectionConfig, + ResolutionAnalysisConfig, SceneDetectionConfig, SoundSeparationConfig, TextAnalysisConfig, @@ -103,11 +104,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t description="Analyze source video resolutions and warn if upscaling required", produces=[], needs=[], - config=TranscodeConfig( - bitrate_reference_mb=series_config.processing.transcode.bitrate_reference_mb, - bitrate_reference_seconds=series_config.processing.transcode.bitrate_reference_seconds, - keyframe_interval_seconds=series_config.processing.transcode.keyframe_interval_seconds, - force_deinterlace=series_config.processing.transcode.force_deinterlace, + config=ResolutionAnalysisConfig( resolution=Resolution.from_string(series_config.processing.transcode.resolution), ), ) @@ -120,9 +117,10 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t 
produces=["transcoded_videos/{season}/{episode}.mp4"], needs=[resolution_analysis], config=TranscodeConfig( - bitrate_reference_mb=series_config.processing.transcode.bitrate_reference_mb, - bitrate_reference_seconds=series_config.processing.transcode.bitrate_reference_seconds, + max_bitrate_file_size_mb=series_config.processing.transcode.max_bitrate_file_size_mb, + max_bitrate_duration_seconds=series_config.processing.transcode.max_bitrate_duration_seconds, keyframe_interval_seconds=series_config.processing.transcode.keyframe_interval_seconds, + min_upscale_bitrate_ratio=series_config.processing.transcode.min_upscale_bitrate_ratio, force_deinterlace=series_config.processing.transcode.force_deinterlace, ), ) diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 822293fc3..1d0c98919 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -55,10 +55,11 @@ class TranscriptionProcessingConfig: @dataclass class TranscodeProcessingConfig: - bitrate_reference_mb: float - bitrate_reference_seconds: float force_deinterlace: bool keyframe_interval_seconds: float + max_bitrate_duration_seconds: float + max_bitrate_file_size_mb: float + min_upscale_bitrate_ratio: float resolution: str @@ -149,8 +150,9 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': device=data['processing']['transcription']['device'], ), transcode=TranscodeProcessingConfig( - bitrate_reference_mb=data['processing']['transcode']['bitrate_reference_mb'], - bitrate_reference_seconds=data['processing']['transcode']['bitrate_reference_seconds'], + max_bitrate_file_size_mb=data['processing']['transcode']['max_bitrate_file_size_mb'], + max_bitrate_duration_seconds=data['processing']['transcode']['max_bitrate_duration_seconds'], + min_upscale_bitrate_ratio=data['processing']['transcode']['min_upscale_bitrate_ratio'], force_deinterlace=data['processing']['transcode']['force_deinterlace'], 
keyframe_interval_seconds=data['processing']['transcode']['keyframe_interval_seconds'], resolution=data['processing']['transcode']['resolution'], diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index c77369bf0..49b9abb42 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -17,11 +17,12 @@ class TranscodeConfig(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - bitrate_reference_mb: float = Field(gt=0) - bitrate_reference_seconds: float = Field(gt=0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) + max_bitrate_duration_seconds: float = Field(gt=0) + max_bitrate_file_size_mb: float = Field(gt=0) max_parallel_episodes: int = Field(default=3, ge=1, le=10) + min_upscale_bitrate_ratio: float = Field(default=0.52, ge=0, le=1) resolution: Resolution = Field(default=Resolution.R720P) @property @@ -38,7 +39,7 @@ def preset(self) -> str: @property def video_bitrate_mbps(self) -> float: - total = (self.bitrate_reference_mb * 8) / self.bitrate_reference_seconds + total = (self.max_bitrate_file_size_mb * 8) / self.max_bitrate_duration_seconds audio = self.audio_bitrate_kbps / 1000.0 return round(total - audio, 2) @@ -52,6 +53,13 @@ def calculate_bufsize_mbps(self, multiplier: float = 2.0) -> float: return round(self.video_bitrate_mbps * multiplier, 2) +class ResolutionAnalysisConfig(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + + max_parallel_episodes: int = Field(default=10, ge=1, le=20) + resolution: Resolution = Field(default=Resolution.R720P) + + class SceneDetectionConfig(BaseModel): max_parallel_episodes: int = Field(default=4, ge=1, le=8) min_scene_len: int = Field(default=10, ge=1) diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index d5490e817..0563f5da3 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -25,9 +25,10 @@ class 
DefaultConfigFactory: def get_configs(series_name: str) -> Dict[str, object]: return { 'transcode': TranscodeConfig( - bitrate_reference_mb=50.0, - bitrate_reference_seconds=100.0, + max_bitrate_file_size_mb=50.0, + max_bitrate_duration_seconds=100.0, keyframe_interval_seconds=0.5, + min_upscale_bitrate_ratio=0.52, ), 'transcribe': WhisperTranscriptionConfig( model='large-v3-turbo', diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index f22e0f78e..12eca464c 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -18,10 +18,11 @@ "threshold": 0.5 }, "transcode": { - "bitrate_reference_mb": 50.0, - "bitrate_reference_seconds": 100.0, "force_deinterlace": false, "keyframe_interval_seconds": 0.5, + "max_bitrate_duration_seconds": 100.0, + "max_bitrate_file_size_mb": 50.0, + "min_upscale_bitrate_ratio": 0.52, "resolution": "720p" }, "transcription": { diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 6b1830ab7..6520393fe 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -146,6 +146,13 @@ def get_field_order(probe_data: Dict[str, Any]) -> str: return 'unknown' return stream.get('field_order', 'unknown') + @staticmethod + def get_video_codec(probe_data: Dict[str, Any]) -> str: + stream = FFmpegWrapper.__get_stream_by_type(probe_data, 'video') + if not stream: + return 'h264' + return stream.get('codec_name', 'h264').lower() + @staticmethod def probe_video(video_path: Path) -> Dict[str, Any]: cmd = [ diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index 3e8dd59b7..d6f0dd54b 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -9,7 +9,7 @@ Optional, ) -from preprocessor.config.step_configs import TranscodeConfig +from 
preprocessor.config.step_configs import ResolutionAnalysisConfig from preprocessor.core.artifacts import ResolutionAnalysisResult from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext @@ -17,7 +17,7 @@ from preprocessor.services.media.ffmpeg import FFmpegWrapper -class ResolutionAnalysisStep(PipelineStep[None, ResolutionAnalysisResult, TranscodeConfig]): +class ResolutionAnalysisStep(PipelineStep[None, ResolutionAnalysisResult, ResolutionAnalysisConfig]): @property def name(self) -> str: return 'resolution_analysis' @@ -305,9 +305,8 @@ def __find_video_files(context: ExecutionContext) -> List[Path]: def __scan_resolutions( self, video_paths: List[Path], context: ExecutionContext, ) -> List[Dict[str, Any]]: - max_workers = self.config.max_parallel_episodes results = self._execute_with_threadpool( - video_paths, context, max_workers, self.__scan_single_video, + video_paths, context, self.config.max_parallel_episodes, self.__scan_single_video, ) return [r for r in results if r is not None] diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 0ce407194..fe95e9d2c 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -20,6 +20,11 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): + __CODEC_EFFICIENCY = { + 'h264': 1.0, 'avc': 1.0, + 'hevc': 2.0, 'h265': 2.0, + 'vp9': 2.85, 'av1': 4.0, + } __command_logged = False @property @@ -30,16 +35,12 @@ def name(self) -> str: def supports_batch_processing(self) -> bool: return True - def execute_batch( - self, input_data: List[SourceVideo], context: ExecutionContext, - ) -> List[TranscodedVideo]: + def execute_batch(self, input_data: List[SourceVideo], context: ExecutionContext) -> List[TranscodedVideo]: return self._execute_with_threadpool( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def 
execute( - self, input_data: SourceVideo, context: ExecutionContext, - ) -> TranscodedVideo: + def execute(self, input_data: SourceVideo, context: ExecutionContext) -> TranscodedVideo: output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'output exists'): @@ -55,22 +56,12 @@ def execute( return self.__construct_result_artifact(output_path, input_data) def __create_transcode_params( - self, - input_data: SourceVideo, - output_path: Path, - probe_data: Dict[str, Any], - context: ExecutionContext, + self, input_data: SourceVideo, output_path: Path, probe_data: Dict[str, Any], context: ExecutionContext, ) -> TranscodeParams: - target_fps = self.__resolve_target_framerate(probe_data, context) - is_upscaling, source_pixels, target_pixels = self.__analyze_resolution_scaling(probe_data) + target_fps = self.__resolve_target_framerate() + is_upscaling, src_px, target_px = self.__analyze_resolution_scaling(probe_data) - v_bitrate, v_min, v_max, v_buf = self.__compute_video_bitrate_settings( - probe_data, context, is_upscaling, source_pixels, target_pixels, - ) - - audio_bitrate = self.__compute_audio_bitrate(probe_data, context) - deinterlace = self.__resolve_deinterlacing_strategy(input_data, context, probe_data) - log_cmd = self.__should_log_command() + bitrates = self.__compute_all_bitrate_settings(probe_data, context, is_upscaling, src_px, target_px) return TranscodeParams( input_path=input_data.path, @@ -78,285 +69,185 @@ def __create_transcode_params( codec=self.config.codec, preset=self.config.preset, resolution=f'{self.config.resolution.width}:{self.config.resolution.height}', - video_bitrate=f'{v_bitrate}M', - minrate=f'{v_min}M', - maxrate=f'{v_max}M', - bufsize=f'{v_buf}M', - audio_bitrate=f'{audio_bitrate}k', + video_bitrate=f'{bitrates["video"]}M', + minrate=f'{bitrates["min"]}M', + maxrate=f'{bitrates["max"]}M', + bufsize=f'{bitrates["buf"]}M', + 
audio_bitrate=f'{self.__compute_audio_bitrate(probe_data, context)}k', gop_size=int(target_fps * self.config.keyframe_interval_seconds), target_fps=target_fps, - deinterlace=deinterlace, + deinterlace=self.__resolve_deinterlacing_strategy(input_data, context, probe_data), is_upscaling=is_upscaling, - log_command=log_cmd, + log_command=self.__should_log_command(), ) - def __analyze_resolution_scaling( - self, - probe_data: Dict[str, Any], - ) -> Tuple[bool, int, int]: - source_width, source_height = FFmpegWrapper.get_resolution(probe_data) + def __analyze_resolution_scaling(self, probe_data: Dict[str, Any]) -> Tuple[bool, int, int]: + w, h = FFmpegWrapper.get_resolution(probe_data) sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) - effective_width = int(source_width * sar_num / sar_denom) - - source_pixels = effective_width * source_height - target_pixels = self.config.resolution.width * self.config.resolution.height - - return source_pixels < target_pixels, source_pixels, target_pixels - - def __compute_video_bitrate_settings( - self, - probe_data: Dict[str, Any], - context: ExecutionContext, - is_upscaling: bool, - source_pixels: int, - target_pixels: int, - ) -> Tuple[float, float, float, float]: - return self.__compute_scaled_bitrate( - probe_data, source_pixels, target_pixels, context, is_upscaling, - ) - def __compute_scaled_bitrate( - self, - probe_data: Dict[str, Any], - source_pixels: int, - target_pixels: int, - context: ExecutionContext, - is_upscaling: bool, - ) -> Tuple[float, float, float, float]: - source_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) - target_bitrate = self.config.video_bitrate_mbps - minrate = self.config.calculate_minrate_mbps() - maxrate = self.config.calculate_maxrate_mbps() - bufsize = self.config.calculate_bufsize_mbps() - - if not source_bitrate: - context.logger.warning( - f'Cannot detect source bitrate. 
Using target bitrate ({target_bitrate} Mbps).', - ) - return target_bitrate, minrate, maxrate, bufsize - - pixel_ratio = target_pixels / source_pixels - exponent = self.__calculate_scaling_exponent(pixel_ratio, is_upscaling) - scaled_bitrate = source_bitrate * (pixel_ratio ** exponent) - clamped_bitrate = self.__apply_bitrate_limits(scaled_bitrate, target_bitrate) - - ratio = clamped_bitrate / target_bitrate - direction = self.__get_scaling_direction(pixel_ratio, is_upscaling) - - context.logger.info( - f'Bitrate calculation ({direction}): ' - f'source {source_bitrate:.2f} Mbps @ {source_pixels:,}px → ' - f'scaled {scaled_bitrate:.2f} Mbps @ {target_pixels:,}px ' - f'(pixel_ratio {pixel_ratio:.2f}, exponent {exponent:.1f}) → ' - f'final {clamped_bitrate:.2f} Mbps (capped to target {target_bitrate} Mbps)', - ) + eff_w = int(w * sar_num / sar_denom) + src_px = eff_w * h + target_px = self.config.resolution.width * self.config.resolution.height - return ( - clamped_bitrate, - round(minrate * ratio, 2), - round(maxrate * ratio, 2), - round(bufsize * ratio, 2), - ) + return src_px < target_px, src_px, target_px - @staticmethod - def __calculate_scaling_exponent(pixel_ratio: float, is_upscaling: bool) -> float: - safe_ratio = max(pixel_ratio, 0.01) - log_ratio = math.log10(safe_ratio) + def __compute_all_bitrate_settings( + self, probe_data: Dict[str, Any], context: ExecutionContext, + is_up: bool, src_px: int, target_px: int, + ) -> Dict[str, float]: + src_v = FFmpegWrapper.get_video_bitrate(probe_data) + target_max = self.config.video_bitrate_mbps - if is_upscaling: - capped_log = min(log_ratio, 1.0) - return 0.8 + capped_log * 0.35 + if not src_v: + return self.__build_fallback_bitrates(target_max) - capped_log = max(log_ratio, -2.0) - return 0.8 + capped_log * 0.175 + norm_v = self.__get_normalized_bitrate(src_v, probe_data, is_up, context) + ratio = target_px / src_px + exp = self.__calculate_scaling_exponent(ratio, is_up) - @staticmethod - def 
__apply_bitrate_limits(scaled_bitrate: float, target_bitrate: float) -> float: - return min(scaled_bitrate, target_bitrate) + scaled_raw = norm_v * (ratio ** exp) + scaled_min = self.__apply_min_upscale_constraint(scaled_raw, target_max, is_up) + final_v = min(scaled_min, target_max) - @staticmethod - def __get_scaling_direction(pixel_ratio: float, is_upscaling: bool) -> str: - if is_upscaling: - return 'upscaling' - if pixel_ratio < 1.0: - return 'downscaling' - return 'same resolution' - - def __compute_audio_bitrate( - self, - probe_data: Dict[str, Any], - context: ExecutionContext, - ) -> int: - input_audio = FFmpegWrapper.get_audio_bitrate(probe_data) - target_audio = self.config.audio_bitrate_kbps - - if input_audio and input_audio < target_audio: - adjusted = min(int(input_audio * 1.05), target_audio) + self.__log_bitrate_workflow( + context, src_v, norm_v, scaled_raw, scaled_min, final_v, target_max, ratio, is_up, + ) + + return self.__scale_bitrate_limits(final_v / target_max) + + def __get_normalized_bitrate( + self, src_v: float, probe: Dict[str, Any], is_up: bool, + context: ExecutionContext, + ) -> float: + if not is_up: + return src_v + + src_codec = self.__normalize_codec_name(FFmpegWrapper.get_video_codec(probe)) + tgt_codec = self.__normalize_codec_name(self.config.codec) + mult = self.__get_codec_efficiency_multiplier(src_codec, tgt_codec) + + if mult != 1.0: + norm = src_v * mult context.logger.info( - f'Input audio ({input_audio} kbps) < target. 
Adjusted to {adjusted} kbps.', + f'Codec: {src_codec.upper()}->{tgt_codec.upper()} ({mult}x) | {src_v:.2f}->{norm:.2f} Mbps', ) - return adjusted - return target_audio + return norm + return src_v + + def __apply_min_upscale_constraint(self, scaled: float, target_max: float, is_up: bool) -> float: + if not is_up: + return scaled + return max(scaled, target_max * self.config.min_upscale_bitrate_ratio) + + def __scale_bitrate_limits(self, scale: float) -> Dict[str, float]: + return { + "video": round(self.config.video_bitrate_mbps * scale, 2), + "min": round(self.config.calculate_minrate_mbps() * scale, 2), + "max": round(self.config.calculate_maxrate_mbps() * scale, 2), + "buf": round(self.config.calculate_bufsize_mbps() * scale, 2), + } + + def __build_fallback_bitrates(self, target_max: float) -> Dict[str, float]: + return { + "video": target_max, + "min": self.config.calculate_minrate_mbps(), + "max": self.config.calculate_maxrate_mbps(), + "buf": self.config.calculate_bufsize_mbps(), + } + + @staticmethod + def __calculate_scaling_exponent(ratio: float, is_up: bool) -> float: + log_r = math.log10(max(ratio, 0.01)) + if is_up: + return 0.8 + min(log_r, 1.0) * 0.35 + return 0.8 + max(log_r, -2.0) * 0.175 def __resolve_deinterlacing_strategy( - self, - input_data: SourceVideo, - context: ExecutionContext, - probe_data: Dict[str, Any], + self, input_data: SourceVideo, context: ExecutionContext, + probe: Dict[str, Any], ) -> bool: if self.config.force_deinterlace: - context.logger.info(f"Force deinterlacing enabled for {input_data.episode_id}") return True - - return self.__detect_and_verify_interlacing(input_data, context, probe_data) - - def __log_execution_details( - self, - context: ExecutionContext, - input_data: SourceVideo, - params: TranscodeParams, - probe_data: Dict[str, Any], - ) -> None: - source_w, source_h = FFmpegWrapper.get_resolution(probe_data) - upscale_msg = "UPSCALING DETECTED" if params.is_upscaling else "No upscaling" - - context.logger.info( - 
f'{input_data.episode_id}: Source {source_w}x{source_h} → ' - f'Target {self.config.resolution.width}x{self.config.resolution.height} - {upscale_msg}', - ) - self.__log_static_transcode_info(context, params.audio_bitrate) - context.logger.info(f'Transcoding {input_data.episode_id}') - - def __log_transcode_details( - self, - context: ExecutionContext, - input_data: SourceVideo, - params: TranscodeParams, - probe_data: Dict[str, Any], - ) -> None: - self.__log_execution_details(context, input_data, params, probe_data) - - def __execute_ffmpeg_process( - self, - context: ExecutionContext, - params: TranscodeParams, - episode_id: str, - ) -> None: - temp_path = params.output_path.with_suffix('.mp4.tmp') - final_path = params.output_path - - temp_params = replace(params, output_path=temp_path) - context.mark_step_started(self.name, episode_id, [str(temp_path)]) - + has_int, stats = FFmpegWrapper.detect_interlacing(input_data.path) + if not stats: + return False + self.__log_int_diagnostics(context, has_int, stats, FFmpegWrapper.get_field_order(probe)) + return has_int + + def __compute_audio_bitrate(self, probe: Dict[str, Any], context: ExecutionContext) -> int: + src_a = FFmpegWrapper.get_audio_bitrate(probe) + tgt_a = self.config.audio_bitrate_kbps + if src_a and src_a < tgt_a: + adj = min(int(src_a * 1.05), tgt_a) + context.logger.info(f'Audio boost: {src_a} -> {adj} kbps') + return adj + return tgt_a + + def __execute_ffmpeg_process(self, context: ExecutionContext, params: TranscodeParams, ep_id: str) -> None: + temp = params.output_path.with_suffix('.mp4.tmp') + t_params = replace(params, output_path=temp) + context.mark_step_started(self.name, ep_id, [str(temp)]) try: - if temp_params.log_command: - self.__log_ffmpeg_command_header(context) - - FFmpegWrapper.transcode(temp_params) - temp_path.replace(final_path) + if t_params.log_command: + context.logger.info('=' * 20 + ' FFmpeg ' + '=' * 20) + FFmpegWrapper.transcode(t_params) + temp.replace(params.output_path) 
except BaseException: - if temp_path.exists(): - temp_path.unlink() + if temp.exists(): + temp.unlink() raise - def __construct_result_artifact( - self, - output_path: Path, - input_data: SourceVideo, - ) -> TranscodedVideo: - return TranscodedVideo( - path=output_path, - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', - codec=self.config.codec, - ) - @staticmethod - def __should_log_command() -> bool: - if not VideoTranscoderStep.__command_logged: - VideoTranscoderStep.__command_logged = True - return True - return False + def __normalize_codec_name(codec: str) -> str: + name = codec.lower() + mapping = {'h264': ('h264', 'avc'), 'hevc': ('h265', 'hevc'), 'vp9': ('vp9',), 'av1': ('av1',)} + for norm, patterns in mapping.items(): + if any(p in name for p in patterns): + return norm + return 'h264' @staticmethod - def __resolve_output_path( - input_data: SourceVideo, - context: ExecutionContext, - ) -> Path: - filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' - return context.get_season_output_path( - input_data.episode_info, 'transcoded_videos', filename, + def __get_codec_efficiency_multiplier(src: str, tgt: str) -> float: + return VideoTranscoderStep.__CODEC_EFFICIENCY.get(src, 1.0) / VideoTranscoderStep.__CODEC_EFFICIENCY.get( + tgt, + 1.0, ) @staticmethod - def __resolve_target_framerate( - probe_data: Dict[str, Any], - context: ExecutionContext, - ) -> float: - input_fps = FFmpegWrapper.get_framerate(probe_data) - target_fps = 24.0 + def __resolve_output_path(input_data: SourceVideo, context: ExecutionContext) -> Path: + filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' + return context.get_season_output_path(input_data.episode_info, 'transcoded_videos', filename) - if input_fps != target_fps: - context.logger.info( - f'Input FPS ({input_fps:.2f}) → forcing {target_fps} FPS for consistency.', - ) 
- return target_fps + def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> TranscodedVideo: + return TranscodedVideo( + path=path, episode_id=input_data.episode_id, episode_info=input_data.episode_info, + resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', codec=self.config.codec, + ) @staticmethod - def __detect_and_verify_interlacing( - input_data: SourceVideo, - context: ExecutionContext, - probe_data: Dict[str, Any], - ) -> bool: - context.logger.info(f"Detecting interlacing for {input_data.episode_id}...") - has_interlacing, idet_stats = FFmpegWrapper.detect_interlacing(input_data.path) - field_order = FFmpegWrapper.get_field_order(probe_data) - - if not idet_stats: - context.logger.error( - f"Failed to detect interlacing for {input_data.episode_id}. Proceeding without deinterlace.", - ) - return False - - VideoTranscoderStep.__log_interlacing_diagnostics(context, has_interlacing, idet_stats, field_order) - return has_interlacing + def __log_bitrate_workflow(ctx, src, norm, raw, s_min, final, limit, ratio, is_up): + dir_label = "upscaling" if is_up else ("downscaling" if ratio < 1.0 else "same") + min_msg = f' (MinBoost: {s_min:.2f})' if is_up and (s_min > raw) else '' + ctx.logger.info(f'[{dir_label}] {src:.2f}->{norm:.2f}->{raw:.2f}{min_msg} -> {final:.2f} Mbps (Max: {limit})') @staticmethod - def __log_interlacing_diagnostics( - context: ExecutionContext, - has_interlacing: bool, - idet_stats: Dict[str, Any], - field_order: str, - ) -> None: - meta_progressive = field_order in {'progressive', 'unknown'} - idet_progressive = not has_interlacing - - if meta_progressive != idet_progressive: - context.logger.warning( - f"⚠ Conflict: Metadata says {field_order}, idet says " - f"{'interlaced' if has_interlacing else 'progressive'}. 
Using idet result.", - ) + def __log_transcode_details(ctx, input_data, params, probe): + w, h = FFmpegWrapper.get_resolution(probe) + ctx.logger.info( + f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{"UP" if params.is_upscaling else "DOWN"}]', + ) - if has_interlacing: - context.logger.info( - f"Interlacing detected ({idet_stats['ratio'] * 100:.1f}%). Applying bwdif.", - ) - else: - context.logger.info("Progressive content detected. No deinterlacing needed.") + @staticmethod + def __log_int_diagnostics(ctx, has_int, stats, order): + ctx.logger.info(f"Interlacing: {has_int} ({stats['ratio'] * 100:.1f}%) | {order}") @staticmethod - def __log_static_transcode_info(context: ExecutionContext, audio_bitrate: str) -> None: - context.logger.info( - 'Video: SAR 1:1, timebase 1/90000, colorspace bt709, ' - 'closed GOP=12 frames with IDR keyframes.', - ) - context.logger.info( - f'Audio: AAC {audio_bitrate}, 2 channels, 48 kHz (forced).', - ) + def __resolve_target_framerate() -> float: + return 24.0 @staticmethod - def __log_ffmpeg_command_header(context: ExecutionContext) -> None: - context.logger.info('=' * 80) - context.logger.info('FFmpeg command example (showing once):') - context.logger.info('=' * 80) + def __should_log_command() -> bool: + if not VideoTranscoderStep.__command_logged: + VideoTranscoderStep.__command_logged = True + return True + return False From 4588e3762c1a03b0831961cb124385879ab9ceef Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Feb 2026 19:38:00 +0100 Subject: [PATCH 38/89] Add OutputDescriptor system and refactor steps Introduce a unified OutputDescriptor API (FileOutput, DirectoryOutput, JsonFileOutput, GlobalOutput) and a StepTempFile context manager to centralize output path resolution, validation and atomic writes. 
Refactor StepBuilder to accept concrete step_class (auto-generate step id when missing), return output descriptors, and validate dependency outputs instead of using module strings. Update PipelineFactory to import concrete step classes and declare outputs via descriptors (removing many hardcoded output paths from configs), and adjust PipelineExecutor/PipelineStep/ExecutionContext to use the new descriptor-based resolution and caching helpers. Also extract EpisodeInfo to a dedicated types module, make GeminiClient configurable (model, base_url, api_key), rename progress tracker usage, tweak CLI/pre-commit settings and various small compatibility/import fixes. These changes centralize output handling, improve validation/caching, and remove fragile string-based module/path wiring. --- .pre-commit-config.yaml | 2 +- preprocessor/app/pipeline.py | 7 +- preprocessor/app/pipeline_builder.py | 10 +- preprocessor/app/pipeline_factory.py | 247 +++++++++++---- preprocessor/app/step_builder.py | 77 +++-- preprocessor/cli/cli_main.py | 4 +- preprocessor/cli/search_handler.py | 3 +- preprocessor/config/config.py | 4 +- preprocessor/config/step_configs.py | 4 - preprocessor/core/artifacts.py | 8 +- preprocessor/core/base_step.py | 121 +++++++- preprocessor/core/context.py | 23 +- preprocessor/core/output_descriptors.py | 285 ++++++++++++++++++ preprocessor/core/temp_files.py | 37 +++ preprocessor/services/ai/clients.py | 25 +- preprocessor/services/core/base_processor.py | 4 +- preprocessor/services/episodes/__init__.py | 6 +- .../services/episodes/episode_manager.py | 25 +- preprocessor/services/episodes/types.py | 25 ++ preprocessor/services/io/files.py | 18 +- preprocessor/services/io/path_service.py | 15 +- .../services/scraping/base_scraper_step.py | 8 +- .../services/scraping/episode_scraper.py | 1 - .../services/search/embedding_model.py | 14 +- preprocessor/services/text/import_step.py | 19 +- preprocessor/services/text/text_statistics.py | 4 +- 
preprocessor/services/ui/__init__.py | 7 +- preprocessor/services/ui/console.py | 2 +- preprocessor/services/ui/progress.py | 7 - .../services/validation/episode_stats.py | 33 +- .../validation/validators/base_validator.py | 21 +- .../validators/character_validator.py | 8 +- .../validators/elastic_validator.py | 25 +- .../validators/face_cluster_validator.py | 11 +- .../validation/validators/frame_validator.py | 13 +- .../validators/image_hash_validator.py | 8 +- .../validation/validators/object_validator.py | 12 +- .../validation/validators/scene_validator.py | 15 +- .../validators/transcription_validator.py | 21 +- .../validators/validation_helpers.py | 13 +- .../validation/validators/video_validator.py | 13 +- preprocessor/services/video/image_hasher.py | 10 - .../analysis/resolution_analysis_step.py | 94 +++--- preprocessor/steps/audio/separation_step.py | 31 +- preprocessor/steps/packaging/archives_step.py | 23 +- .../scraping/reference_processor_step.py | 10 +- .../steps/search/document_generation_step.py | 68 +++-- preprocessor/steps/text/analysis_step.py | 30 +- preprocessor/steps/text/embeddings_step.py | 39 ++- preprocessor/steps/text/transcription_step.py | 41 ++- preprocessor/steps/video/frame_export_step.py | 24 +- .../steps/video/scene_detection_step.py | 25 +- preprocessor/steps/video/transcoding_step.py | 41 ++- .../steps/vision/character_detection_step.py | 29 +- preprocessor/steps/vision/embeddings_step.py | 43 ++- .../steps/vision/emotion_detection_step.py | 81 +++-- .../steps/vision/face_clustering_step.py | 43 +-- .../steps/vision/image_hashing_step.py | 30 +- .../steps/vision/object_detection_step.py | 43 +-- 59 files changed, 1355 insertions(+), 555 deletions(-) create mode 100644 preprocessor/core/output_descriptors.py create mode 100644 preprocessor/core/temp_files.py create mode 100644 preprocessor/services/episodes/types.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f5b66a90e..5fd7cc6cf 100644 --- 
a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,5 @@ fail_fast: false -exclude: '^(bot/RANCZO-WIDEO/|bot/RANCZO-TRANSKRYPCJE/)' +exclude: '^(bot/RANCZO-WIDEO/|bot/RANCZO-TRANSKRYPCJE/|scripts/)' repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 diff --git a/preprocessor/app/pipeline.py b/preprocessor/app/pipeline.py index f7211c111..84e01a72e 100644 --- a/preprocessor/app/pipeline.py +++ b/preprocessor/app/pipeline.py @@ -1,5 +1,6 @@ +from __future__ import annotations + from typing import ( - TYPE_CHECKING, Dict, List, Optional, @@ -9,9 +10,7 @@ import networkx as nx from preprocessor.app.step_builder import StepBuilder - -if TYPE_CHECKING: - from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.core.logging import ErrorHandlingLogger class PipelineDefinition: diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 65bd0472b..f9d976774 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -1,19 +1,18 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, Any, List, ) +from preprocessor.app.pipeline import PipelineDefinition from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.services.episodes.episode_manager import EpisodeManager from preprocessor.services.video.discovery import VideoDiscovery -if TYPE_CHECKING: - from preprocessor.app.pipeline import PipelineDefinition - class PipelineExecutor: def __init__(self, context: ExecutionContext) -> None: @@ -43,8 +42,7 @@ def execute_step( self.__context.logger.info(f"Step: {step_id}") self.__context.logger.info(f"{step_def.description}") - step_class = step_def.load_class() - instance = step_class(step_def.config) + instance = step_def.step_class(step_def.config) runner = 
PipelineExecutor(self.__context) runner.add_step(instance) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 90dcd8444..b41b86229 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -30,7 +30,34 @@ VideoEmbeddingConfig, WhisperTranscriptionConfig, ) +from preprocessor.core.output_descriptors import ( + DirectoryOutput, + FileOutput, + JsonFileOutput, + create_frames_output, +) from preprocessor.services.media.resolution import Resolution +from preprocessor.steps.analysis.resolution_analysis_step import ResolutionAnalysisStep +from preprocessor.steps.audio.separation_step import SoundSeparationStep +from preprocessor.steps.packaging.archives_step import ArchiveGenerationStep +from preprocessor.steps.scraping.character_scraper_step import CharacterScraperStep +from preprocessor.steps.scraping.episode_scraper_step import EpisodeScraperStep +from preprocessor.steps.scraping.reference_processor_step import CharacterReferenceStep +from preprocessor.steps.search.document_generation_step import DocumentGeneratorStep +from preprocessor.steps.search.indexing_step import ElasticsearchIndexerStep +from preprocessor.steps.text.analysis_step import TextAnalysisStep +from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.transcription_step import TranscriptionStep +from preprocessor.steps.validation.validator_step import ValidationStep +from preprocessor.steps.video.frame_export_step import FrameExporterStep +from preprocessor.steps.video.scene_detection_step import SceneDetectorStep +from preprocessor.steps.video.transcoding_step import VideoTranscoderStep +from preprocessor.steps.vision.character_detection_step import CharacterDetectorStep +from preprocessor.steps.vision.embeddings_step import VideoEmbeddingStep +from preprocessor.steps.vision.emotion_detection_step import EmotionDetectionStep +from preprocessor.steps.vision.face_clustering_step 
import FaceClusteringStep +from preprocessor.steps.vision.image_hashing_step import ImageHashStep +from preprocessor.steps.vision.object_detection_step import ObjectDetectionStep # Phase Definitions SCRAPING = Phase("SCRAPING", color="blue") @@ -39,9 +66,19 @@ VALIDATION = Phase("VALIDATION", color="magenta") -def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals +def _get_output_path_from_descriptor(step: StepBuilder, series_name: str, descriptor_idx: int = 0) -> str: + """Get resolved output path from step's OutputDescriptor.""" + descriptors = step.get_output_descriptors() + if not descriptors or descriptor_idx >= len(descriptors): + raise ValueError(f'Step {step.id} has no descriptor at index {descriptor_idx}') + + descriptor = descriptors[descriptor_idx] + base_dir = get_base_output_dir(series_name) + return str(descriptor.resolve_path(base_dir, {'series': series_name})) + + +def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals # Pipeline factory creates 21 step objects - each step needs clear naming for readability series_config = SeriesConfig.load(series_name) - output_dir = get_base_output_dir(series_name) # ========================================================= # SCRAPING PHASE @@ -49,13 +86,18 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t episodes_metadata = StepBuilder( id="scrape_episodes", phase=SCRAPING, - module="preprocessor.steps.scraping.episode_scraper_step:EpisodeScraperStep", + step_class=EpisodeScraperStep, description="Scrapes episode metadata from wiki", - produces=["episodes.json"], + produces=[ + JsonFileOutput( + pattern=f"{series_name}_episodes.json", + subdir="", + min_size_bytes=100, + ), + ], needs=[], config=EpisodeScraperConfig( urls=series_config.scraping.episodes.urls, - output_file=str(output_dir / f"{series_name}_episodes.json"), headless=True, merge_sources=True, scraper_method="crawl4ai", @@ -66,13 +108,18 
@@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t characters_metadata = StepBuilder( id="scrape_characters", phase=SCRAPING, - module="preprocessor.steps.scraping.character_scraper_step:CharacterScraperStep", + step_class=CharacterScraperStep, description="Scrapes character data from wiki", - produces=["characters.json"], + produces=[ + JsonFileOutput( + pattern=f"{series_name}_characters.json", + subdir="", + min_size_bytes=50, + ), + ], needs=[], config=CharacterScraperConfig( urls=series_config.scraping.characters.urls, - output_file=str(output_dir / f"{series_name}_characters.json"), headless=True, scraper_method="crawl4ai", parser_mode=series_config.scraping.characters.parser_mode, @@ -82,13 +129,19 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t character_references = StepBuilder( id="process_references", phase=SCRAPING, - module="preprocessor.steps.scraping.reference_processor_step:CharacterReferenceStep", + step_class=CharacterReferenceStep, description="Downloads and processes character reference images", - produces=["character_faces/{character}/*.jpg"], + produces=[ + DirectoryOutput( + pattern="character_faces", + subdir="", + expected_file_pattern="**/*.jpg", + min_files=1, + min_size_per_file_bytes=1024, + ), + ], needs=[characters_metadata], config=CharacterReferenceConfig( - characters_file=str(output_dir / f"{series_name}_characters.json"), - output_dir=str(output_dir / "character_faces"), search_engine=series_config.scraping.character_references.search_engine, images_per_character=series_config.scraping.character_references.images_per_character, ), @@ -98,9 +151,8 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # PROCESSING PHASE: VIDEO # ========================================================= resolution_analysis = StepBuilder( - id="resolution_analysis", phase=PROCESSING, - 
module="preprocessor.steps.analysis.resolution_analysis_step:ResolutionAnalysisStep", + step_class=ResolutionAnalysisStep, description="Analyze source video resolutions and warn if upscaling required", produces=[], needs=[], @@ -112,9 +164,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t transcoded_videos = StepBuilder( id="transcode", phase=PROCESSING, - module="preprocessor.steps.video.transcoding_step:VideoTranscoderStep", + step_class=VideoTranscoderStep, description=f"Conversion to h264_nvenc {series_config.processing.transcode.resolution} with adaptive bitrate", - produces=["transcoded_videos/{season}/{episode}.mp4"], + produces=[ + FileOutput( + pattern="{season}/{episode}.mp4", + subdir="transcoded_videos", + min_size_bytes=1024 * 1024, + ), + ], needs=[resolution_analysis], config=TranscodeConfig( max_bitrate_file_size_mb=series_config.processing.transcode.max_bitrate_file_size_mb, @@ -128,9 +186,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t scene_data = StepBuilder( id="detect_scenes", phase=PROCESSING, - module="preprocessor.steps.video.scene_detection_step:SceneDetectorStep", + step_class=SceneDetectorStep, description="Detects scene changes using TransNetV2", - produces=["scene_detections/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="scene_detections", + min_size_bytes=10, + ), + ], needs=[transcoded_videos], config=SceneDetectionConfig( threshold=series_config.processing.scene_detection.threshold, @@ -138,12 +202,14 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) + # Frame export output descriptor matches FrameExporterStep.get_output_descriptors() + # Defined here for pipeline validation before step instantiation exported_frames = StepBuilder( id="export_frames", phase=PROCESSING, - module="preprocessor.steps.video.frame_export_step:FrameExporterStep", + 
step_class=FrameExporterStep, description="Exports frames (PNG) at scene boundaries", - produces=["frames/{season}/{episode}/*.png"], + produces=[create_frames_output()], needs=[scene_data], config=FrameExportConfig( frames_per_scene=series_config.processing.frame_export.frames_per_scene, @@ -156,9 +222,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t transcription_data = StepBuilder( id="transcribe", phase=PROCESSING, - module="preprocessor.steps.text.transcription_step:TranscriptionStep", + step_class=TranscriptionStep, description=f"Audio transcription using {series_config.processing.transcription.mode}", - produces=["transcriptions/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="transcriptions", + min_size_bytes=50, + ), + ], needs=[transcoded_videos], config=WhisperTranscriptionConfig( model=series_config.processing.transcription.model, @@ -172,9 +244,17 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t separated_audio = StepBuilder( id="separate_sounds", phase=PROCESSING, - module="preprocessor.steps.audio.separation_step:SoundSeparationStep", + step_class=SoundSeparationStep, description="Separates dialogue from sound effects", - produces=["separated_audio/{season}/{episode}/"], + produces=[ + DirectoryOutput( + pattern="{season}/{episode}", + subdir="separated_audio", + expected_file_pattern="*.wav", + min_files=1, + min_size_per_file_bytes=1024, + ), + ], needs=[transcription_data], config=SoundSeparationConfig(), ) @@ -182,19 +262,30 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t text_stats = StepBuilder( id="analyze_text", phase=PROCESSING, - module="preprocessor.steps.text.analysis_step:TextAnalysisStep", + step_class=TextAnalysisStep, description="Analyzes text statistics (word frequency, sentiment)", - produces=["text_analysis/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + 
pattern="{season}/{episode}.json", + subdir="text_analysis", + min_size_bytes=50, + ), + ], needs=[transcription_data], config=TextAnalysisConfig(language=series_config.processing.transcription.language), ) text_embeddings = StepBuilder( - id="text_embeddings", phase=PROCESSING, - module="preprocessor.steps.text.embeddings_step:TextEmbeddingStep", + step_class=TextEmbeddingStep, description="Generates text embeddings using Qwen3-VL-Embedding", - produces=["embeddings/text/{season}/{episode}.npy"], + produces=[ + FileOutput( + pattern="{season}/{episode}.npy", + subdir="embeddings/text", + min_size_bytes=1024, + ), + ], needs=[text_stats], config=TextEmbeddingConfig( model_name="Qwen/Qwen3-VL-Embedding-8B", @@ -211,19 +302,30 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t image_hashes = StepBuilder( id="image_hashing", phase=PROCESSING, - module="preprocessor.steps.vision.image_hashing_step:ImageHashStep", + step_class=ImageHashStep, description="Perceptual frame hashing (phash, dhash, wavelet)", - produces=["hashes/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="hashes", + min_size_bytes=50, + ), + ], needs=[exported_frames], config=ImageHashConfig(batch_size=32), ) video_embeddings = StepBuilder( - id="video_embeddings", phase=PROCESSING, - module="preprocessor.steps.vision.embeddings_step:VideoEmbeddingStep", + step_class=VideoEmbeddingStep, description="Visual embeddings using Qwen3-VL-Embedding", - produces=["embeddings/vision/{season}/{episode}.npy"], + produces=[ + FileOutput( + pattern="{season}/{episode}.npy", + subdir="embeddings/vision", + min_size_bytes=1024, + ), + ], needs=[exported_frames, image_hashes], config=VideoEmbeddingConfig( model_name="Qwen/Qwen3-VL-Embedding-8B", @@ -235,9 +337,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t character_detections = StepBuilder( id="detect_characters", phase=PROCESSING, - 
module="preprocessor.steps.vision.character_detection_step:CharacterDetectorStep", + step_class=CharacterDetectorStep, description="Recognizes characters in frames using InsightFace", - produces=["detections/characters/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="detections/characters", + min_size_bytes=10, + ), + ], needs=[exported_frames], config=CharacterDetectionConfig(threshold=0.7), ) @@ -245,9 +353,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t emotion_data = StepBuilder( id="detect_emotions", phase=PROCESSING, - module="preprocessor.steps.vision.emotion_detection_step:EmotionDetectionStep", + step_class=EmotionDetectionStep, description="Detects emotions on faces using EmoNet", - produces=["detections/emotions/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="detections/emotions", + min_size_bytes=10, + ), + ], needs=[exported_frames], config=EmotionDetectionConfig(), ) @@ -255,9 +369,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t face_clusters = StepBuilder( id="cluster_faces", phase=PROCESSING, - module="preprocessor.steps.vision.face_clustering_step:FaceClusteringStep", + step_class=FaceClusteringStep, description="Face clustering using HDBSCAN", - produces=["clusters/faces/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="clusters/faces", + min_size_bytes=10, + ), + ], needs=[exported_frames], config=FaceClusteringConfig(), ) @@ -265,9 +385,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t object_detections = StepBuilder( id="detect_objects", phase=PROCESSING, - module="preprocessor.steps.vision.object_detection_step:ObjectDetectionStep", + step_class=ObjectDetectionStep, description="General object detection using D-FINE", - 
produces=["detections/objects/{season}/{episode}.json"], + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="detections/objects", + min_size_bytes=10, + ), + ], needs=[exported_frames], config=ObjectDetectionConfig(), ) @@ -278,9 +404,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t elastic_documents = StepBuilder( id="generate_elastic_docs", phase=INDEXING, - module="preprocessor.steps.search.document_generation_step:DocumentGeneratorStep", + step_class=DocumentGeneratorStep, description="Combines all data into Elasticsearch documents", - produces=["elastic_documents/{season}/{episode}.ndjson"], + produces=[ + FileOutput( + pattern="{season}/{episode}.ndjson", + subdir="elastic_documents", + min_size_bytes=100, + ), + ], needs=[ text_embeddings, video_embeddings, @@ -295,9 +427,15 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t episode_archives = StepBuilder( id="generate_archives", phase=INDEXING, - module="preprocessor.steps.packaging.archives_step:ArchiveGenerationStep", + step_class=ArchiveGenerationStep, description="Creates ZIP archives per episode (all artifacts)", - produces=["archives/{season}/{episode}.zip"], + produces=[ + FileOutput( + pattern="{season}/{episode}.zip", + subdir="archives", + min_size_bytes=1024 * 100, + ), + ], needs=[elastic_documents], config=ArchiveConfig(), ) @@ -305,9 +443,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t indexed_data = StepBuilder( id="index_to_elasticsearch", phase=INDEXING, - module="preprocessor.steps.search.indexing_step:ElasticsearchIndexerStep", + step_class=ElasticsearchIndexerStep, description="Indexes documents into Elasticsearch", - produces=[""], + produces=[], needs=[elastic_documents], config=ElasticsearchConfig( index_name=series_config.indexing.elasticsearch.index_name, @@ -321,11 +459,18 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: 
disable=t # VALIDATION PHASE # ========================================================= validation = StepBuilder( - id="validate", phase=VALIDATION, - module="preprocessor.steps.validation.validator_step:ValidationStep", + step_class=ValidationStep, description="Validates all processed data and generates reports", - produces=["validation_reports/{season}/"], + produces=[ + DirectoryOutput( + pattern="{season}", + subdir="validation_reports", + expected_file_pattern="*.json", + min_files=1, + min_size_per_file_bytes=50, + ), + ], needs=[indexed_data, episode_archives], config=ValidationConfig(), ) diff --git a/preprocessor/app/step_builder.py b/preprocessor/app/step_builder.py index 426a1c7f0..e4b917098 100644 --- a/preprocessor/app/step_builder.py +++ b/preprocessor/app/step_builder.py @@ -1,11 +1,24 @@ +from __future__ import annotations + from dataclasses import ( dataclass, field, ) -import importlib +from pathlib import Path +import re from typing import ( Any, + Dict, List, + Optional, + Type, + Union, +) + +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.output_descriptors import ( + OutputDescriptor, + ValidationResult, ) @@ -18,50 +31,60 @@ class Phase: @dataclass class StepBuilder: description: str - id: str - module: str + step_class: Type[PipelineStep] phase: Phase - produces: List[str] + produces: Union[List[str], List[OutputDescriptor]] + id: Optional[str] = None config: Any = None - needs: List["StepBuilder"] = field(default_factory=list) + needs: List[StepBuilder] = field(default_factory=list) @property def dependency_ids(self) -> List[str]: return [step.id for step in self.needs] - def load_class(self) -> type: - module_path, class_name = self.module.split(":") + def get_output_descriptors(self) -> List[OutputDescriptor]: + if not self.produces: + return [] + + if isinstance(self.produces[0], OutputDescriptor): + return self.produces + + return [] - try: - mod = importlib.import_module(module_path) - except ImportError 
as e: - raise ImportError( - f"Cannot load module '{module_path}' for step '{self.id}': {e}", - ) from e + def validate_outputs( + self, + base_dir: Path, + context_vars: Optional[Dict[str, str]] = None, + ) -> Dict[str, ValidationResult]: + results = {} + for idx, descriptor in enumerate(self.get_output_descriptors()): + result = descriptor.validate(base_dir, context_vars) + results[f'{self.id}_output_{idx}'] = result + return results - try: - return getattr(mod, class_name) - except AttributeError as e: - raise AttributeError( - f"Class '{class_name}' not found in module '{module_path}' for step '{self.id}': {e}", - ) from e + def get_dependency_outputs(self) -> Dict[str, List[OutputDescriptor]]: + return { + dep.id: dep.get_output_descriptors() + for dep in self.needs + } def __post_init__(self) -> None: + if self.id is None: + object.__setattr__(self, 'id', self.__generate_id_from_class()) self.__validate_id() - self.__validate_module_path() + + def __generate_id_from_class(self) -> str: + class_name = self.step_class.__name__ + class_name_without_step = re.sub(r'Step$', '', class_name) + snake_case = re.sub(r'(? None: - if not self.id.replace("_", "").replace("-", "").isalnum(): + if not self.id or not self.id.replace("_", "").replace("-", "").isalnum(): raise ValueError( f"Invalid step_id: '{self.id}'. Use only alphanumeric and underscores.", ) - def __validate_module_path(self) -> None: - if not self.module or ":" not in self.module: - raise ValueError( - f"Invalid module format for '{self.id}'. 
Expected 'package.module:ClassName'", - ) - def __eq__(self, other: object) -> bool: if not isinstance(other, StepBuilder): return False diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index f5e83171c..01489dc46 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -139,7 +139,7 @@ def __analyze_resolution(series: str) -> None: setup.logger.finalize() -def __execute_search_command(config: SearchConfig) -> None: # pylint: disable=too-many-statements +def __execute_search_command(config: SearchConfig) -> None: # pylint: disable=too-many-statements # Complex async search setup - splitting would reduce readability series_config = SeriesConfig.load(config.series) index_base = series_config.indexing.elasticsearch.index_name @@ -232,7 +232,7 @@ async def __run_async_search() -> None: @click.option("--stats", is_flag=True, help="Show index statistics") @click.option("--json-output", is_flag=True, help="Output in JSON format") @click.option("--host", type=str, default="http://localhost:9200", help="Elasticsearch host") -def search( # pylint: disable=too-many-arguments,too-many-locals +def search( # pylint: disable=too-many-arguments,too-many-locals # CLI command with many options - cannot refactor without breaking Click interface series: str, text: str, text_semantic: str, diff --git a/preprocessor/cli/search_handler.py b/preprocessor/cli/search_handler.py index 797552275..044ebda3d 100644 --- a/preprocessor/cli/search_handler.py +++ b/preprocessor/cli/search_handler.py @@ -52,12 +52,11 @@ class SearchCommandHandler: def __init__( self, es_client: AsyncElasticsearch, - embedding_service: EmbeddingService, + _embedding_service: EmbeddingService, queries: ElasticsearchQueries, json_output: bool, ) -> None: self.__es = es_client - self.__embedding = embedding_service # pylint: disable=unused-private-member self.__queries = queries self.__json_output = json_output diff --git a/preprocessor/config/config.py 
b/preprocessor/config/config.py index 176a33d1c..ecaad51c7 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -35,7 +35,7 @@ class TranscriptionSubdirs: @dataclass(frozen=True) -class OutputSubdirs: # pylint: disable=too-many-instance-attributes +class OutputSubdirs: # pylint: disable=too-many-instance-attributes # Configuration dataclass - all subdirs needed archives: str = 'archives' character_detections: str = 'character_detections' character_visualizations: str = 'character_detections/visualizations' @@ -269,7 +269,7 @@ def from_env(cls) -> 'GeminiSettings': @dataclass(frozen=True) -class Settings: # pylint: disable=too-many-instance-attributes +class Settings: # pylint: disable=too-many-instance-attributes # Main settings dataclass aggregating all subsettings character: CharacterSettings elasticsearch: ElasticsearchSettings elevenlabs: ElevenLabsSettings diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 49b9abb42..d522c0c02 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -173,7 +173,6 @@ class ValidationConfig(BaseModel): class EpisodeScraperConfig(BaseModel): headless: bool = True merge_sources: bool = True - output_file: str parser_mode: str = "normal" scraper_method: str = "crawl4ai" urls: List[str] @@ -181,15 +180,12 @@ class EpisodeScraperConfig(BaseModel): class CharacterScraperConfig(BaseModel): headless: bool = True - output_file: str parser_mode: str = "normal" scraper_method: str = "crawl4ai" urls: List[str] class CharacterReferenceConfig(BaseModel): - characters_file: str images_per_character: int = Field(default=5, ge=1, le=20) max_parallel_episodes: int = Field(default=4, ge=1, le=8) - output_dir: str search_engine: str = "duckduckgo" diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index c159d9735..2fb759826 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -1,18 
+1,18 @@ +from __future__ import annotations + from dataclasses import ( dataclass, field, ) from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, List, Optional, ) -if TYPE_CHECKING: - from preprocessor.services.episodes.episode_manager import EpisodeInfo +from preprocessor.services.episodes.types import EpisodeInfo @dataclass(frozen=True) @@ -23,7 +23,7 @@ class Artifact: @dataclass(frozen=True) class EpisodeArtifact(Artifact): episode_id: str - episode_info: 'EpisodeInfo' + episode_info: EpisodeInfo @dataclass(frozen=True) diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 6e9f1744c..78e017575 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ( ABC, abstractmethod, @@ -8,17 +10,19 @@ ) from pathlib import Path from typing import ( - TYPE_CHECKING, Callable, + Dict, Generic, List, + Optional, TypeVar, ) from pydantic import BaseModel -if TYPE_CHECKING: - from preprocessor.core.context import ExecutionContext +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import OutputDescriptor +from preprocessor.core.temp_files import StepTempFile InputT = TypeVar("InputT") OutputT = TypeVar("OutputT") @@ -42,23 +46,76 @@ def name(self) -> str: def is_global(self) -> bool: return False + def get_output_descriptors(self) -> List[OutputDescriptor]: + """ + Override in subclass to define step outputs. + Used for automatic output validation and path resolution. + """ + return [] + + def _resolve_output_path( + self, + descriptor_index: int, + context: ExecutionContext, + context_vars: Optional[Dict[str, str]] = None, + ) -> Path: + """ + Resolve output path from OutputDescriptor at given index. + Eliminates hardcoded subdirectories - uses descriptor definition. 
+ """ + descriptors = self.get_output_descriptors() + if not descriptors or descriptor_index >= len(descriptors): + raise ValueError( + f'Step {self.name} has no output descriptor at index {descriptor_index}', + ) + + descriptor = descriptors[descriptor_index] + return descriptor.resolve_path(context.base_output_dir, context_vars) + + def should_skip_execution( + self, episode_id: str, context: ExecutionContext, context_vars: Optional[Dict[str, str]] = None, + ) -> bool: + """ + Default caching logic - checks state manager and output validity. + Subclasses can call this at the start of execute() to skip if already done. + """ + if context.force_rerun: + return False + + if not context.is_step_completed(self.name, episode_id): + return False + + descriptors = self.get_output_descriptors() + if not descriptors: + return True + + for descriptor in descriptors: + result = descriptor.validate(context.base_output_dir, context_vars) + if not result.is_valid: + context.logger.warning( + f'{episode_id} - output invalid: {result.message}', + ) + return False + + return True + @abstractmethod - def execute(self, input_data: InputT, context: "ExecutionContext") -> OutputT: + def execute(self, input_data: InputT, context: ExecutionContext) -> OutputT: pass @property def supports_batch_processing(self) -> bool: return False - def setup_resources(self, context: "ExecutionContext") -> None: + def setup_resources(self, context: ExecutionContext) -> None: pass def execute_batch( - self, input_data: List[InputT], context: "ExecutionContext", + self, input_data: List[InputT], context: ExecutionContext, ) -> List[OutputT]: return [self.execute(item, context) for item in input_data] - def teardown_resources(self, context: "ExecutionContext") -> None: + def teardown_resources(self, context: ExecutionContext) -> None: pass def cleanup(self) -> None: @@ -67,7 +124,7 @@ def cleanup(self) -> None: def _check_cache_validity( self, output_path: Path, - context: "ExecutionContext", + context: 
ExecutionContext, episode_id: str, cache_description: str, ) -> bool: @@ -77,12 +134,43 @@ def _check_cache_validity( return True return False + def _check_output_validity( + self, + output_descriptor: OutputDescriptor, + context: ExecutionContext, + episode_id: str, + context_vars: Optional[Dict[str, str]] = None, + ) -> bool: + if context.force_rerun: + return False + + if not context.is_step_completed(self.name, episode_id): + return False + + validation_result = output_descriptor.validate( + context.base_output_dir, context_vars, + ) + + if validation_result.is_valid: + context.logger.info( + f'Skipping {episode_id} - output valid ' + f'({validation_result.file_count} files, ' + f'{validation_result.total_size_bytes} bytes)', + ) + return True + + context.logger.warning( + f'Output invalid for {episode_id}: {validation_result.message}', + ) + return False + + @staticmethod def _execute_with_threadpool( input_data: List[InputT], - context: "ExecutionContext", + context: ExecutionContext, max_workers: int, - executor_fn: Callable[[InputT, "ExecutionContext"], OutputT], + executor_fn: Callable[[InputT, ExecutionContext], OutputT], ) -> List[OutputT]: context.logger.info( f"Batch processing {len(input_data)} episodes with {max_workers} workers", @@ -104,8 +192,8 @@ def _execute_with_threadpool( @staticmethod def _execute_sequential( input_data: List[InputT], - context: "ExecutionContext", - executor_fn: Callable[[InputT, "ExecutionContext"], OutputT], + context: ExecutionContext, + executor_fn: Callable[[InputT, ExecutionContext], OutputT], ) -> List[OutputT]: context.logger.info( f"Batch processing {len(input_data)} episodes sequentially", @@ -117,3 +205,12 @@ def _execute_sequential( results.append(result) return results + + @staticmethod + def _atomic_write( + final_path: Path, + write_func: Callable[[Path], None], + temp_suffix: str = '.tmp', + ) -> None: + with StepTempFile(final_path, temp_suffix) as temp_path: + write_func(temp_path) diff --git 
a/preprocessor/core/context.py b/preprocessor/core/context.py index c6eb7a487..b50a395a9 100644 --- a/preprocessor/core/context.py +++ b/preprocessor/core/context.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, List, Optional, ) @@ -8,11 +9,9 @@ from preprocessor.config.config import Settings from preprocessor.config.settings_factory import SettingsFactory from preprocessor.core.model_pool import ModelPool +from preprocessor.core.state_manager import StateManager from preprocessor.services.core.logging import ErrorHandlingLogger - -if TYPE_CHECKING: - from preprocessor.core.state_manager import StateManager - from preprocessor.services.episodes.episode_manager import EpisodeInfo +from preprocessor.services.episodes.types import EpisodeInfo class ExecutionContext: @@ -21,20 +20,24 @@ def __init__( series_name: str, base_output_dir: Path, logger: ErrorHandlingLogger, - state_manager: Optional['StateManager'] = None, + state_manager: Optional[StateManager] = None, force_rerun: bool = False, disable_parallel: bool = False, settings_instance: Optional[Settings] = None, ) -> None: self.__series_name: str = series_name self.__base_output_dir: Path = base_output_dir / series_name - self.__state_manager: Optional['StateManager'] = state_manager + self.__state_manager: Optional[StateManager] = state_manager self.__force_rerun: bool = force_rerun self.__disable_parallel: bool = disable_parallel self.__logger: ErrorHandlingLogger = logger self.__settings: Settings = settings_instance or SettingsFactory.get_settings() self.__model_pool: ModelPool = ModelPool() + @property + def base_output_dir(self) -> Path: + return self.__base_output_dir + @property def disable_parallel(self) -> bool: return self.__disable_parallel @@ -61,11 +64,11 @@ def settings(self) -> Settings: return self.__settings @property - def state_manager(self) -> Optional['StateManager']: + def state_manager(self) -> Optional[StateManager]: 
return self.__state_manager def get_output_path( - self, episode_info: 'EpisodeInfo', subdir: str, filename: str, + self, episode_info: EpisodeInfo, subdir: str, filename: str, ) -> Path: season_code: str = episode_info.season_code() episode_code: str = episode_info.episode_num() @@ -75,7 +78,7 @@ def get_output_path( return path def get_season_output_path( - self, episode_info: 'EpisodeInfo', subdir: str, filename: str, + self, episode_info: EpisodeInfo, subdir: str, filename: str, ) -> Path: season_code: str = episode_info.season_code() diff --git a/preprocessor/core/output_descriptors.py b/preprocessor/core/output_descriptors.py new file mode 100644 index 000000000..03ea248f3 --- /dev/null +++ b/preprocessor/core/output_descriptors.py @@ -0,0 +1,285 @@ +from abc import ( + ABC, + abstractmethod, +) +from dataclasses import dataclass +import json +from pathlib import Path +from typing import ( + Callable, + Dict, + Optional, +) + + +@dataclass +class ValidationResult: + is_valid: bool + message: str = '' + file_count: int = 0 + total_size_bytes: int = 0 + + +class OutputDescriptor(ABC): + def __init__(self, pattern: str, subdir: str) -> None: + self._pattern = pattern + self._subdir = subdir + + @property + def pattern(self) -> str: + return self._pattern + + @property + def subdir(self) -> str: + return self._subdir + + @abstractmethod + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + pass + + @abstractmethod + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + pass + + def format_pattern(self, context_vars: Optional[Dict[str, str]] = None) -> str: + if not context_vars: + return self._pattern + return self._pattern.format(**context_vars) + + +class FileOutput(OutputDescriptor): + def __init__( + self, + pattern: str, + subdir: str, + min_size_bytes: int = 1, + expected_count: int = 1, + ) -> None: + super().__init__(pattern, subdir) + self._min_size_bytes = 
min_size_bytes + self._expected_count = expected_count + + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + formatted_pattern = self.format_pattern(context_vars) + return base_dir / self._subdir / formatted_pattern + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + file_path = self.resolve_path(base_dir, context_vars) + + if not file_path.exists(): + return ValidationResult( + is_valid=False, + message=f'File does not exist: {file_path}', + ) + + if not file_path.is_file(): + return ValidationResult( + is_valid=False, + message=f'Path exists but is not a file: {file_path}', + ) + + file_size = file_path.stat().st_size + + if file_size < self._min_size_bytes: + return ValidationResult( + is_valid=False, + message=f'File too small ({file_size} bytes < {self._min_size_bytes}): {file_path}', + file_count=1, + total_size_bytes=file_size, + ) + + return ValidationResult( + is_valid=True, + message=f'File valid: {file_path}', + file_count=1, + total_size_bytes=file_size, + ) + + +class DirectoryOutput(OutputDescriptor): + def __init__( + self, + pattern: str, + subdir: str, + expected_file_pattern: Optional[str] = None, + min_files: int = 1, + min_size_per_file_bytes: int = 1, + ) -> None: + super().__init__(pattern, subdir) + self._expected_file_pattern = expected_file_pattern + self._min_files = min_files + self._min_size_per_file_bytes = min_size_per_file_bytes + + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + formatted_pattern = self.format_pattern(context_vars) + return base_dir / self._subdir / formatted_pattern + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + dir_path = self.resolve_path(base_dir, context_vars) + + if not dir_path.exists(): + return ValidationResult( + is_valid=False, + message=f'Directory does not exist: {dir_path}', + ) + + if not 
dir_path.is_dir(): + return ValidationResult( + is_valid=False, + message=f'Path exists but is not a directory: {dir_path}', + ) + + if self._expected_file_pattern: + files = list(dir_path.glob(self._expected_file_pattern)) + else: + files = [f for f in dir_path.iterdir() if f.is_file()] + + if len(files) < self._min_files: + return ValidationResult( + is_valid=False, + message=( + f'Not enough files in directory ({len(files)} < {self._min_files}): ' + f'{dir_path}' + ), + file_count=len(files), + ) + + total_size = 0 + for file_path in files: + file_size = file_path.stat().st_size + total_size += file_size + + if file_size < self._min_size_per_file_bytes: + return ValidationResult( + is_valid=False, + message=( + f'File too small ({file_size} bytes < {self._min_size_per_file_bytes}): ' + f'{file_path}' + ), + file_count=len(files), + total_size_bytes=total_size, + ) + + return ValidationResult( + is_valid=True, + message=f'Directory valid: {dir_path} ({len(files)} files, {total_size} bytes)', + file_count=len(files), + total_size_bytes=total_size, + ) + + +class JsonFileOutput(FileOutput): + def __init__( + self, + pattern: str, + subdir: str, + min_size_bytes: int = 2, + schema_validator: Optional[Callable[[Dict], bool]] = None, + ) -> None: + super().__init__(pattern, subdir, min_size_bytes) + self._schema_validator = schema_validator + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + basic_validation = super().validate(base_dir, context_vars) + + if not basic_validation.is_valid: + return basic_validation + + file_path = self.resolve_path(base_dir, context_vars) + + try: + with open(file_path, 'r', encoding='utf-8') as f: + data = json.load(f) + except json.JSONDecodeError as e: + return ValidationResult( + is_valid=False, + message=f'Invalid JSON in {file_path}: {e}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + except Exception as e: + return ValidationResult( + 
is_valid=False, + message=f'Failed to read JSON from {file_path}: {e}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + + if self._schema_validator: + try: + if not self._schema_validator(data): + return ValidationResult( + is_valid=False, + message=f'JSON schema validation failed: {file_path}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + except Exception as e: + return ValidationResult( + is_valid=False, + message=f'Schema validation error for {file_path}: {e}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + + return ValidationResult( + is_valid=True, + message=f'JSON file valid: {file_path}', + file_count=1, + total_size_bytes=basic_validation.total_size_bytes, + ) + + +class GlobalOutput(OutputDescriptor): + def __init__(self, pattern: str, subdir: str = '') -> None: + super().__init__(pattern, subdir) + + def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: + formatted_pattern = self.format_pattern(context_vars) + if self._subdir: + return base_dir / self._subdir / formatted_pattern + return base_dir / formatted_pattern + + def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> ValidationResult: + file_path = self.resolve_path(base_dir, context_vars) + + if not file_path.exists(): + return ValidationResult( + is_valid=False, + message=f'Global output does not exist: {file_path}', + ) + + if file_path.is_file(): + file_size = file_path.stat().st_size + return ValidationResult( + is_valid=True, + message=f'Global file valid: {file_path}', + file_count=1, + total_size_bytes=file_size, + ) + + if file_path.is_dir(): + files = [f for f in file_path.rglob('*') if f.is_file()] + total_size = sum(f.stat().st_size for f in files) + return ValidationResult( + is_valid=True, + message=f'Global directory valid: {file_path} ({len(files)} files)', + file_count=len(files), + total_size_bytes=total_size, + ) + + return 
ValidationResult( + is_valid=False, + message=f'Global output path is neither file nor directory: {file_path}', + ) + + +def create_frames_output() -> DirectoryOutput: + """Create standard DirectoryOutput descriptor for exported frames.""" + return DirectoryOutput( + pattern="{season}/{episode}", + subdir="frames", + expected_file_pattern="*.png", + min_files=1, + min_size_per_file_bytes=1024, + ) diff --git a/preprocessor/core/temp_files.py b/preprocessor/core/temp_files.py new file mode 100644 index 000000000..3efd86c17 --- /dev/null +++ b/preprocessor/core/temp_files.py @@ -0,0 +1,37 @@ +from pathlib import Path +from typing import Optional + + +class StepTempFile: + def __init__(self, final_path: Path, temp_suffix: str = '.tmp') -> None: + self.__final_path: Path = final_path + self.__temp_suffix: str = temp_suffix + self.__temp_path: Optional[Path] = None + + @property + def final_path(self) -> Path: + return self.__final_path + + @property + def temp_path(self) -> Path: + if self.__temp_path is None: + raise RuntimeError('Context manager not entered yet') + return self.__temp_path + + def __enter__(self) -> Path: + self.__temp_path = self.__final_path.with_suffix( + f'{self.__final_path.suffix}{self.__temp_suffix}', + ) + self.__temp_path.parent.mkdir(parents=True, exist_ok=True) + return self.__temp_path + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + if self.__temp_path is None: + return False + + if exc_type is None: + self.__temp_path.replace(self.__final_path) + elif self.__temp_path.exists(): + self.__temp_path.unlink() + + return False diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index 9e785c0d7..ba5d172d1 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -68,9 +68,15 @@ def __load_model(self) -> None: class GeminiClient(BaseLLMClient): - __GEMINI_MODEL_NAME = 'gemini-2.5-flash' - - def __init__(self) -> None: + def __init__( + self, + model_name: str = 
'gemini-2.5-flash', + base_url: str = 'https://generativelanguage.googleapis.com/v1beta/openai/', + api_key: Optional[str] = None, + ) -> None: + self.__model_name = model_name + self.__base_url = base_url + self.__api_key = api_key or settings.gemini.api_key self.__client: Optional[OpenAI] = None self.__init_client() @@ -79,23 +85,22 @@ def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> s raise RuntimeError('Gemini client not initialized') response = self.__client.chat.completions.create( - model=self.__GEMINI_MODEL_NAME, + model=self.__model_name, messages=messages, # type: ignore[arg-type] ) return response.choices[0].message.content.strip() def __init_client(self) -> None: - console.print(f'[cyan]Initializing {self.__GEMINI_MODEL_NAME} via OpenAI SDK...[/cyan]') + console.print(f'[cyan]Initializing {self.__model_name} via OpenAI SDK...[/cyan]') try: - api_key = settings.gemini.api_key - if not api_key: + if not self.__api_key: raise ValueError('GEMINI_API_KEY not set in environment') self.__client = OpenAI( - base_url='https://generativelanguage.googleapis.com/v1beta/openai/', - api_key=api_key, + base_url=self.__base_url, + api_key=self.__api_key, ) - console.print(f'[green]{self.__GEMINI_MODEL_NAME} initialized[/green]') + console.print(f'[green]{self.__model_name} initialized[/green]') except Exception as e: console.print(f'[red]Failed to initialize Gemini client: {e}[/red]') raise diff --git a/preprocessor/services/core/base_processor.py b/preprocessor/services/core/base_processor.py index 67eabdf49..cf246818d 100644 --- a/preprocessor/services/core/base_processor.py +++ b/preprocessor/services/core/base_processor.py @@ -21,7 +21,7 @@ SimpleProgress, console, ) -from preprocessor.services.ui.progress import ProgressTracker +from preprocessor.services.ui.progress import OperationTracker @dataclass @@ -67,7 +67,7 @@ def __init__( self.state_manager: Optional[StateManager] = args.get('state_manager') self.series_name: str = 
args.get('series_name', 'unknown') self.path_manager: PathService = args.get('path_manager', PathService(self.series_name)) - self.progress = args.get('progress_tracker', ProgressTracker()) + self.progress = args.get('progress_tracker', OperationTracker('default', 0, 0.0)) def cleanup(self) -> None: pass diff --git a/preprocessor/services/episodes/__init__.py b/preprocessor/services/episodes/__init__.py index f4491ac3a..1026cd993 100644 --- a/preprocessor/services/episodes/__init__.py +++ b/preprocessor/services/episodes/__init__.py @@ -1,6 +1,4 @@ -from preprocessor.services.episodes.episode_manager import ( - EpisodeInfo, - EpisodeManager, -) +from preprocessor.services.episodes.episode_manager import EpisodeManager +from preprocessor.services.episodes.types import EpisodeInfo __all__ = ['EpisodeInfo', 'EpisodeManager'] diff --git a/preprocessor/services/episodes/episode_manager.py b/preprocessor/services/episodes/episode_manager.py index 6e3f58a32..1a91334c1 100644 --- a/preprocessor/services/episodes/episode_manager.py +++ b/preprocessor/services/episodes/episode_manager.py @@ -1,4 +1,3 @@ -from dataclasses import dataclass import json from pathlib import Path import re @@ -13,32 +12,10 @@ EpisodesDataKeys, ) from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.episodes.types import EpisodeInfo from preprocessor.services.io.path_service import PathService -@dataclass -class EpisodeInfo: - absolute_episode: int - relative_episode: int - season: int - title: str - premiere_date: Optional[str] = None - series_name: Optional[str] = None - viewership: Optional[str] = None - - def episode_code(self) -> str: - return f'S{self.season:02d}E{self.relative_episode:02d}' - - def episode_num(self) -> str: - return f'E{self.relative_episode:02d}' - - def season_code(self) -> str: - return f'S{self.season:02d}' - - def is_special(self) -> bool: - return self.season == 0 - - class EpisodeManager: def __init__( self, diff --git 
a/preprocessor/services/episodes/types.py b/preprocessor/services/episodes/types.py new file mode 100644 index 000000000..38de88b2f --- /dev/null +++ b/preprocessor/services/episodes/types.py @@ -0,0 +1,25 @@ +from dataclasses import dataclass +from typing import Optional + + +@dataclass +class EpisodeInfo: + absolute_episode: int + relative_episode: int + season: int + title: str + premiere_date: Optional[str] = None + series_name: Optional[str] = None + viewership: Optional[str] = None + + def episode_code(self) -> str: + return f'S{self.season:02d}E{self.relative_episode:02d}' + + def episode_num(self) -> str: + return f'E{self.relative_episode:02d}' + + def season_code(self) -> str: + return f'S{self.season:02d}' + + def is_special(self) -> bool: + return self.season == 0 diff --git a/preprocessor/services/io/files.py b/preprocessor/services/io/files.py index 906667311..e0b12e769 100644 --- a/preprocessor/services/io/files.py +++ b/preprocessor/services/io/files.py @@ -2,32 +2,20 @@ from pathlib import Path from typing import ( Any, - Callable, Dict, ) +from preprocessor.core.temp_files import StepTempFile + class FileOperations: @staticmethod def atomic_write_json(path: Path, data: Dict[str, Any], indent: int = 2) -> None: - def __write_temp(temp_path: Path) -> None: + with StepTempFile(path) as temp_path: with open(temp_path, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=indent) - FileOperations.__execute_atomic_write(path, __write_temp) - @staticmethod def load_json(path: Path) -> Dict[str, Any]: with open(path, 'r', encoding='utf-8') as f: return json.load(f) - - @staticmethod - def __execute_atomic_write(path: Path, write_func: Callable[[Path], None]) -> None: - temp_path = path.with_suffix(f'{path.suffix}.tmp') - try: - write_func(temp_path) - temp_path.replace(path) - except Exception: - if temp_path.exists(): - temp_path.unlink() - raise diff --git a/preprocessor/services/io/path_service.py 
b/preprocessor/services/io/path_service.py index 06d2f76ed..51edf5b72 100644 --- a/preprocessor/services/io/path_service.py +++ b/preprocessor/services/io/path_service.py @@ -1,14 +1,11 @@ +from __future__ import annotations + from pathlib import Path -from typing import ( - TYPE_CHECKING, - Optional, -) +from typing import Optional from preprocessor.config.output_paths import get_base_output_dir from preprocessor.services.core.environment import Environment - -if TYPE_CHECKING: - from preprocessor.services.episodes.episode_manager import EpisodeInfo +from preprocessor.services.episodes.types import EpisodeInfo class PathService: @@ -17,7 +14,7 @@ def __init__(self, series_name: str) -> None: def build_filename( self, - episode_info: 'EpisodeInfo', + episode_info: EpisodeInfo, extension: str = 'json', suffix: Optional[str] = None, ) -> str: @@ -25,7 +22,7 @@ def build_filename( suffix_str = f'_{suffix}' if suffix else '' return f'{base}{suffix_str}.{extension}' - def get_episode_dir(self, episode_info: 'EpisodeInfo', subdir: str) -> Path: + def get_episode_dir(self, episode_info: EpisodeInfo, subdir: str) -> Path: base_output_dir = get_base_output_dir(self.__series_name) return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_num() diff --git a/preprocessor/services/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py index 679e1042d..ef7b5fab5 100644 --- a/preprocessor/services/scraping/base_scraper_step.py +++ b/preprocessor/services/scraping/base_scraper_step.py @@ -13,6 +13,7 @@ from pydantic import BaseModel +from preprocessor.config.output_paths import get_base_output_dir from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext @@ -26,7 +27,7 @@ def is_global(self) -> bool: return True def execute(self, input_data: SourceVideo, context: ExecutionContext) -> Optional[SourceVideo]: - output_path = 
Path(self.config.output_file) + output_path = self.__resolve_output_path(context) if output_path.exists() and not context.force_rerun: context.logger.info(f"{self._get_metadata_type_name()} metadata already exists.") @@ -43,6 +44,11 @@ def execute(self, input_data: SourceVideo, context: ExecutionContext) -> Optiona context.logger.info(f"{self._get_metadata_type_name()} metadata saved to: {output_path}") return input_data + def __resolve_output_path(self, context: ExecutionContext) -> Path: + metadata_type = self._get_metadata_type_name().lower() + output_dir = get_base_output_dir(context.series_name) + return output_dir / f"{context.series_name}_{metadata_type}.json" + @abstractmethod def _get_scraper_class(self) -> Type: pass diff --git a/preprocessor/services/scraping/episode_scraper.py b/preprocessor/services/scraping/episode_scraper.py index d56de3902..c5ea22c1b 100644 --- a/preprocessor/services/scraping/episode_scraper.py +++ b/preprocessor/services/scraping/episode_scraper.py @@ -14,7 +14,6 @@ class EpisodeScraper(BaseScraper): def __init__(self, args: Dict[str, Any]) -> None: super().__init__(args) - self.__merge_sources: bool = self._args.get('merge_sources', True) # pylint: disable=unused-private-member self.__expected_episodes_count: Optional[int] = self._args.get('expected_episodes_count') self.__videos_dir: Optional[Path] = self._args.get('videos_dir') diff --git a/preprocessor/services/search/embedding_model.py b/preprocessor/services/search/embedding_model.py index 3368f75d2..80fa7bb56 100644 --- a/preprocessor/services/search/embedding_model.py +++ b/preprocessor/services/search/embedding_model.py @@ -11,21 +11,17 @@ class EmbeddingModelWrapper: def __init__( self, - model_name: str, - device: str = 'cuda', - batch_size: int = 8, + _model_name: str, + _device: str = 'cuda', + _batch_size: int = 8, ) -> None: - self.__model_name = model_name # pylint: disable=unused-private-member - self.__device = device # pylint: disable=unused-private-member - 
self.__batch_size = batch_size # pylint: disable=unused-private-member self.__service = EmbeddingService() - self.__loaded = False # pylint: disable=unused-private-member def load_model(self) -> None: - self.__loaded = True # pylint: disable=unused-private-member + pass def cleanup(self) -> None: - self.__loaded = False # pylint: disable=unused-private-member + pass def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]: if isinstance(text, str): diff --git a/preprocessor/services/text/import_step.py b/preprocessor/services/text/import_step.py index b9bb30127..f926a3af2 100644 --- a/preprocessor/services/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -1,8 +1,9 @@ +from __future__ import annotations + import json from pathlib import Path import re from typing import ( - TYPE_CHECKING, Any, Dict, List, @@ -14,10 +15,10 @@ from preprocessor.core.artifacts import TranscriptionData from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.episodes.episode_manager import EpisodeManager - -if TYPE_CHECKING: - from preprocessor.services.episodes.episode_manager import EpisodeInfo +from preprocessor.services.episodes.episode_manager import ( + EpisodeInfo, + EpisodeManager, +) class TranscriptionImportStep(PipelineStep[None, List[TranscriptionData], TranscriptionImportConfig]): @@ -83,7 +84,7 @@ def __import_single_file(self, json_file: Path, context: ExecutionContext) -> Op return self.__construct_new_artifact(episode_id, episode_info, output_path, converted_data) - def __resolve_episode_info(self, json_file: Path) -> Optional['EpisodeInfo']: + def __resolve_episode_info(self, json_file: Path) -> Optional[EpisodeInfo]: info = self.__episode_manager.parse_filename(json_file) if not info: season, episode = self.__extract_season_episode_fallback(json_file) @@ -105,7 +106,7 @@ def __should_skip_import(self, output_path: Path, episode_id: 
str, context: Exec return True return False - def __get_output_path(self, episode_info: 'EpisodeInfo', context: ExecutionContext) -> Path: + def __get_output_path(self, episode_info: EpisodeInfo, context: ExecutionContext) -> Path: filename = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') return context.get_output_path(episode_info, 'transcriptions', filename) @@ -190,7 +191,7 @@ def __save_converted_data(output_path: Path, data: Dict[str, Any]) -> None: json.dump(data, f, indent=2, ensure_ascii=False) @staticmethod - def __construct_cached_artifact(episode_id: str, info: 'EpisodeInfo', path: Path) -> TranscriptionData: + def __construct_cached_artifact(episode_id: str, info: EpisodeInfo, path: Path) -> TranscriptionData: return TranscriptionData( episode_id=episode_id, episode_info=info, path=path, language='pl', model='11labs', format='json', @@ -198,7 +199,7 @@ def __construct_cached_artifact(episode_id: str, info: 'EpisodeInfo', path: Path @staticmethod def __construct_new_artifact( - episode_id: str, info: 'EpisodeInfo', path: Path, + episode_id: str, info: EpisodeInfo, path: Path, data: Dict[str, Any], ) -> TranscriptionData: trans_meta = data.get('transcription', {}) diff --git a/preprocessor/services/text/text_statistics.py b/preprocessor/services/text/text_statistics.py index 25a48e005..f8605e768 100644 --- a/preprocessor/services/text/text_statistics.py +++ b/preprocessor/services/text/text_statistics.py @@ -18,7 +18,7 @@ @dataclass -class TextStatistics: # pylint: disable=too-many-instance-attributes +class TextStatistics: # pylint: disable=too-many-instance-attributes # Data structure for comprehensive text statistics - all attributes necessary text: str language: str = 'pl' @@ -85,7 +85,7 @@ def to_dict(self) -> Dict[str, Any]: 'trigrams': self.trigrams, } - def __process_calculations(self) -> None: # pylint: disable=unused-private-member + def __process_calculations(self) -> None: # pylint: 
disable=unused-private-member # Called in from_file (line 54) - false positive self.__calculate_structural_stats() self.__calculate_character_distribution() self.__calculate_lexical_stats() diff --git a/preprocessor/services/ui/__init__.py b/preprocessor/services/ui/__init__.py index 3a67e434f..229e0e3c6 100644 --- a/preprocessor/services/ui/__init__.py +++ b/preprocessor/services/ui/__init__.py @@ -2,9 +2,6 @@ SimpleProgress, console, ) -from preprocessor.services.ui.progress import ( - OperationTracker, - ProgressTracker, -) +from preprocessor.services.ui.progress import OperationTracker -__all__ = ['console', 'SimpleProgress', 'ProgressTracker', 'OperationTracker'] +__all__ = ['console', 'SimpleProgress', 'OperationTracker'] diff --git a/preprocessor/services/ui/console.py b/preprocessor/services/ui/console.py index 4269068e3..26e27699d 100644 --- a/preprocessor/services/ui/console.py +++ b/preprocessor/services/ui/console.py @@ -15,7 +15,7 @@ def __get_console() -> Console: - global _console_instance # pylint: disable=global-statement + global _console_instance # pylint: disable=global-statement # Singleton pattern - global required for module-level instance if _console_instance is None: _console_instance = __initialize_rich_console() return _console_instance diff --git a/preprocessor/services/ui/progress.py b/preprocessor/services/ui/progress.py index e27e40e41..b971739c8 100644 --- a/preprocessor/services/ui/progress.py +++ b/preprocessor/services/ui/progress.py @@ -1,16 +1,9 @@ import time -from typing import Optional from preprocessor.services.core.time import TimeFormatter from preprocessor.services.ui.console import console -class ProgressTracker: - def __init__(self) -> None: - self.__current_operation: Optional[str] = None # pylint: disable=unused-private-member - self.__start_time: Optional[float] = None # pylint: disable=unused-private-member - - class OperationTracker: def __init__(self, operation_name: str, total: int, start_time: float) -> None: 
self.__operation_name = operation_name diff --git a/preprocessor/services/validation/episode_stats.py b/preprocessor/services/validation/episode_stats.py index 7728b1f1a..ba7fb3cc0 100644 --- a/preprocessor/services/validation/episode_stats.py +++ b/preprocessor/services/validation/episode_stats.py @@ -12,17 +12,6 @@ from preprocessor.services.episodes import EpisodeInfo from preprocessor.services.validation.base_result import ValidationStatusMixin -from preprocessor.services.validation.validators import ( - CharacterValidator, - ElasticValidator, - FaceClusterValidator, - FrameValidator, - ImageHashValidator, - ObjectValidator, - SceneValidator, - TranscriptionValidator, - VideoValidator, -) class EpisodeStatsData(TypedDict, total=False): @@ -53,16 +42,28 @@ class EpisodeStats(ValidationStatusMixin): video_size_mb: Optional[float] = None scenes_count: Optional[int] = None - def __post_init__(self) -> None: - self.__validators = [ + def collect_stats(self) -> None: + # pylint: disable=import-outside-toplevel # Necessary to avoid circular import (validators import EpisodeStats) + from preprocessor.services.validation.validators import ( + CharacterValidator, + ElasticValidator, + FaceClusterValidator, + FrameValidator, + ImageHashValidator, + ObjectValidator, + SceneValidator, + TranscriptionValidator, + VideoValidator, + ) + + validators = [ TranscriptionValidator(), FrameValidator(), VideoValidator(), SceneValidator(), ImageHashValidator(), CharacterValidator(), FaceClusterValidator(), ObjectValidator(), ElasticValidator(), ] - def collect_stats(self) -> None: - for v in self.__validators: - v.validate(self) + for validator in validators: + validator.validate(self) def to_dict(self) -> Dict[str, Any]: return { diff --git a/preprocessor/services/validation/validators/base_validator.py b/preprocessor/services/validation/validators/base_validator.py index 289790102..cecd6650a 100644 --- a/preprocessor/services/validation/validators/base_validator.py +++ 
b/preprocessor/services/validation/validators/base_validator.py @@ -1,3 +1,5 @@ +from __future__ import annotations + from abc import ( ABC, abstractmethod, @@ -5,26 +7,23 @@ import json from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, Optional, ) +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.file_validators import FileValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class BaseValidator(ABC): @abstractmethod - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: pass @staticmethod def _check_path_exists( - path: Path, stats: 'EpisodeStats', error_msg: str, + path: Path, stats: EpisodeStats, error_msg: str, ) -> bool: if not path.exists(): stats.errors.append(error_msg) @@ -32,16 +31,16 @@ def _check_path_exists( return True @staticmethod - def _add_warning(stats: 'EpisodeStats', message: str) -> None: + def _add_warning(stats: EpisodeStats, message: str) -> None: stats.warnings.append(message) @staticmethod - def _add_error(stats: 'EpisodeStats', message: str) -> None: + def _add_error(stats: EpisodeStats, message: str) -> None: stats.errors.append(message) @staticmethod def _validate_json_if_exists( - stats: 'EpisodeStats', + stats: EpisodeStats, file_path: Path, error_msg_prefix: str, ) -> bool: @@ -56,7 +55,7 @@ def _validate_json_if_exists( @staticmethod def _validate_json_with_warning( - stats: 'EpisodeStats', + stats: EpisodeStats, file_path: Path, missing_msg: str, invalid_msg_prefix: str, @@ -73,7 +72,7 @@ def _validate_json_with_warning( @staticmethod def _validate_json_with_error( - stats: 'EpisodeStats', + stats: EpisodeStats, file_path: Path, missing_msg: str, invalid_msg_prefix: str, diff --git a/preprocessor/services/validation/validators/character_validator.py b/preprocessor/services/validation/validators/character_validator.py index a1ac2f0b9..f82598878 
100644 --- a/preprocessor/services/validation/validators/character_validator.py +++ b/preprocessor/services/validation/validators/character_validator.py @@ -1,15 +1,13 @@ -from typing import TYPE_CHECKING +from __future__ import annotations from preprocessor.config.settings_instance import settings +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.validators.base_validator import BaseValidator from preprocessor.services.validation.validators.validation_helpers import VisualizationValidationHelper -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class CharacterValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: VisualizationValidationHelper.validate_visualizations( stats, settings.output_subdirs.character_visualizations, diff --git a/preprocessor/services/validation/validators/elastic_validator.py b/preprocessor/services/validation/validators/elastic_validator.py index 0d44c794b..de2196e4c 100644 --- a/preprocessor/services/validation/validators/elastic_validator.py +++ b/preprocessor/services/validation/validators/elastic_validator.py @@ -1,7 +1,8 @@ +from __future__ import annotations + import json from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, ) @@ -9,23 +10,21 @@ from preprocessor.config.constants import OUTPUT_FILE_NAMES from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - ELASTIC_SUBDIRS = settings.output_subdirs.elastic_document_subdirs class 
ElasticValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: self.__validate_character_detections(stats) self.__validate_embeddings(stats) self.__validate_elastic_documents(stats) self.__validate_text_statistics(stats) - def __validate_character_detections(self, stats: 'EpisodeStats') -> None: + def __validate_character_detections(self, stats: EpisodeStats) -> None: char_detections_dir = self.__get_dir(stats, settings.output_subdirs.character_detections) detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] @@ -35,7 +34,7 @@ def __validate_character_detections(self, stats: 'EpisodeStats') -> None: error_msg_prefix=f"Invalid {OUTPUT_FILE_NAMES['detections']}", ) - def __validate_embeddings(self, stats: 'EpisodeStats') -> None: + def __validate_embeddings(self, stats: EpisodeStats) -> None: embeddings_dir = self.__get_dir(stats, settings.output_subdirs.embeddings) if embeddings_dir.exists(): embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] @@ -45,7 +44,7 @@ def __validate_embeddings(self, stats: 'EpisodeStats') -> None: error_msg_prefix=f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}", ) - def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: + def __validate_elastic_documents(self, stats: EpisodeStats) -> None: subdirs_to_check = [ ELASTIC_SUBDIRS.text_segments, ELASTIC_SUBDIRS.text_embeddings, ELASTIC_SUBDIRS.video_frames, ELASTIC_SUBDIRS.episode_names, @@ -65,7 +64,7 @@ def __validate_elastic_documents(self, stats: 'EpisodeStats') -> None: if not found_any: self._add_warning(stats, f'Missing {settings.output_subdirs.elastic_documents} directory') - def __process_jsonl_files(self, stats: 'EpisodeStats', docs_dir: Path, subdir: str) -> None: + def __process_jsonl_files(self, stats: EpisodeStats, docs_dir: Path, subdir: str) -> None: for jsonl_file in docs_dir.glob('*.jsonl'): result = FileValidator.validate_jsonl_file(jsonl_file) if 
not result.is_valid: @@ -73,7 +72,7 @@ def __process_jsonl_files(self, stats: 'EpisodeStats', docs_dir: Path, subdir: s else: self.__validate_embedding_dimensions(stats, jsonl_file, subdir) - def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: + def __validate_text_statistics(self, stats: EpisodeStats) -> None: trans_dir = self.__get_dir(stats, settings.output_subdirs.transcriptions) if trans_dir.exists(): clean_subdir = settings.output_subdirs.transcription_subdirs.clean @@ -86,7 +85,7 @@ def __validate_text_statistics(self, stats: 'EpisodeStats') -> None: else: self._add_warning(stats, f'Missing text statistics file: {text_stats_file.name}') - def __validate_embedding_dimensions(self, stats: 'EpisodeStats', jsonl_file: Path, subdir: str) -> None: + def __validate_embedding_dimensions(self, stats: EpisodeStats, jsonl_file: Path, subdir: str) -> None: embedding_fields = { ELASTIC_SUBDIRS.text_embeddings: 'text_embedding', ELASTIC_SUBDIRS.video_frames: 'video_embedding', @@ -112,7 +111,7 @@ def __validate_embedding_dimensions(self, stats: 'EpisodeStats', jsonl_file: Pat self._add_error(stats, f'Error validating embeddings in {jsonl_file.name}: {e}') def __check_doc_dimension( - self, stats: 'EpisodeStats', doc: Dict[str, Any], field: str, expected: int, fname: str, + self, stats: EpisodeStats, doc: Dict[str, Any], field: str, expected: int, fname: str, lnum: int, ) -> None: if field in doc and isinstance(doc[field], list): @@ -121,5 +120,5 @@ def __check_doc_dimension( self._add_error(stats, f'{fname} line {lnum}: {field} has {actual} dim, expected {expected}') @staticmethod - def __get_dir(stats: 'EpisodeStats', subdir: str) -> Path: + def __get_dir(stats: EpisodeStats, subdir: str) -> Path: return PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py index 
8e011655c..ceaa5b949 100644 --- a/preprocessor/services/validation/validators/face_cluster_validator.py +++ b/preprocessor/services/validation/validators/face_cluster_validator.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, Optional, @@ -8,14 +9,12 @@ from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.validators.base_validator import BaseValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class FaceClusterValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: clusters_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.face_clusters, ) @@ -40,7 +39,7 @@ def __get_metadata_file(clusters_dir: Path) -> Optional[Path]: files = list(clusters_dir.glob('*_face_clusters.json')) return files[0] if files else None - def __parse_cluster_stats(self, stats: 'EpisodeStats', data: Dict[str, Any]) -> None: + def __parse_cluster_stats(self, stats: EpisodeStats, data: Dict[str, Any]) -> None: clusters = data.get('clusters', {}) if isinstance(clusters, (dict, list)): diff --git a/preprocessor/services/validation/validators/frame_validator.py b/preprocessor/services/validation/validators/frame_validator.py index 82c976e0c..14ed09791 100644 --- a/preprocessor/services/validation/validators/frame_validator.py +++ b/preprocessor/services/validation/validators/frame_validator.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, List, Tuple, ) @@ -8,15 +9,13 @@ from preprocessor.config.constants import OUTPUT_FILE_PATTERNS from preprocessor.config.settings_instance import settings from 
preprocessor.services.io.path_service import PathService +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class FrameValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: frames_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.frames, ) @@ -32,13 +31,13 @@ def validate(self, stats: 'EpisodeStats') -> None: stats.exported_frames_count = len(frame_files) self.__process_frames(stats, frame_files) - def __check_dir(self, stats: 'EpisodeStats', frames_dir: Path) -> bool: + def __check_dir(self, stats: EpisodeStats, frames_dir: Path) -> bool: if not frames_dir.exists(): self._add_warning(stats, f'Missing {settings.output_subdirs.frames} directory') return False return True - def __process_frames(self, stats: 'EpisodeStats', frame_files: List[Path]) -> None: + def __process_frames(self, stats: EpisodeStats, frame_files: List[Path]) -> None: total_size = 0.0 resolutions: List[Tuple[int, int]] = [] invalid_count = 0 diff --git a/preprocessor/services/validation/validators/image_hash_validator.py b/preprocessor/services/validation/validators/image_hash_validator.py index 91f9b101f..1d8534ad8 100644 --- a/preprocessor/services/validation/validators/image_hash_validator.py +++ b/preprocessor/services/validation/validators/image_hash_validator.py @@ -1,15 +1,13 @@ -from typing import TYPE_CHECKING +from __future__ import annotations from preprocessor.config.settings_instance import settings +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.validators.base_validator import BaseValidator from 
preprocessor.services.validation.validators.validation_helpers import JsonDirectoryValidationHelper -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class ImageHashValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: JsonDirectoryValidationHelper.validate_json_directory( stats, settings.output_subdirs.image_hashes, diff --git a/preprocessor/services/validation/validators/object_validator.py b/preprocessor/services/validation/validators/object_validator.py index a4dd9240a..7e7c88e3b 100644 --- a/preprocessor/services/validation/validators/object_validator.py +++ b/preprocessor/services/validation/validators/object_validator.py @@ -1,23 +1,21 @@ -from typing import TYPE_CHECKING +from __future__ import annotations from preprocessor.config.settings_instance import settings +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.validators.base_validator import BaseValidator from preprocessor.services.validation.validators.validation_helpers import ( JsonDirectoryValidationHelper, VisualizationValidationHelper, ) -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class ObjectValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: self.__validate_object_detections(stats) self.__validate_object_visualizations(stats) @staticmethod - def __validate_object_detections(stats: 'EpisodeStats') -> None: + def __validate_object_detections(stats: EpisodeStats) -> None: JsonDirectoryValidationHelper.validate_json_directory( stats, settings.output_subdirs.object_detections, @@ -27,7 +25,7 @@ def __validate_object_detections(stats: 'EpisodeStats') -> None: ) @staticmethod - def __validate_object_visualizations(stats: 'EpisodeStats') -> None: + def __validate_object_visualizations(stats: 
EpisodeStats) -> None: VisualizationValidationHelper.validate_visualizations( stats, settings.output_subdirs.object_visualizations, diff --git a/preprocessor/services/validation/validators/scene_validator.py b/preprocessor/services/validation/validators/scene_validator.py index e97340294..f8abe13c9 100644 --- a/preprocessor/services/validation/validators/scene_validator.py +++ b/preprocessor/services/validation/validators/scene_validator.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, List, @@ -9,15 +10,13 @@ from preprocessor.config.constants import OUTPUT_FILE_PATTERNS from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class SceneValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: scenes_file = self.__resolve_scenes_file(stats) if not self._check_path_exists(scenes_file, stats, f'Missing scenes file: {scenes_file}'): @@ -31,14 +30,14 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__extract_scene_stats(stats, data) @staticmethod - def __resolve_scenes_file(stats: 'EpisodeStats') -> Path: + def __resolve_scenes_file(stats: EpisodeStats) -> Path: scenes_dir = PathService(stats.series_name).get_episode_dir( stats.episode_info, settings.output_subdirs.scenes, ) suffix = OUTPUT_FILE_PATTERNS['scenes_suffix'] return scenes_dir / f"{stats.series_name}_{stats.episode_info.episode_code()}{suffix}" - def __validate_json_integrity(self, stats: 'EpisodeStats', file_path: Path) -> bool: + def 
__validate_json_integrity(self, stats: EpisodeStats, file_path: Path) -> bool: result = FileValidator.validate_json_file(file_path) if not result.is_valid: self._add_error(stats, f'Invalid scenes JSON: {result.error_message}') @@ -46,7 +45,7 @@ def __validate_json_integrity(self, stats: 'EpisodeStats', file_path: Path) -> b return True @staticmethod - def __extract_scene_stats(stats: 'EpisodeStats', data: Dict[str, Any]) -> None: + def __extract_scene_stats(stats: EpisodeStats, data: Dict[str, Any]) -> None: stats.scenes_count = data.get('total_scenes', 0) scenes: List[Dict[str, Any]] = data.get('scenes', []) diff --git a/preprocessor/services/validation/validators/transcription_validator.py b/preprocessor/services/validation/validators/transcription_validator.py index 21cc3dbe2..8c2825e34 100644 --- a/preprocessor/services/validation/validators/transcription_validator.py +++ b/preprocessor/services/validation/validators/transcription_validator.py @@ -1,6 +1,7 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, Any, Dict, List, @@ -8,14 +9,12 @@ from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.validators.base_validator import BaseValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class TranscriptionValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: trans_files = self.__resolve_file_map(stats) if not any(f.exists() for f in trans_files.values()): @@ -28,7 +27,7 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__validate_sound_events(stats, trans_files['sound_events']) def __validate_raw_transcription( - self, stats: 'EpisodeStats', trans_files: Dict[str, Path], + self, stats: EpisodeStats, 
trans_files: Dict[str, Path], ) -> None: # Try to find any available raw format raw_path = next((trans_files[k] for k in ('main', 'segmented', 'simple') if trans_files[k].exists()), None) @@ -40,7 +39,7 @@ def __validate_raw_transcription( if self._validate_json_if_exists(stats, raw_path, "Invalid transcription JSON"): self.__extract_transcription_metrics(stats, raw_path) - def __extract_transcription_metrics(self, stats: 'EpisodeStats', raw_path: Path) -> None: + def __extract_transcription_metrics(self, stats: EpisodeStats, raw_path: Path) -> None: data = self._load_json_safely(raw_path) if not data: self._add_error(stats, f'Error reading transcription: {raw_path}') @@ -70,18 +69,18 @@ def __determine_duration(data: Dict[str, Any]) -> float: return segments[-1].get('end', 0.0) return 0.0 - def __validate_clean_transcription(self, stats: 'EpisodeStats', file_path: Path) -> None: + def __validate_clean_transcription(self, stats: EpisodeStats, file_path: Path) -> None: self._validate_json_with_warning( stats, file_path, missing_msg=f'Missing clean transcription: {file_path.name}', invalid_msg_prefix='Invalid clean transcription JSON', ) - def __validate_clean_txt(self, stats: 'EpisodeStats', file_path: Path) -> None: + def __validate_clean_txt(self, stats: EpisodeStats, file_path: Path) -> None: if not file_path.exists(): self._add_warning(stats, f'Missing clean transcription txt: {file_path.name}') - def __validate_sound_events(self, stats: 'EpisodeStats', file_path: Path) -> None: + def __validate_sound_events(self, stats: EpisodeStats, file_path: Path) -> None: self._validate_json_with_warning( stats, file_path, missing_msg=f'Missing sound events: {file_path.name}', @@ -89,7 +88,7 @@ def __validate_sound_events(self, stats: 'EpisodeStats', file_path: Path) -> Non ) @staticmethod - def __resolve_file_map(stats: 'EpisodeStats') -> Dict[str, Path]: + def __resolve_file_map(stats: EpisodeStats) -> Dict[str, Path]: path_svc = PathService(stats.series_name) trans_dir = 
path_svc.get_episode_dir(stats.episode_info, settings.output_subdirs.transcriptions) base = f'{stats.series_name}_{stats.episode_info.episode_code()}' diff --git a/preprocessor/services/validation/validators/validation_helpers.py b/preprocessor/services/validation/validators/validation_helpers.py index 6503c2f91..2ede1d4b4 100644 --- a/preprocessor/services/validation/validators/validation_helpers.py +++ b/preprocessor/services/validation/validators/validation_helpers.py @@ -1,22 +1,21 @@ +from __future__ import annotations + from pathlib import Path from typing import ( - TYPE_CHECKING, List, Optional, Tuple, ) from preprocessor.services.io.path_service import PathService +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.file_validators import FileValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class JsonDirectoryValidationHelper: @staticmethod def validate_json_directory( - stats: 'EpisodeStats', + stats: EpisodeStats, subdir: str, count_attr: Optional[str], context_name: str, @@ -70,7 +69,7 @@ def __analyze_json_files( @staticmethod def __perform_size_anomaly_check( - stats: 'EpisodeStats', + stats: EpisodeStats, sizes: List[int], folder_name: str, threshold: float = 0.2, @@ -93,7 +92,7 @@ def __perform_size_anomaly_check( class VisualizationValidationHelper: @staticmethod def validate_visualizations( - stats: 'EpisodeStats', + stats: EpisodeStats, subdir: str, count_attr: str, context_name: str, diff --git a/preprocessor/services/validation/validators/video_validator.py b/preprocessor/services/validation/validators/video_validator.py index d7e75c049..7b691b723 100644 --- a/preprocessor/services/validation/validators/video_validator.py +++ b/preprocessor/services/validation/validators/video_validator.py @@ -1,18 +1,17 @@ +from __future__ import annotations + from pathlib import Path -from typing import TYPE_CHECKING from 
preprocessor.config.constants import DEFAULT_VIDEO_EXTENSION from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.settings_instance import settings +from preprocessor.services.validation.episode_stats import EpisodeStats from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator -if TYPE_CHECKING: - from preprocessor.services.validation.episode_stats import EpisodeStats - class VideoValidator(BaseValidator): - def validate(self, stats: 'EpisodeStats') -> None: + def validate(self, stats: EpisodeStats) -> None: video_path = self.__resolve_video_file_path(stats) if not video_path.exists(): @@ -27,7 +26,7 @@ def validate(self, stats: 'EpisodeStats') -> None: self.__populate_video_metrics(stats, result.metadata) @staticmethod - def __resolve_video_file_path(stats: 'EpisodeStats') -> Path: + def __resolve_video_file_path(stats: EpisodeStats) -> Path: filename = f'{stats.series_name.lower()}_{stats.episode_info.episode_code()}{DEFAULT_VIDEO_EXTENSION}' season_dir = ( get_base_output_dir(stats.series_name) / @@ -37,7 +36,7 @@ def __resolve_video_file_path(stats: 'EpisodeStats') -> Path: return season_dir / filename @staticmethod - def __populate_video_metrics(stats: 'EpisodeStats', metadata: dict) -> None: + def __populate_video_metrics(stats: EpisodeStats, metadata: dict) -> None: stats.video_size_mb = metadata['size_mb'] stats.video_duration = metadata['duration'] stats.video_codec = metadata['codec'] diff --git a/preprocessor/services/video/image_hasher.py b/preprocessor/services/video/image_hasher.py index 1b2a4d511..b7dab14f3 100644 --- a/preprocessor/services/video/image_hasher.py +++ b/preprocessor/services/video/image_hasher.py @@ -66,15 +66,5 @@ def compute_phash_batch(self, images: List[Image.Image]) -> List[str]: return hashes - def __compute_hash(self, image_tensor: torch.Tensor) -> int: # pylint: 
disable=unused-private-member - if self.model is None: - raise RuntimeError('Model not initialized or already cleaned up') - with torch.no_grad(): - features = self.model(image_tensor) - features = F.adaptive_avg_pool2d(features, (1, 1)) - features = features.flatten() - hash_bits = (features > features.median()).int() - hash_val = int(''.join([str(bit.item()) for bit in hash_bits.tolist()[:64]]), 2) - return hash_val __all__ = ['PerceptualHasher'] diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index d6f0dd54b..61c003e7a 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -1,6 +1,6 @@ from collections import Counter +from dataclasses import dataclass from datetime import datetime -import json from pathlib import Path from typing import ( Any, @@ -13,10 +13,26 @@ from preprocessor.core.artifacts import ResolutionAnalysisResult from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.services.io.files import FileOperations from preprocessor.services.io.path_service import PathService from preprocessor.services.media.ffmpeg import FFmpegWrapper +@dataclass(frozen=True) +class _AnalysisData: + video_info: List[Dict[str, Any]] + resolution_counts: Counter + total_episodes: int + target_width: int + target_height: int + target_pixels: int + upscaling_count: int + upscaling_pct: float + progressive_count: int + needs_deinterlace_count: int + metadata_mismatch_count: int + + class ResolutionAnalysisStep(PipelineStep[None, ResolutionAnalysisResult, ResolutionAnalysisConfig]): @property def name(self) -> str: @@ -194,50 +210,38 @@ def __save_results_to_json( progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') - result = 
self.__build_analysis_payload( - context, - video_info, - resolution_counts, - total_episodes, - target_width, - target_height, - target_pixels, - upscaling_count, - upscaling_pct, - progressive_count, - needs_deinterlace_count, - metadata_mismatch_count, + analysis_data = _AnalysisData( + video_info=video_info, + resolution_counts=resolution_counts, + total_episodes=total_episodes, + target_width=target_width, + target_height=target_height, + target_pixels=target_pixels, + upscaling_count=upscaling_count, + upscaling_pct=upscaling_pct, + progressive_count=progressive_count, + needs_deinterlace_count=needs_deinterlace_count, + metadata_mismatch_count=metadata_mismatch_count, ) - with open(output_file, 'w', encoding='utf-8') as f: - json.dump(result, f, indent=2, ensure_ascii=False) - + result = self.__build_analysis_payload(context, analysis_data) + FileOperations.atomic_write_json(output_file, result, indent=2) context.logger.info(f'Resolution analysis saved to: {output_file}') - def __build_analysis_payload( # pylint: disable=too-many-arguments + def __build_analysis_payload( self, context: ExecutionContext, - video_info: List[Dict[str, Any]], - resolution_counts: Counter, - total_episodes: int, - target_width: int, - target_height: int, - target_pixels: int, - upscaling_count: int, - upscaling_pct: float, - progressive_count: int, - needs_deinterlace_count: int, - metadata_mismatch_count: int, + data: _AnalysisData, ) -> Dict[str, Any]: source_resolutions = [ { 'width': width, 'height': height, 'count': count, - 'percentage': round((count / total_episodes) * 100, 1), + 'percentage': round((count / data.total_episodes) * 100, 1), 'label': self.__get_resolution_label(width, height), } - for (width, height), count in resolution_counts.most_common() + for (width, height), count in data.resolution_counts.most_common() ] files_details = [ @@ -246,41 +250,41 @@ def __build_analysis_payload( # pylint: disable=too-many-arguments 'width': v['width'], 'height': 
v['height'], 'label': self.__get_resolution_label(v['width'], v['height']), - 'needs_upscaling': (v['width'] * v['height']) < target_pixels, + 'needs_upscaling': (v['width'] * v['height']) < data.target_pixels, 'field_order': v['field_order'], 'needs_deinterlace': v['needs_deinterlace'], 'metadata_match': v['metadata_match'], 'idet_stats': v['idet_stats'], } - for v in sorted(video_info, key=lambda x: x['filename']) + for v in sorted(data.video_info, key=lambda x: x['filename']) ] return { 'analysis_date': datetime.now().isoformat(), 'series_name': context.series_name, 'target_resolution': { - 'width': target_width, - 'height': target_height, - 'label': self.__get_resolution_label(target_width, target_height), + 'width': data.target_width, + 'height': data.target_height, + 'label': self.__get_resolution_label(data.target_width, data.target_height), }, 'source_resolutions': source_resolutions, - 'total_files': total_episodes, + 'total_files': data.total_episodes, 'upscaling_required': { - 'count': upscaling_count, - 'percentage': round(upscaling_pct, 1), + 'count': data.upscaling_count, + 'percentage': round(data.upscaling_pct, 1), }, 'interlacing_analysis': { 'progressive': { - 'count': progressive_count, - 'percentage': round((progressive_count / total_episodes) * 100, 1), + 'count': data.progressive_count, + 'percentage': round((data.progressive_count / data.total_episodes) * 100, 1), }, 'interlaced': { - 'count': needs_deinterlace_count, - 'percentage': round((needs_deinterlace_count / total_episodes) * 100, 1), + 'count': data.needs_deinterlace_count, + 'percentage': round((data.needs_deinterlace_count / data.total_episodes) * 100, 1), }, 'metadata_mismatches': { - 'count': metadata_mismatch_count, - 'percentage': round((metadata_mismatch_count / total_episodes) * 100, 1), + 'count': data.metadata_mismatch_count, + 'percentage': round((data.metadata_mismatch_count / data.total_episodes) * 100, 1), }, }, 'files': files_details, diff --git 
a/preprocessor/steps/audio/separation_step.py b/preprocessor/steps/audio/separation_step.py index 267e4215a..d1cb8061b 100644 --- a/preprocessor/steps/audio/separation_step.py +++ b/preprocessor/steps/audio/separation_step.py @@ -20,6 +20,7 @@ from preprocessor.core.artifacts import TranscriptionData from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.temp_files import StepTempFile from preprocessor.services.io.files import FileOperations from preprocessor.services.transcription.sound_classification import ( classify_segment, @@ -285,18 +286,19 @@ def __format_srt_time(seconds: float) -> str: @staticmethod def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: - with open(srt_path, 'w', encoding='utf-8') as f: - for idx, seg in enumerate(segments, 1): - start = seg.get('start', 0) - end = seg.get('end', 0) - text = seg.get('text', '').strip() + with StepTempFile(srt_path) as temp_path: + with open(temp_path, 'w', encoding='utf-8') as f: + for idx, seg in enumerate(segments, 1): + start = seg.get('start', 0) + end = seg.get('end', 0) + text = seg.get('text', '').strip() - start_time = SoundSeparationStep.__format_srt_time(start) - end_time = SoundSeparationStep.__format_srt_time(end) + start_time = SoundSeparationStep.__format_srt_time(start) + end_time = SoundSeparationStep.__format_srt_time(end) - f.write(f'{idx}\n') - f.write(f'{start_time} --> {end_time}\n') - f.write(f'{text}\n\n') + f.write(f'{idx}\n') + f.write(f'{start_time} --> {end_time}\n') + f.write(f'{text}\n\n') @staticmethod def __generate_txt_file(json_path: Path, txt_path: Path) -> None: @@ -313,12 +315,9 @@ def __generate_txt_file(json_path: Path, txt_path: Path) -> None: if text: text_lines.append(text) - with open(txt_path, 'w', encoding='utf-8') as f: - f.write(' '.join(text_lines)) - - @staticmethod - def __is_sound_event_text(text: str) -> bool: # pylint: disable=unused-private-member - 
return bool(re.match(r'^\(.*\)$', text.strip())) + with StepTempFile(txt_path) as temp_path: + with open(temp_path, 'w', encoding='utf-8') as f: + f.write(' '.join(text_lines)) @staticmethod def __renumber_segments(segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index cb9a7a85c..219fbea92 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -8,9 +8,19 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig]): + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.zip", + subdir="archives", + min_size_bytes=1024*100, + ), + ] + @property def name(self) -> str: return 'archive_generation' @@ -40,12 +50,17 @@ def execute( context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_archive_artifact(input_data, output_path) - @staticmethod def __resolve_output_path( - input_data: ProcessedEpisode, context: ExecutionContext, + self, input_data: ProcessedEpisode, context: ExecutionContext, ) -> Path: - output_filename: str = f'{context.series_name}_{input_data.episode_info.episode_code()}_archive.zip' - return context.get_output_path(input_data.episode_info, 'archives', output_filename) + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + }, + ) @staticmethod def __construct_archive_artifact( diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index 0ec3069b7..f2104afa3 100644 --- 
a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -4,6 +4,7 @@ Tuple, ) +from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.step_configs import CharacterReferenceConfig from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep @@ -25,7 +26,7 @@ def is_global(self) -> bool: def execute( self, input_data: SourceVideo, context: ExecutionContext, ) -> Optional[SourceVideo]: - characters_path, output_dir = self.__resolve_paths() + characters_path, output_dir = self.__resolve_paths(context) self.__validate_characters_file(characters_path) if self.__should_skip_processing(output_dir, context): @@ -36,9 +37,10 @@ def execute( return input_data - def __resolve_paths(self) -> Tuple[Path, Path]: - characters_path = Path(self.config.characters_file) - output_dir = Path(self.config.output_dir) + def __resolve_paths(self, context: ExecutionContext) -> Tuple[Path, Path]: + base_dir = get_base_output_dir(context.series_name) + characters_path = base_dir / f"{context.series_name}_characters.json" + output_dir = base_dir / "character_faces" return characters_path, output_dir def __download_character_references( diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 5defd8475..9fd4cd5b1 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -14,10 +14,21 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.core.temp_files import StepTempFile from preprocessor.services.io.files import FileOperations class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): + def get_output_descriptors(self) -> 
List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.ndjson", + subdir="elastic_documents", + min_size_bytes=100, + ), + ] + @property def name(self) -> str: return 'document_generation' @@ -85,20 +96,21 @@ def __write_segments_to_jsonl( video_bot_path: str, ) -> int: count = 0 - with open(output_path, 'w', encoding='utf-8') as f: - for i, segment in enumerate(segments): - doc = { - 'episode_id': episode_info.episode_code(), - 'episode_metadata': episode_metadata, - 'segment_id': i, - 'text': segment.get('text', '').strip(), - 'start_time': segment.get('start', 0.0), - 'end_time': segment.get('end', 0.0), - 'speaker': segment.get('speaker', 'unknown'), - 'video_path': video_bot_path, - } - f.write(json.dumps(doc, ensure_ascii=False) + '\n') - count += 1 + with StepTempFile(output_path) as temp_path: + with open(temp_path, 'w', encoding='utf-8') as f: + for i, segment in enumerate(segments): + doc = { + 'episode_id': episode_info.episode_code(), + 'episode_metadata': episode_metadata, + 'segment_id': i, + 'text': segment.get('text', '').strip(), + 'start_time': segment.get('start', 0.0), + 'end_time': segment.get('end', 0.0), + 'speaker': segment.get('speaker', 'unknown'), + 'video_path': video_bot_path, + } + f.write(json.dumps(doc, ensure_ascii=False) + '\n') + count += 1 return count @staticmethod @@ -110,15 +122,25 @@ def __extract_episode_info(input_data: Artifact) -> Tuple[Any, str]: episode_id = getattr(input_data, 'episode_id') return episode_info, episode_id - @staticmethod - def __resolve_output_dir(episode_info: Any, context: ExecutionContext) -> Path: - return context.get_output_path(episode_info, 'elastic_documents', '') - - @staticmethod - def __resolve_segments_output_path(episode_info: Any, context: ExecutionContext) -> Path: - output_filename = f'{context.series_name}_{episode_info.episode_code()}_text_segments.jsonl' - return context.get_output_path( - episode_info, 'elastic_documents/text_segments', output_filename, + def 
__resolve_output_dir(self, episode_info: Any, context: ExecutionContext) -> Path: + output_path = self._resolve_output_path( + 0, + context, + { + 'season': episode_info.season_code(), + 'episode': episode_info.episode_code(), + }, + ) + return output_path.parent + + def __resolve_segments_output_path(self, episode_info: Any, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': episode_info.season_code(), + 'episode': episode_info.episode_code(), + }, ) @staticmethod diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py index 6d31c6655..b44dbc21d 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -13,6 +13,10 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) from preprocessor.services.io.files import FileOperations from preprocessor.services.text.text_statistics import TextStatistics @@ -26,6 +30,15 @@ def name(self) -> str: def supports_batch_processing(self) -> bool: return True + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="text_analysis", + min_size_bytes=50, + ), + ] + def execute_batch( self, input_data: List[TranscriptionData], context: ExecutionContext, ) -> List[TextAnalysisResults]: @@ -36,7 +49,7 @@ def execute_batch( def execute( self, input_data: TranscriptionData, context: ExecutionContext, ) -> TextAnalysisResults: - output_path = self.__resolve_output_path(input_data) + output_path = self.__resolve_output_path(input_data, context) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): return self.__load_cached_result(output_path, input_data) @@ -72,10 +85,17 @@ def __build_result_payload( **stats.to_dict(), } - @staticmethod - def 
__resolve_output_path(input_data: TranscriptionData) -> Path: - output_filename = input_data.path.stem + '_text_stats.json' - return input_data.path.parent / output_filename + def __resolve_output_path( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + }, + ) @staticmethod def __resolve_text_file_path(input_data: TranscriptionData) -> Path: diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index 87d3af87f..cded2e49b 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -15,16 +15,27 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput from preprocessor.services.io.files import FileOperations from preprocessor.services.io.metadata import MetadataBuilder from preprocessor.services.search.embedding_model import EmbeddingModelWrapper +# pylint: disable=duplicate-code # Pattern shared with vision/embeddings_step - different data types (text vs frames) class TextEmbeddingStep(PipelineStep[TranscriptionData, EmbeddingCollection, TextEmbeddingConfig]): def __init__(self, config: TextEmbeddingConfig) -> None: super().__init__(config) self.__model: Optional[EmbeddingModelWrapper] = None + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.npy", + subdir="embeddings/text", + min_size_bytes=1024, + ), + ] + @property def name(self) -> str: return 'text_embedding' @@ -61,7 +72,11 @@ def execute( input_data: TranscriptionData, context: ExecutionContext, ) -> EmbeddingCollection: - output_path = self.__resolve_output_path(input_data, context) + output_path = self.__resolve_output_path( + context, + 
input_data.episode_info.season, + input_data.episode_info.episode, + ) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached text embeddings'): return self.__load_cached_result(output_path, input_data) @@ -83,7 +98,11 @@ def execute( def __execute_single( self, input_data: TranscriptionData, context: ExecutionContext, ) -> EmbeddingCollection: - output_path = self.__resolve_output_path(input_data, context) + output_path = self.__resolve_output_path( + context, + input_data.episode_info.season, + input_data.episode_info.episode, + ) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached text embeddings'): return self.__load_cached_result(output_path, input_data) @@ -196,11 +215,17 @@ def __construct_embedding_collection( embedding_type='text', ) - @staticmethod - def __resolve_output_path(input_data: TranscriptionData, context: ExecutionContext) -> Path: - episode_code = input_data.episode_info.episode_code() - output_filename: str = f'{context.series_name}_{episode_code}_embeddings_text.json' - return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) + def __resolve_output_path( + self, + context: ExecutionContext, + season: int, + episode: int, + ) -> Path: + return self._resolve_output_path( + 0, + context, + {"season": season, "episode": episode}, + ) @staticmethod def __load_cached_result( diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 335014493..947a48bd5 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -13,6 +13,7 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import JsonFileOutput from preprocessor.services.episodes.episode_manager import EpisodeManager from preprocessor.services.io.files import FileOperations from 
preprocessor.services.transcription.whisper import Whisper @@ -23,6 +24,15 @@ def __init__(self, config: WhisperTranscriptionConfig) -> None: super().__init__(config) self.__whisper: Optional[Whisper] = None + def get_output_descriptors(self) -> List[JsonFileOutput]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="transcriptions", + min_size_bytes=50, + ), + ] + @property def name(self) -> str: return 'transcription' @@ -60,7 +70,11 @@ def cleanup(self) -> None: self.__whisper = None def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData: - output_path = self.__resolve_output_path(input_data, context) + output_path = self.__resolve_output_path( + context, + input_data.episode_info.season, + input_data.episode_info.episode, + ) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): return self.__construct_cached_result(output_path, input_data) @@ -79,7 +93,11 @@ def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> Trans def __execute_single( self, input_data: AudioArtifact, context: ExecutionContext, ) -> TranscriptionData: - output_path = self.__resolve_output_path(input_data, context) + output_path = self.__resolve_output_path( + context, + input_data.episode_info.season, + input_data.episode_info.episode, + ) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): return self.__construct_cached_result(output_path, input_data) @@ -149,13 +167,14 @@ def __construct_result_artifact( format='json', ) - @staticmethod - def __resolve_output_path(input_data: AudioArtifact, context: ExecutionContext) -> Path: - output_filename: str = ( - f'{context.series_name}_{input_data.episode_info.episode_code()}.json' - ) - return context.get_output_path( - input_data.episode_info, - 'transcriptions', - f'raw/{output_filename}', + def __resolve_output_path( + self, + context: ExecutionContext, + season: int, 
+ episode: int, + ) -> Path: + return self._resolve_output_path( + 0, + context, + {"season": season, "episode": episode}, ) diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 12bae66e4..beaea3b21 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -21,6 +21,11 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + DirectoryOutput, + create_frames_output, +) +from preprocessor.core.temp_files import StepTempFile from preprocessor.services.io.files import FileOperations from preprocessor.services.video.strategies.strategy_factory import KeyframeStrategyFactory @@ -33,6 +38,9 @@ def __init__(self, config: FrameExportConfig) -> None: self.config.keyframe_strategy, self.config.frames_per_scene, ) + def get_output_descriptors(self) -> List[DirectoryOutput]: + return [create_frames_output()] + @property def name(self) -> str: return 'frame_export' @@ -148,7 +156,10 @@ def __extract_and_save_frame( base_filename = f'{series_name}_{episode_info.episode_code()}' filename = f'{base_filename}_frame_{frame_num:06d}.jpg' - resized.save(episode_dir / filename, quality=90) + final_path = episode_dir / filename + + with StepTempFile(final_path) as temp_path: + resized.save(temp_path, quality=90) def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: target_width = self.config.resolution.width @@ -227,12 +238,19 @@ def __write_metadata( } FileOperations.atomic_write_json(metadata_file, metadata, indent=2) - @staticmethod def __resolve_output_paths( + self, input_data: SceneCollection, context: ExecutionContext, ) -> Tuple[Path, Path]: - episode_dir = context.get_output_path(input_data.episode_info, 'exported_frames', '') + episode_dir = self._resolve_output_path( + 0, + context, + { + 'season': 
input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + }, + ) metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' metadata_file = episode_dir / metadata_filename return episode_dir, metadata_file diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index e05864c5c..3311adac7 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -12,6 +12,10 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) from preprocessor.services.io.files import FileOperations from preprocessor.services.media.scene_detection import TransNetWrapper @@ -30,6 +34,15 @@ def name(self) -> str: def supports_batch_processing(self) -> bool: return True + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="scene_detections", + min_size_bytes=10, + ), + ] + def setup_resources(self, context: ExecutionContext) -> None: if not self.__model_loaded: context.logger.info('Loading TransNetV2 model...') @@ -161,12 +174,16 @@ def __construct_scene_collection( min_scene_len=self.config.min_scene_len, ) - @staticmethod def __resolve_output_path( + self, input_data: TranscodedVideo, context: ExecutionContext, ) -> Path: - output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_scenes.json' - return context.get_output_path( - input_data.episode_info, 'scene_timestamps', output_filename, + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + }, ) diff --git a/preprocessor/steps/video/transcoding_step.py 
b/preprocessor/steps/video/transcoding_step.py index fe95e9d2c..949d321e1 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -15,6 +15,8 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.core.temp_files import StepTempFile from preprocessor.services.media.ffmpeg import FFmpegWrapper from preprocessor.services.media.transcode_params import TranscodeParams @@ -27,6 +29,15 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeCo } __command_logged = False + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.mp4", + subdir="transcoded_videos", + min_size_bytes=1024*1024, + ), + ] + @property def name(self) -> str: return 'video_transcode' @@ -184,18 +195,13 @@ def __compute_audio_bitrate(self, probe: Dict[str, Any], context: ExecutionConte return tgt_a def __execute_ffmpeg_process(self, context: ExecutionContext, params: TranscodeParams, ep_id: str) -> None: - temp = params.output_path.with_suffix('.mp4.tmp') - t_params = replace(params, output_path=temp) - context.mark_step_started(self.name, ep_id, [str(temp)]) - try: - if t_params.log_command: + with StepTempFile(params.output_path) as temp_path: + temp_params = replace(params, output_path=temp_path) + context.mark_step_started(self.name, ep_id, [str(temp_path)]) + + if temp_params.log_command: context.logger.info('=' * 20 + ' FFmpeg ' + '=' * 20) - FFmpegWrapper.transcode(t_params) - temp.replace(params.output_path) - except BaseException: - if temp.exists(): - temp.unlink() - raise + FFmpegWrapper.transcode(temp_params) @staticmethod def __normalize_codec_name(codec: str) -> str: @@ -213,10 +219,15 @@ def __get_codec_efficiency_multiplier(src: str, tgt: str) -> float: 1.0, ) - @staticmethod - def __resolve_output_path(input_data: 
SourceVideo, context: ExecutionContext) -> Path: - filename = f'{context.series_name}_{input_data.episode_info.episode_code()}.mp4' - return context.get_season_output_path(input_data.episode_info, 'transcoded_videos', filename) + def __resolve_output_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + }, + ) def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> TranscodedVideo: return TranscodedVideo( diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index cb33252a5..e81e8c4dd 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -14,11 +14,26 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) from preprocessor.services.characters import FaceDetector from preprocessor.services.io.files import FileOperations class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for character detection step.""" + return [ + JsonFileOutput( + subdir="detections/characters", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def __init__(self, config: CharacterDetectionConfig) -> None: super().__init__(config) self.__face_app = None @@ -163,12 +178,14 @@ def __save_detection_results( } FileOperations.atomic_write_json(output_path, output_data) - @staticmethod - def __resolve_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: - filename = 
f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename}_character_detections.json' - return context.get_output_path( - input_data.episode_info, 'character_detections', output_filename, + def __resolve_output_path(self, input_data: FrameCollection, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + }, ) @staticmethod diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index 3b8809f03..af3964440 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -15,16 +15,27 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput from preprocessor.services.io.files import FileOperations from preprocessor.services.io.metadata import MetadataBuilder from preprocessor.services.search.embedding_model import EmbeddingModelWrapper +# pylint: disable=duplicate-code # Pattern shared with text/embeddings_step - different data types (frames vs text) class VideoEmbeddingStep(PipelineStep[FrameCollection, EmbeddingCollection, VideoEmbeddingConfig]): def __init__(self, config: VideoEmbeddingConfig) -> None: super().__init__(config) self.__model: Optional[EmbeddingModelWrapper] = None + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.npy", + subdir="embeddings/vision", + min_size_bytes=1024, + ), + ] + @property def name(self) -> str: return 'video_embedding' @@ -58,7 +69,11 @@ def cleanup(self) -> None: def execute( self, input_data: FrameCollection, context: ExecutionContext, ) -> EmbeddingCollection: - output_path = self.__resolve_output_path(input_data, context) + output_path = self.__resolve_output_path( + 
context, + input_data.episode_info.season, + input_data.episode_info.episode, + ) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached video embeddings'): return self.__load_cached_result(output_path, input_data) @@ -83,7 +98,11 @@ def execute( def __execute_single( self, input_data: FrameCollection, context: ExecutionContext, ) -> EmbeddingCollection: - output_path = self.__resolve_output_path(input_data, context) + output_path = self.__resolve_output_path( + context, + input_data.episode_info.season, + input_data.episode_info.episode, + ) if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached video embeddings'): return self.__load_cached_result(output_path, input_data) @@ -145,12 +164,14 @@ def __load_cached_result( len(emb_data.get('video_embeddings', [])), ) - def __construct_embedding_collection( # pylint: disable=duplicate-code + def __construct_embedding_collection( self, input_data: FrameCollection, output_path: Path, embedding_count: int, ) -> EmbeddingCollection: + # Similar pattern exists in text/embeddings_step.py but with different input type (FrameCollection vs TranscriptionData) + # and embedding_type ('video' vs 'text'). Not truly duplicated - both use the same MetadataBuilder method. 
return MetadataBuilder.create_embedding_collection( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -181,11 +202,17 @@ def __save_embedding_results( ) FileOperations.atomic_write_json(output_path, output_data) - @staticmethod - def __resolve_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: - filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename_base}_embeddings_video.json' - return context.get_output_path(input_data.episode_info, 'embeddings', output_filename) + def __resolve_output_path( + self, + context: ExecutionContext, + season: int, + episode: int, + ) -> Path: + return self._resolve_output_path( + 0, + context, + {"season": season, "episode": episode}, + ) @staticmethod def __extract_frame_requests( diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index 618829de4..c77813f6e 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -18,11 +18,37 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) from preprocessor.services.io.files import FileOperations from preprocessor.services.video.emotion_utils import EmotionDetector class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]): + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for emotion detection step.""" + return [ + JsonFileOutput( + subdir="detections/emotions", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def __resolve_output_path(self, input_data: FrameCollection, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': 
f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + }, + ) + + + def __init__(self, config: EmotionDetectionConfig) -> None: super().__init__(config) self.__model: Optional[HSEmotionRecognizer] = None @@ -58,50 +84,52 @@ def teardown_resources(self, context: ExecutionContext) -> None: def __execute_single( self, input_data: FrameCollection, context: ExecutionContext, ) -> EmotionData: - detections_path = self.__resolve_detections_path(input_data, context) + input_path = self.__resolve_input_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) - if self._check_cache_validity(detections_path, context, input_data.episode_id, 'cached emotion detection'): - return self.__construct_emotion_data(input_data, detections_path) + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached emotion detection'): + return self.__construct_emotion_data(input_data, output_path) - if not detections_path.exists(): + if not input_path.exists(): context.logger.warning( - f'No character detections found for emotion analysis: {detections_path}', + f'No character detections found for emotion analysis: {input_path}', ) - return self.__construct_emotion_data(input_data, detections_path) + return self.__construct_emotion_data(input_data, output_path) context.logger.info(f'Detecting emotions for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) - detections_data = FileOperations.load_json(detections_path) + detections_data = FileOperations.load_json(input_path) self.__process_and_update_emotions(detections_data, input_data, context) - FileOperations.atomic_write_json(detections_path, detections_data) + FileOperations.atomic_write_json(output_path, detections_data) context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_emotion_data(input_data, detections_path) + return self.__construct_emotion_data(input_data, output_path) 
def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData: - detections_path = self.__resolve_detections_path(input_data, context) + input_path = self.__resolve_input_path(input_data, context) + output_path = self.__resolve_output_path(input_data, context) - if self._check_cache_validity(detections_path, context, input_data.episode_id, 'cached emotion detection'): - return self.__construct_emotion_data(input_data, detections_path) + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached emotion detection'): + return self.__construct_emotion_data(input_data, output_path) - if not detections_path.exists(): + if not input_path.exists(): context.logger.warning( - f'No character detections found for emotion analysis: {detections_path}', + f'No character detections found for emotion analysis: {input_path}', ) - return self.__construct_emotion_data(input_data, detections_path) + return self.__construct_emotion_data(input_data, output_path) context.logger.info(f'Detecting emotions for {input_data.episode_id}') context.mark_step_started(self.name, input_data.episode_id) self.__prepare_emotion_model(context) - detections_data = FileOperations.load_json(detections_path) + detections_data = FileOperations.load_json(input_path) self.__process_and_update_emotions(detections_data, input_data, context) - FileOperations.atomic_write_json(detections_path, detections_data) + FileOperations.atomic_write_json(output_path, detections_data) context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_emotion_data(input_data, detections_path) + return self.__construct_emotion_data(input_data, output_path) def __prepare_emotion_model(self, context: ExecutionContext) -> None: if self.__model is None: @@ -130,14 +158,17 @@ def __process_and_update_emotions( self.__apply_emotion_results(detections, emotion_results, face_metadata, context) - @staticmethod - def __resolve_detections_path( - input_data: 
FrameCollection, context: ExecutionContext, + def __resolve_input_path( + self, input_data: FrameCollection, context: ExecutionContext, ) -> Path: - filename = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename}_character_detections.json' - return context.get_output_path( - input_data.episode_info, 'character_detections', output_filename, + season_code = f'S{input_data.episode_info.season:02d}' + episode_code = input_data.episode_info.episode_code() + return ( + context.base_output_dir + / 'detections' + / 'characters' + / season_code + / f'{episode_code}.json' ) @staticmethod diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py index 1559957fd..45216e0c0 100644 --- a/preprocessor/steps/vision/face_clustering_step.py +++ b/preprocessor/steps/vision/face_clustering_step.py @@ -8,9 +8,24 @@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for face clustering step.""" + return [ + JsonFileOutput( + subdir="clusters/faces", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def __init__(self, config: FaceClusteringConfig) -> None: super().__init__(config) self.__model = None @@ -48,8 +63,7 @@ def __execute_single( """Execute single episode (batch processing variant without lazy loading).""" output_path = self.__resolve_output_path(input_data, context) - if self.__is_execution_cached(output_path, input_data.episode_id, context): - context.logger.info(f'Skipping {input_data.episode_id} (cached face clustering)') + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached face 
clustering'): return self.__construct_cluster_data(input_data, output_path) context.logger.info(f'Clustering faces for {input_data.episode_id}') @@ -63,8 +77,7 @@ def execute( ) -> ClusterData: output_path = self.__resolve_output_path(input_data, context) - if self.__is_execution_cached(output_path, input_data.episode_id, context): - context.logger.info(f'Skipping {input_data.episode_id} (cached face clustering)') + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached face clustering'): return self.__construct_cluster_data(input_data, output_path) context.logger.info(f'Clustering faces for {input_data.episode_id}') @@ -73,22 +86,16 @@ def execute( context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_cluster_data(input_data, output_path) - def __is_execution_cached( - self, output_path: Path, episode_id: str, context: ExecutionContext, - ) -> bool: - if not output_path.exists(): - return False - if context.force_rerun: - return False - return context.is_step_completed(self.name, episode_id) - - @staticmethod def __resolve_output_path( - input_data: FrameCollection, context: ExecutionContext, + self, input_data: FrameCollection, context: ExecutionContext, ) -> Path: - output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_clusters.json' - return context.get_output_path( - input_data.episode_info, 'face_clusters', output_filename, + return self._resolve_output_path( + 0, + context, + { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + }, ) @staticmethod diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index 895733ebd..a20ee0609 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -1,4 +1,3 @@ -# pylint: disable=cyclic-import import gc from pathlib import Path from typing import ( @@ -18,12 +17,27 
@@ ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) from preprocessor.services.io.files import FileOperations from preprocessor.services.video.frame_utils import FrameLoader from preprocessor.services.video.image_hasher import PerceptualHasher class ImageHashStep(PipelineStep[FrameCollection, ImageHashCollection, ImageHashConfig]): + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for image hashing step.""" + return [ + JsonFileOutput( + subdir="hashes", + pattern="{season}/{episode}.json", + min_size_bytes=50, + ), + ] + + def __init__(self, config: ImageHashConfig) -> None: super().__init__(config) self.__hasher: Optional[PerceptualHasher] = None @@ -108,11 +122,15 @@ def __compute_hashes( return hash_results - @staticmethod - def __resolve_output_path(input_data: FrameCollection, context: ExecutionContext) -> Path: - filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' - output_filename: str = f'{filename_base}_image_hashes.json' - return context.get_output_path(input_data.episode_info, 'image_hashes', output_filename) + def __resolve_output_path(self, input_data: FrameCollection, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + }, + ) @staticmethod def __load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: diff --git a/preprocessor/steps/vision/object_detection_step.py b/preprocessor/steps/vision/object_detection_step.py index b88d6c9a5..f7fc3c757 100644 --- a/preprocessor/steps/vision/object_detection_step.py +++ b/preprocessor/steps/vision/object_detection_step.py @@ -8,9 +8,24 @@ ) from preprocessor.core.base_step import PipelineStep from 
preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) class ObjectDetectionStep(PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig]): + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for object detection step.""" + return [ + JsonFileOutput( + subdir="detections/objects", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def __init__(self, config: ObjectDetectionConfig) -> None: super().__init__(config) self.__model = None @@ -47,8 +62,7 @@ def __execute_single( ) -> ObjectDetectionData: output_path = self.__resolve_output_path(input_data, context) - if self.__is_execution_cached(output_path, input_data.episode_id, context): - context.logger.info(f'Skipping {input_data.episode_id} (cached object detection)') + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached object detection'): return self.__construct_object_data(input_data, output_path) context.logger.info(f'Detecting objects for {input_data.episode_id}') @@ -62,8 +76,7 @@ def execute( ) -> ObjectDetectionData: output_path = self.__resolve_output_path(input_data, context) - if self.__is_execution_cached(output_path, input_data.episode_id, context): - context.logger.info(f'Skipping {input_data.episode_id} (cached object detection)') + if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached object detection'): return self.__construct_object_data(input_data, output_path) context.logger.info(f'Detecting objects for {input_data.episode_id}') @@ -72,22 +85,16 @@ def execute( context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_object_data(input_data, output_path) - def __is_execution_cached( - self, output_path: Path, episode_id: str, context: ExecutionContext, - ) -> bool: - if not output_path.exists(): - return False - if context.force_rerun: - 
return False - return context.is_step_completed(self.name, episode_id) - - @staticmethod def __resolve_output_path( - input_data: FrameCollection, context: ExecutionContext, + self, input_data: FrameCollection, context: ExecutionContext, ) -> Path: - output_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_objects.json' - return context.get_output_path( - input_data.episode_info, 'object_detections', output_filename, + return self._resolve_output_path( + 0, + context, + { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + }, ) @staticmethod From 56e17044af19fd7b95da6843da760a12f1484eaa Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 07:49:12 +0100 Subject: [PATCH 39/89] Set default frame export to 1 frame & 1080p Reduce the default frames_per_scene from 3 to 1 and bump the default FrameExportConfig resolution from R720P to R1080P. Also update DefaultConfigFactory to use the new frames_per_scene and explicitly pass Resolution.R1080P, and add the required Resolution import in step_defaults. 
--- preprocessor/config/step_configs.py | 4 ++-- preprocessor/config/step_defaults.py | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index d522c0c02..5cdbfd716 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -69,10 +69,10 @@ class SceneDetectionConfig(BaseModel): class FrameExportConfig(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - frames_per_scene: int = Field(default=3, ge=1) + frames_per_scene: int = Field(default=1, ge=1) keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES max_parallel_episodes: int = Field(default=4, ge=1, le=8) - resolution: Resolution = Field(default=Resolution.R720P) + resolution: Resolution = Field(default=Resolution.R1080P) class TranscriptionConfig(BaseModel): diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index 0563f5da3..aef97264b 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -18,6 +18,7 @@ VideoEmbeddingConfig, WhisperTranscriptionConfig, ) +from preprocessor.services.media.resolution import Resolution class DefaultConfigFactory: @@ -40,7 +41,7 @@ def get_configs(series_name: str) -> Dict[str, object]: 'separate_sounds': SoundSeparationConfig(), 'analyze_text': TextAnalysisConfig(language='pl'), 'detect_scenes': SceneDetectionConfig(threshold=0.5, min_scene_len=10), - 'export_frames': FrameExportConfig(frames_per_scene=3), + 'export_frames': FrameExportConfig(frames_per_scene=1, resolution=Resolution.R1080P), 'text_embeddings': TextEmbeddingConfig( model_name='Qwen/Qwen3-VL-Embedding-8B', batch_size=8, From 937c218e66ba71c9e6923373e0fc8446ef46341f Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 07:58:18 +0100 Subject: [PATCH 40/89] Register object_detections earlier in pipeline Move the registration of 
object_detections to occur before character_detections (rather than after face_clusters). This adjusts the pipeline registration order to avoid potential ordering/dependency issues between detection components; no other functional changes were made. --- preprocessor/app/pipeline_factory.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index b41b86229..2e1b1b409 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -497,10 +497,10 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(image_hashes) pipeline.register(video_embeddings) + pipeline.register(object_detections) pipeline.register(character_detections) pipeline.register(emotion_data) pipeline.register(face_clusters) - pipeline.register(object_detections) pipeline.register(elastic_documents) pipeline.register(episode_archives) From aa146d6a4d2e9cb139b8f6fc6b31578640476620 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 12:06:07 +0100 Subject: [PATCH 41/89] Refactor PipelineStep flow and caching Restructure the step execution & caching model and update callers accordingly. - Introduces a managed PipelineStep lifecycle: automatic .name, abstract _process(), and a centralized execute flow that handles caching, cache path resolution, restore via _load_from_cache, and state marking. Adds helpers: _get_cache_path, _get_output_descriptors, _get_standard_cache_path, and descriptor validation. - Changes execute/execute_batch behavior: steps can opt into batch processing and caching via new methods/properties; many concrete steps updated to implement _process/_get_cache_path/_load_from_cache or to disable caching when appropriate. 
- PipelineExecutor: add __should_skip_global_step and simplify batch processing logic (uses step-managed caching instead of pre-skip), improved logging and worker reporting. - Adds core models package and AnalysisData dataclass used by resolution analysis. - Adds CharacterMetadata and EpisodeMetadata artifacts. - Updates multiple steps/services to the new API (audio extraction, scraping base, transcription import, resolution analysis, sound separation, archive generation, character reference, document generation, and others), including bugfixes and small formatting/regex cleanups. This refactor centralizes caching and state handling in the base step, simplifies executor logic, and standardizes how steps expose names and outputs. --- preprocessor/app/pipeline_builder.py | 58 ++-- preprocessor/core/artifacts.py | 11 + preprocessor/core/base_step.py | 242 ++++++++------ preprocessor/core/models/__init__.py | 3 + preprocessor/core/models/analysis_models.py | 22 ++ preprocessor/services/audio/extraction.py | 5 +- .../services/scraping/base_scraper_step.py | 3 + preprocessor/services/text/import_step.py | 5 +- .../analysis/resolution_analysis_step.py | 306 +++++++++--------- preprocessor/steps/audio/separation_step.py | 214 ++++++------ preprocessor/steps/packaging/archives_step.py | 64 ++-- .../steps/scraping/episode_scraper_step.py | 13 +- .../scraping/reference_processor_step.py | 44 +-- .../steps/search/document_generation_step.py | 200 +++++++----- preprocessor/steps/search/indexing_step.py | 77 +++-- preprocessor/steps/text/analysis_step.py | 88 ++--- preprocessor/steps/text/embeddings_step.py | 165 +++++----- preprocessor/steps/text/transcription_step.py | 163 ++++------ .../steps/validation/validator_step.py | 6 +- preprocessor/steps/video/frame_export_step.py | 190 ++++++----- .../steps/video/scene_detection_step.py | 128 +++----- preprocessor/steps/video/transcoding_step.py | 178 ++++++---- .../steps/vision/character_detection_step.py | 72 ++--- 
preprocessor/steps/vision/embeddings_step.py | 233 ++++++------- .../steps/vision/emotion_detection_step.py | 108 +++---- .../steps/vision/face_clustering_step.py | 104 +++--- .../steps/vision/image_hashing_step.py | 108 ++++--- .../steps/vision/object_detection_step.py | 106 +++--- 28 files changed, 1467 insertions(+), 1449 deletions(-) create mode 100644 preprocessor/core/models/__init__.py create mode 100644 preprocessor/core/models/analysis_models.py diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index f9d976774..fb201e572 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -92,7 +92,7 @@ def run(self, source_path: Path, episode_manager: EpisodeManager) -> None: def __run_global_step(self, step: PipelineStep) -> None: self.__context.logger.info(f"=== Running Global Step: {step.name} ===") - if self.__should_skip_step(step.name, 'all'): + if self.__should_skip_global_step(step.name): self.__context.logger.info(f"Skipping {step.name} (already completed)") return @@ -104,6 +104,15 @@ def __run_global_step(self, step: PipelineStep) -> None: self.__context.logger.error(f"Global step {step.name} failed: {e}") raise + def __should_skip_global_step(self, step_name: str) -> bool: + if self.__context.force_rerun: + return False + + if self.__context.state_manager is None: + return False + + return self.__context.state_manager.is_step_completed(step_name, 'all') + def __run_episode_step( self, step: PipelineStep, current_artifacts: List[Any], ) -> List[Any]: @@ -141,13 +150,6 @@ def __run_episode_step_sequential( for artifact in current_artifacts: episode_id = artifact.episode_id - if self.__should_skip_step(step.name, episode_id): - self.__context.logger.info( - f"Skipping {step.name} for {episode_id} (already completed)", - ) - next_artifacts.append(artifact) - continue - try: self.__mark_step_in_progress(step.name, episode_id) result = step.execute(artifact, self.__context) @@ -168,36 
+170,29 @@ def __run_episode_step_sequential( def __run_episode_step_batch( self, step: PipelineStep, current_artifacts: List[Any], ) -> List[Any]: - artifacts_to_process = [] - next_artifacts = [] - - for artifact in current_artifacts: - episode_id = artifact.episode_id - if self.__should_skip_step(step.name, episode_id): - self.__context.logger.info( - f"Skipping {step.name} for {episode_id} (already completed)", - ) - next_artifacts.append(artifact) - else: - artifacts_to_process.append(artifact) - - if not artifacts_to_process: - return next_artifacts + if not current_artifacts: + return [] + workers = ( + step.config.max_parallel_episodes + if hasattr(step.config, 'max_parallel_episodes') + else 'N' + ) self.__context.logger.info( - f"Processing {len(artifacts_to_process)} episodes with batch processing", + f"Batch processing {len(current_artifacts)} episodes with {workers} workers", ) try: if hasattr(step, 'setup_resources'): step.setup_resources(self.__context) - for artifact in artifacts_to_process: + for artifact in current_artifacts: self.__mark_step_in_progress(step.name, artifact.episode_id) - results = step.execute_batch(artifacts_to_process, self.__context) + results = step.execute_batch(current_artifacts, self.__context) - for artifact, result in zip(artifacts_to_process, results): + next_artifacts = [] + for artifact, result in zip(current_artifacts, results): self.__mark_step_completed(step.name, artifact.episode_id) next_artifacts.append(result or artifact) @@ -216,12 +211,3 @@ def __mark_step_in_progress(self, step_name: str, episode_id: str) -> None: if self.__context.state_manager is None: return self.__context.state_manager.mark_step_started(step_name, episode_id) - - def __should_skip_step(self, step_name: str, episode_id: str) -> bool: - if self.__context.force_rerun: - return False - - if self.__context.state_manager is None: - return False - - return self.__context.state_manager.is_step_completed(step_name, episode_id) diff --git 
a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index 2fb759826..ed19b122e 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -129,6 +129,17 @@ class ArchiveArtifact(EpisodeArtifact): path: Path +@dataclass(frozen=True) +class CharacterMetadata(EpisodeArtifact): + path: Path + character_count: int + + +@dataclass(frozen=True) +class EpisodeMetadata(EpisodeArtifact): + path: Path + + @dataclass(frozen=True) class ValidationResult(Artifact): season: str diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 78e017575..1a7a2f6a5 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -9,6 +9,7 @@ as_completed, ) from pathlib import Path +import re from typing import ( Callable, Dict, @@ -33,137 +34,115 @@ class PipelineStep(ABC, Generic[InputT, OutputT, ConfigT]): def __init__(self, config: ConfigT) -> None: self.__config: ConfigT = config + @property + def name(self) -> str: + class_name = self.__class__.__name__ + if class_name.endswith('Step'): + class_name = class_name[:-4] + + snake_case = re.sub(r'(?<!^)(?=[A-Z])', '_', class_name).lower() + return snake_case + @property def config(self) -> ConfigT: return self.__config @property - @abstractmethod - def name(self) -> str: - pass + def is_global(self) -> bool: + return False @property - def is_global(self) -> bool: + def uses_caching(self) -> bool: + return True + + @property + def supports_batch_processing(self) -> bool: return False - def get_output_descriptors(self) -> List[OutputDescriptor]: - """ - Override in subclass to define step outputs. - Used for automatic output validation and path resolution. - """ - return [] + def execute(self, input_data: InputT, context: ExecutionContext) -> OutputT: + if not self.uses_caching: + return self._process(input_data, context) - def _resolve_output_path( - self, - descriptor_index: int, - context: ExecutionContext, - context_vars: Optional[Dict[str, str]] = None, - ) -> Path: - """ - Resolve output path from OutputDescriptor at given index. 
- Eliminates hardcoded subdirectories - uses descriptor definition. - """ - descriptors = self.get_output_descriptors() - if not descriptors or descriptor_index >= len(descriptors): - raise ValueError( - f'Step {self.name} has no output descriptor at index {descriptor_index}', - ) + return self.__execute_managed_flow(input_data, context) - descriptor = descriptors[descriptor_index] - return descriptor.resolve_path(context.base_output_dir, context_vars) + def execute_batch( + self, input_data: List[InputT], context: ExecutionContext, + ) -> List[OutputT]: + return [self.execute(item, context) for item in input_data] def should_skip_execution( - self, episode_id: str, context: ExecutionContext, context_vars: Optional[Dict[str, str]] = None, + self, + episode_id: str, + context: ExecutionContext, + context_vars: Optional[Dict[str, str]] = None, ) -> bool: - """ - Default caching logic - checks state manager and output validity. - Subclasses can call this at the start of execute() to skip if already done. 
- """ if context.force_rerun: return False if not context.is_step_completed(self.name, episode_id): return False - descriptors = self.get_output_descriptors() - if not descriptors: - return True - - for descriptor in descriptors: - result = descriptor.validate(context.base_output_dir, context_vars) - if not result.is_valid: - context.logger.warning( - f'{episode_id} - output invalid: {result.message}', - ) - return False - - return True - - @abstractmethod - def execute(self, input_data: InputT, context: ExecutionContext) -> OutputT: - pass - - @property - def supports_batch_processing(self) -> bool: - return False + return self.__validate_all_descriptors(context, context_vars, episode_id) def setup_resources(self, context: ExecutionContext) -> None: pass - def execute_batch( - self, input_data: List[InputT], context: ExecutionContext, - ) -> List[OutputT]: - return [self.execute(item, context) for item in input_data] - def teardown_resources(self, context: ExecutionContext) -> None: pass def cleanup(self) -> None: pass - def _check_cache_validity( - self, - output_path: Path, - context: ExecutionContext, - episode_id: str, - cache_description: str, - ) -> bool: - if output_path.exists() and not context.force_rerun: - if context.is_step_completed(self.name, episode_id): - context.logger.info(f'Skipping {episode_id} ({cache_description})') - return True - return False + @abstractmethod + def _process(self, input_data: InputT, context: ExecutionContext) -> OutputT: + raise NotImplementedError( + f'{self.__class__.__name__} must implement _process()', + ) - def _check_output_validity( - self, - output_descriptor: OutputDescriptor, - context: ExecutionContext, - episode_id: str, - context_vars: Optional[Dict[str, str]] = None, - ) -> bool: - if context.force_rerun: - return False + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [] - if not context.is_step_completed(self.name, episode_id): - return False + def _get_cache_path(self, input_data: 
InputT, context: ExecutionContext) -> Path: + raise NotImplementedError( + f'{self.__class__.__name__} must implement _get_cache_path() when caching is enabled', + ) - validation_result = output_descriptor.validate( - context.base_output_dir, context_vars, + def _load_from_cache( + self, cache_path: Path, input_data: InputT, context: ExecutionContext, + ) -> OutputT: + raise NotImplementedError( + f'{self.__class__.__name__} must implement _load_from_cache() when caching is enabled', ) - if validation_result.is_valid: - context.logger.info( - f'Skipping {episode_id} - output valid ' - f'({validation_result.file_count} files, ' - f'{validation_result.total_size_bytes} bytes)', + def _resolve_output_path( + self, + descriptor_index: int, + context: ExecutionContext, + context_vars: Optional[Dict[str, str]] = None, + ) -> Path: + descriptors = self._get_output_descriptors() + if not descriptors or descriptor_index >= len(descriptors): + raise ValueError( + f'Step {self.name} has no output descriptor at index {descriptor_index}', ) - return True - context.logger.warning( - f'Output invalid for {episode_id}: {validation_result.message}', - ) - return False + descriptor = descriptors[descriptor_index] + return descriptor.resolve_path(context.base_output_dir, context_vars) + def _get_standard_cache_path( + self, + input_data: InputT, + context: ExecutionContext, + descriptor_index: int = 0, + ) -> Path: + return self._resolve_output_path( + descriptor_index, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + }, + ) @staticmethod def _execute_with_threadpool( @@ -214,3 +193,80 @@ def _atomic_write( ) -> None: with StepTempFile(final_path, temp_suffix) as temp_path: write_func(temp_path) + + def __execute_managed_flow( + self, input_data: InputT, context: ExecutionContext, + ) -> OutputT: + cache_path = self._get_cache_path(input_data, context) + + if self.__should_restore_from_cache(cache_path, 
input_data, context): + return self.__restore_result(cache_path, input_data, context) + + return self.__compute_new_result(input_data, context) + + def __should_restore_from_cache( + self, cache_path: Path, input_data: InputT, context: ExecutionContext, + ) -> bool: + return self._check_cache_validity( + cache_path, context, input_data.episode_id, 'cached', + ) + + def __restore_result( + self, cache_path: Path, input_data: InputT, context: ExecutionContext, + ) -> OutputT: + context.logger.info(f'Loading {input_data.episode_id} from cache') + return self._load_from_cache(cache_path, input_data, context) + + def __compute_new_result( + self, input_data: InputT, context: ExecutionContext, + ) -> OutputT: + context.logger.info(f'Processing {input_data.episode_id}') + context.mark_step_started(self.name, input_data.episode_id) + + result = self._process(input_data, context) + + context.mark_step_completed(self.name, input_data.episode_id) + return result + + def _check_cache_validity( + self, + output_path: Path, + context: ExecutionContext, + episode_id: str, + cache_description: str, + ) -> bool: + if output_path.exists() and not context.force_rerun: + if context.is_step_completed(self.name, episode_id): + context.logger.info(f'Skipping {episode_id} ({cache_description})') + return True + return False + + def __validate_all_descriptors( + self, + context: ExecutionContext, + context_vars: Optional[Dict[str, str]], + episode_id: str, + ) -> bool: + descriptors = self._get_output_descriptors() + if not descriptors: + return True + + return all( + self.__validate_single_descriptor(descriptor, context, context_vars, episode_id) + for descriptor in descriptors + ) + + @staticmethod + def __validate_single_descriptor( + descriptor: OutputDescriptor, + context: ExecutionContext, + context_vars: Optional[Dict[str, str]], + episode_id: str, + ) -> bool: + result = descriptor.validate(context.base_output_dir, context_vars) + if not result.is_valid: + context.logger.warning( + 
f'{episode_id} - output invalid: {result.message}', + ) + return False + return True diff --git a/preprocessor/core/models/__init__.py b/preprocessor/core/models/__init__.py new file mode 100644 index 000000000..97147acbb --- /dev/null +++ b/preprocessor/core/models/__init__.py @@ -0,0 +1,3 @@ +from preprocessor.core.models.analysis_models import AnalysisData + +__all__ = ['AnalysisData'] diff --git a/preprocessor/core/models/analysis_models.py b/preprocessor/core/models/analysis_models.py new file mode 100644 index 000000000..09d8f478d --- /dev/null +++ b/preprocessor/core/models/analysis_models.py @@ -0,0 +1,22 @@ +from collections import Counter +from dataclasses import dataclass +from typing import ( + Any, + Dict, + List, +) + + +@dataclass(frozen=True) +class AnalysisData: + video_info: List[Dict[str, Any]] + resolution_counts: Counter + total_episodes: int + target_width: int + target_height: int + target_pixels: int + upscaling_count: int + upscaling_pct: float + progressive_count: int + needs_deinterlace_count: int + metadata_mismatch_count: int diff --git a/preprocessor/services/audio/extraction.py b/preprocessor/services/audio/extraction.py index fe3348050..578fc5c0f 100644 --- a/preprocessor/services/audio/extraction.py +++ b/preprocessor/services/audio/extraction.py @@ -12,9 +12,8 @@ class AudioExtractionStep(PipelineStep[SourceVideo, AudioArtifact, AudioExtractionConfig]): - @property - def name(self) -> str: - return 'audio_extraction' + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> AudioArtifact: + raise NotImplementedError("AudioExtractionStep uses execute() instead of _process()") def execute(self, input_data: SourceVideo, context: ExecutionContext) -> AudioArtifact: output_path = self.__resolve_output_path(input_data, context) diff --git a/preprocessor/services/scraping/base_scraper_step.py b/preprocessor/services/scraping/base_scraper_step.py index ef7b5fab5..01f9ffbd0 100644 --- 
a/preprocessor/services/scraping/base_scraper_step.py +++ b/preprocessor/services/scraping/base_scraper_step.py @@ -26,6 +26,9 @@ class BaseScraperStep(PipelineStep[SourceVideo, SourceVideo, ConfigT], ABC): def is_global(self) -> bool: return True + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> SourceVideo: + raise NotImplementedError("BaseScraperStep uses execute() instead of _process()") + def execute(self, input_data: SourceVideo, context: ExecutionContext) -> Optional[SourceVideo]: output_path = self.__resolve_output_path(context) diff --git a/preprocessor/services/text/import_step.py b/preprocessor/services/text/import_step.py index f926a3af2..ed267e991 100644 --- a/preprocessor/services/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -26,9 +26,8 @@ def __init__(self, config: TranscriptionImportConfig) -> None: super().__init__(config) self.__episode_manager: Optional[EpisodeManager] = None - @property - def name(self) -> str: - return 'transcription_import' + def _process(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: + raise NotImplementedError("TranscriptionImportStep uses execute() instead of _process()") def execute(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: self.__ensure_episode_manager(context) diff --git a/preprocessor/steps/analysis/resolution_analysis_step.py b/preprocessor/steps/analysis/resolution_analysis_step.py index 61c003e7a..5b50113c2 100644 --- a/preprocessor/steps/analysis/resolution_analysis_step.py +++ b/preprocessor/steps/analysis/resolution_analysis_step.py @@ -1,5 +1,4 @@ from collections import Counter -from dataclasses import dataclass from datetime import datetime from pathlib import Path from typing import ( @@ -13,37 +12,23 @@ from preprocessor.core.artifacts import ResolutionAnalysisResult from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext +from 
preprocessor.core.models import AnalysisData from preprocessor.services.io.files import FileOperations from preprocessor.services.io.path_service import PathService from preprocessor.services.media.ffmpeg import FFmpegWrapper -@dataclass(frozen=True) -class _AnalysisData: - video_info: List[Dict[str, Any]] - resolution_counts: Counter - total_episodes: int - target_width: int - target_height: int - target_pixels: int - upscaling_count: int - upscaling_pct: float - progressive_count: int - needs_deinterlace_count: int - metadata_mismatch_count: int - - class ResolutionAnalysisStep(PipelineStep[None, ResolutionAnalysisResult, ResolutionAnalysisConfig]): - @property - def name(self) -> str: - return 'resolution_analysis' - @property def is_global(self) -> bool: return True - def execute( - self, input_data: None, context: ExecutionContext, + @property + def uses_caching(self) -> bool: + return False + + def _process( + self, input_data: None, context: ExecutionContext, ) -> ResolutionAnalysisResult: self.__log_analysis_header(context) @@ -58,31 +43,23 @@ def execute( upscaling_pct = self.__analyze_and_report(video_info, context) self.__save_results_to_json(video_info, upscaling_pct, context) - context.mark_step_completed(self.name, 'all') return ResolutionAnalysisResult( total_files=len(video_info), upscaling_percentage=upscaling_pct, ) - @staticmethod - def __log_analysis_header(context: ExecutionContext) -> None: - context.logger.info('=' * 80) - context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') - context.logger.info('=' * 80) - - def __handle_missing_videos(self, context: ExecutionContext) -> ResolutionAnalysisResult: - context.logger.warning('No video files found - skipping resolution analysis') - context.mark_step_completed(self.name, 'all') - return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) - - def __handle_failed_analysis( - self, video_paths: List[Path], context: ExecutionContext, - ) -> 
ResolutionAnalysisResult: - context.logger.warning('Failed to analyze videos - skipping') - context.mark_step_completed(self.name, 'all') - return ResolutionAnalysisResult(total_files=len(video_paths), upscaling_percentage=0.0) + def __scan_resolutions( + self, video_paths: List[Path], context: ExecutionContext, + ) -> List[Dict[str, Any]]: + results = self._execute_with_threadpool( + video_paths, + context, + self.config.max_parallel_episodes, + self.__scan_single_video, + ) + return [r for r in results if r is not None] def __analyze_and_report( - self, video_info: List[Dict[str, Any]], context: ExecutionContext, + self, video_info: List[Dict[str, Any]], context: ExecutionContext, ) -> float: resolution_counts = Counter((v['width'], v['height']) for v in video_info) total_episodes = len(video_info) @@ -95,103 +72,40 @@ def __analyze_and_report( 1 for v in video_info if (v['width'] * v['height']) < target_pixels ) - upscaling_pct = (upscaling_count / total_episodes) * 100 if total_episodes > 0 else 0 + upscaling_pct = ( + (upscaling_count / total_episodes) * 100 if total_episodes > 0 else 0 + ) needs_deinterlace_count = sum(1 for v in video_info if v['needs_deinterlace']) progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) - metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') + metadata_mismatch_count = sum( + 1 for v in video_info if v['metadata_match'] != 'match' + ) self.__log_resolution_distribution( - context, resolution_counts, total_episodes, target_width, target_height, + context, + resolution_counts, + total_episodes, + target_width, + target_height, ) self.__log_upscaling_warnings(context, upscaling_pct) self.__log_interlacing_analysis( - context, progressive_count, needs_deinterlace_count, total_episodes, + context, + progressive_count, + needs_deinterlace_count, + total_episodes, ) self.__log_metadata_warnings(context, metadata_mismatch_count) context.logger.info('=' * 80) return upscaling_pct 
- def __log_resolution_distribution( - self, - context: ExecutionContext, - resolution_counts: Counter, - total_episodes: int, - target_width: int, - target_height: int, - ) -> None: - context.logger.info('') - context.logger.info('Source Resolution Distribution:') - context.logger.info('-' * 60) - - for (width, height), count in resolution_counts.most_common(): - pct = (count / total_episodes) * 100 - label = self.__get_resolution_label(width, height) - context.logger.info( - f' {width}x{height} ({label}): {count} episodes ({pct:.1f}%)', - ) - - context.logger.info('') - context.logger.info( - f'Target Resolution: {target_width}x{target_height} ' - f'({self.__get_resolution_label(target_width, target_height)})', - ) - - @staticmethod - def __log_upscaling_warnings(context: ExecutionContext, upscaling_pct: float) -> None: - if upscaling_pct > 50: - context.logger.warning('') - context.logger.warning('⚠' * 30) - context.logger.warning( - f'⚠ WARNING: {upscaling_pct:.1f}% of episodes will require UPSCALING!', - ) - context.logger.warning( - '⚠ Upscaling degrades quality. 
Consider using analyze-resolution CLI ' - 'to find optimal target resolution.', - ) - context.logger.warning('⚠' * 30) - elif upscaling_pct > 0: - context.logger.info( - f'Note: {upscaling_pct:.1f}% of episodes will be upscaled ' - '(enhanced quality params will be used)', - ) - - @staticmethod - def __log_interlacing_analysis( - context: ExecutionContext, - progressive_count: int, - needs_deinterlace_count: int, - total_episodes: int, - ) -> None: - context.logger.info('') - context.logger.info('Interlacing Analysis (based on idet, not metadata):') - context.logger.info('-' * 60) - context.logger.info( - f' Progressive: {progressive_count} episodes ' - f'({(progressive_count / total_episodes) * 100:.1f}%)', - ) - context.logger.info( - f' Interlaced (needs deinterlace): {needs_deinterlace_count} episodes ' - f'({(needs_deinterlace_count / total_episodes) * 100:.1f}%)', - ) - - @staticmethod - def __log_metadata_warnings(context: ExecutionContext, mismatch_count: int) -> None: - if mismatch_count > 0: - context.logger.warning('') - context.logger.warning( - f'⚠ WARNING: {mismatch_count} episodes have INCORRECT field_order metadata!', - ) - context.logger.warning( - '⚠ Using idet analysis instead of metadata for deinterlacing decisions.', - ) - def __save_results_to_json( - self, - video_info: List[Dict[str, Any]], - upscaling_pct: float, - context: ExecutionContext, + self, + video_info: List[Dict[str, Any]], + upscaling_pct: float, + context: ExecutionContext, ) -> None: output_file = self.__resolve_output_file(context) @@ -208,9 +122,11 @@ def __save_results_to_json( ) needs_deinterlace_count = sum(1 for v in video_info if v['needs_deinterlace']) progressive_count = sum(1 for v in video_info if not v['needs_deinterlace']) - metadata_mismatch_count = sum(1 for v in video_info if v['metadata_match'] != 'match') + metadata_mismatch_count = sum( + 1 for v in video_info if v['metadata_match'] != 'match' + ) - analysis_data = _AnalysisData( + analysis_data = 
AnalysisData( video_info=video_info, resolution_counts=resolution_counts, total_episodes=total_episodes, @@ -229,9 +145,9 @@ def __save_results_to_json( context.logger.info(f'Resolution analysis saved to: {output_file}') def __build_analysis_payload( - self, - context: ExecutionContext, - data: _AnalysisData, + self, + context: ExecutionContext, + data: AnalysisData, ) -> Dict[str, Any]: source_resolutions = [ { @@ -265,7 +181,9 @@ def __build_analysis_payload( 'target_resolution': { 'width': data.target_width, 'height': data.target_height, - 'label': self.__get_resolution_label(data.target_width, data.target_height), + 'label': self.__get_resolution_label( + data.target_width, data.target_height, + ), }, 'source_resolutions': source_resolutions, 'total_files': data.total_episodes, @@ -276,20 +194,124 @@ def __build_analysis_payload( 'interlacing_analysis': { 'progressive': { 'count': data.progressive_count, - 'percentage': round((data.progressive_count / data.total_episodes) * 100, 1), + 'percentage': round( + (data.progressive_count / data.total_episodes) * 100, 1, + ), }, 'interlaced': { 'count': data.needs_deinterlace_count, - 'percentage': round((data.needs_deinterlace_count / data.total_episodes) * 100, 1), + 'percentage': round( + (data.needs_deinterlace_count / data.total_episodes) * 100, 1, + ), }, 'metadata_mismatches': { 'count': data.metadata_mismatch_count, - 'percentage': round((data.metadata_mismatch_count / data.total_episodes) * 100, 1), + 'percentage': round( + (data.metadata_mismatch_count / data.total_episodes) * 100, 1, + ), }, }, 'files': files_details, } + @staticmethod + def __handle_missing_videos( + context: ExecutionContext, + ) -> ResolutionAnalysisResult: + context.logger.warning('No video files found - skipping resolution analysis') + return ResolutionAnalysisResult(total_files=0, upscaling_percentage=0.0) + + @staticmethod + def __handle_failed_analysis( + video_paths: List[Path], context: ExecutionContext, + ) -> 
ResolutionAnalysisResult: + context.logger.warning('Failed to analyze videos - skipping') + return ResolutionAnalysisResult( + total_files=len(video_paths), upscaling_percentage=0.0, + ) + + def __log_resolution_distribution( + self, + context: ExecutionContext, + resolution_counts: Counter, + total_episodes: int, + target_width: int, + target_height: int, + ) -> None: + context.logger.info('') + context.logger.info('Source Resolution Distribution:') + context.logger.info('-' * 60) + + for (width, height), count in resolution_counts.most_common(): + pct = (count / total_episodes) * 100 + label = self.__get_resolution_label(width, height) + context.logger.info( + f' {width}x{height} ({label}): {count} episodes ({pct:.1f}%)', + ) + + context.logger.info('') + context.logger.info( + f'Target Resolution: {target_width}x{target_height} ' + f'({self.__get_resolution_label(target_width, target_height)})', + ) + + @staticmethod + def __log_analysis_header(context: ExecutionContext) -> None: + context.logger.info('=' * 80) + context.logger.info('RESOLUTION ANALYSIS - Checking source video resolutions') + context.logger.info('=' * 80) + + @staticmethod + def __log_upscaling_warnings( + context: ExecutionContext, upscaling_pct: float, + ) -> None: + if upscaling_pct > 50: + context.logger.warning('') + context.logger.warning('⚠' * 30) + context.logger.warning( + f'⚠ WARNING: {upscaling_pct:.1f}% of episodes will require UPSCALING!', + ) + context.logger.warning( + '⚠ Upscaling degrades quality. 
Consider using analyze-resolution CLI ' + 'to find optimal target resolution.', + ) + context.logger.warning('⚠' * 30) + elif upscaling_pct > 0: + context.logger.info( + f'Note: {upscaling_pct:.1f}% of episodes will be upscaled ' + '(enhanced quality params will be used)', + ) + + @staticmethod + def __log_interlacing_analysis( + context: ExecutionContext, + progressive_count: int, + needs_deinterlace_count: int, + total_episodes: int, + ) -> None: + context.logger.info('') + context.logger.info('Interlacing Analysis (based on idet, not metadata):') + context.logger.info('-' * 60) + context.logger.info( + f' Progressive: {progressive_count} episodes ' + f'({(progressive_count / total_episodes) * 100:.1f}%)', + ) + context.logger.info( + f' Interlaced (needs deinterlace): {needs_deinterlace_count} episodes ' + f'({(needs_deinterlace_count / total_episodes) * 100:.1f}%)', + ) + + @staticmethod + def __log_metadata_warnings(context: ExecutionContext, mismatch_count: int) -> None: + if mismatch_count > 0: + context.logger.warning('') + context.logger.warning( + f'⚠ WARNING: {mismatch_count} episodes have INCORRECT field_order metadata!', + ) + context.logger.warning( + '⚠ Using idet analysis instead of metadata for deinterlacing decisions.', + ) + @staticmethod def __find_video_files(context: ExecutionContext) -> List[Path]: input_base = PathService.get_input_base() @@ -300,22 +322,17 @@ def __find_video_files(context: ExecutionContext) -> List[Path]: video_extensions = {'.mp4', '.mkv', '.avi', '.mov', '.m4v'} video_files = [ - p for p in series_path.rglob('*') + p + for p in series_path.rglob('*') if p.is_file() and p.suffix.lower() in video_extensions ] return sorted(video_files) - def __scan_resolutions( - self, video_paths: List[Path], context: ExecutionContext, - ) -> List[Dict[str, Any]]: - results = self._execute_with_threadpool( - video_paths, context, self.config.max_parallel_episodes, self.__scan_single_video, - ) - return [r for r in results if r is not 
None] - @staticmethod - def __scan_single_video(video_path: Path, context: ExecutionContext) -> Optional[Dict[str, Any]]: + def __scan_single_video( + video_path: Path, context: ExecutionContext, + ) -> Optional[Dict[str, Any]]: try: probe_data = FFmpegWrapper.probe_video(video_path) width, height = FFmpegWrapper.get_resolution(probe_data) @@ -338,7 +355,8 @@ def __scan_single_video(video_path: Path, context: ExecutionContext) -> Optional if metadata_vs_reality != 'match': context.logger.warning( - f'⚠ {video_path.name}: field_order={field_order} but idet says {metadata_vs_reality}!', + f'⚠ {video_path.name}: field_order={field_order} ' + f'but idet says {metadata_vs_reality}!', ) return { @@ -357,7 +375,7 @@ def __scan_single_video(video_path: Path, context: ExecutionContext) -> Optional @staticmethod def __validate_field_order( - field_order: str, has_interlacing: bool, idet_stats: Optional[Dict[str, int]], + field_order: str, has_interlacing: bool, idet_stats: Optional[Dict[str, int]], ) -> str: if not idet_stats: return 'unknown' diff --git a/preprocessor/steps/audio/separation_step.py b/preprocessor/steps/audio/separation_step.py index d1cb8061b..540ea231f 100644 --- a/preprocessor/steps/audio/separation_step.py +++ b/preprocessor/steps/audio/separation_step.py @@ -28,11 +28,9 @@ ) -class SoundSeparationStep(PipelineStep[TranscriptionData, TranscriptionData, SoundSeparationConfig]): - @property - def name(self) -> str: - return 'sound_separation' - +class SoundSeparationStep( + PipelineStep[TranscriptionData, TranscriptionData, SoundSeparationConfig], +): @property def supports_batch_processing(self) -> bool: return True @@ -44,18 +42,10 @@ def execute_batch( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def execute( - self, - input_data: TranscriptionData, - context: ExecutionContext, + def _process( + self, input_data: TranscriptionData, context: ExecutionContext, ) -> TranscriptionData: output_paths = 
self.__resolve_output_paths(input_data) - clean_json = output_paths['clean_json'] - - if self._check_cache_validity(clean_json, context, input_data.episode_id, 'cached'): - return self.__construct_cached_result(output_paths, input_data) - - context.mark_step_started(self.name, input_data.episode_id) transcription_data = self.__load_transcription_payload(input_data) dialogue_segments, sound_segments = self.__separate_dialogue_from_sounds( @@ -74,12 +64,26 @@ def execute( sound_segments, ) - context.mark_step_completed(self.name, input_data.episode_id) + return self.__construct_result_artifact(output_paths, input_data) + + def _get_cache_path( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> Path: + output_paths = self.__resolve_output_paths(input_data) + return output_paths['clean_json'] + + def _load_from_cache( + self, + cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> TranscriptionData: + output_paths = self.__resolve_output_paths(input_data) return self.__construct_result_artifact(output_paths, input_data) def __separate_dialogue_from_sounds( - self, - segments: List[Dict[str, Any]], + self, + segments: List[Dict[str, Any]], ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: dialogue_segments = [] sound_segments = [] @@ -104,15 +108,15 @@ def __separate_dialogue_from_sounds( return dialogue_segments, sound_segments def __split_mixed_segment( - self, - segment: Dict[str, Any], + self, + segment: Dict[str, Any], ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: words = segment.get(WordKeys.WORDS, []) dialogue_parts = [] sound_parts = [] current_type = None current_words = [] - current_start = None + current_start = segment.get(WordKeys.START, 0.0) for word in words: word_type = 'sound' if is_sound_event(word) else 'dialogue' @@ -147,14 +151,50 @@ def __split_mixed_segment( return dialogue_parts, sound_parts + @staticmethod + def __finalize_sequence( + seq_type: str, + words: List[Dict[str, 
Any]], + start: float, + dialogue_parts: List[Dict[str, Any]], + sound_parts: List[Dict[str, Any]], + ) -> None: + non_spacing = [ + w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING + ] + if not non_spacing: + return + + text = ''.join((w.get(WordKeys.TEXT, '') for w in words)) + # Use the end time of the last word, or start if not available + end = words[-1].get(WordKeys.END, start) + + new_segment = { + 'id': 0, + 'text': text, + WordKeys.START: start, + WordKeys.END: end, + WordKeys.WORDS: words, + } + + if seq_type == 'sound': + new_segment['sound_type'] = 'sound' + sound_parts.append(new_segment) + else: + dialogue_parts.append(new_segment) + def __generate_additional_formats( - self, - output_paths: Dict[str, Path], - dialogue_segments: List[Dict[str, Any]], - sound_segments: List[Dict[str, Any]], + self, + output_paths: Dict[str, Path], + dialogue_segments: List[Dict[str, Any]], + sound_segments: List[Dict[str, Any]], ) -> None: - self.__generate_txt_file(output_paths['clean_json'], output_paths['clean_txt']) - self.__generate_txt_file(output_paths['sound_json'], output_paths['sound_txt']) + self.__generate_txt_file( + output_paths['clean_json'], output_paths['clean_txt'], + ) + self.__generate_txt_file( + output_paths['sound_json'], output_paths['sound_txt'], + ) self.__generate_srt_file(dialogue_segments, output_paths['clean_srt']) self.__generate_srt_file(sound_segments, output_paths['sound_srt']) @@ -169,18 +209,28 @@ def __resolve_output_paths(input_data: TranscriptionData) -> Dict[str, Path]: sound_dir.mkdir(parents=True, exist_ok=True) return { - 'clean_json': clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}", - 'sound_json': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}", - 'clean_segmented': clean_dir / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}", - 'sound_segmented': sound_dir / 
f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}", - 'clean_txt': clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}", - 'sound_txt': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}", - 'clean_srt': clean_dir / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}", - 'sound_srt': sound_dir / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}", + 'clean_json': clean_dir + / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['json']}", + 'sound_json': sound_dir + / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['json']}", + 'clean_segmented': clean_dir + / f"{base_name}{FILE_SUFFIXES['segmented']}_clean{FILE_EXTENSIONS['json']}", + 'sound_segmented': sound_dir + / f"{base_name}{FILE_SUFFIXES['segmented']}_sound_events{FILE_EXTENSIONS['json']}", + 'clean_txt': clean_dir + / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['txt']}", + 'sound_txt': sound_dir + / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['txt']}", + 'clean_srt': clean_dir + / f"{base_name}{FILE_SUFFIXES['clean']}{FILE_EXTENSIONS['srt']}", + 'sound_srt': sound_dir + / f"{base_name}{FILE_SUFFIXES['sound_events']}{FILE_EXTENSIONS['srt']}", } @staticmethod - def __load_transcription_payload(input_data: TranscriptionData) -> Dict[str, Any]: + def __load_transcription_payload( + input_data: TranscriptionData, + ) -> Dict[str, Any]: with open(input_data.path, 'r', encoding='utf-8') as f: data = json.load(f) return { @@ -190,37 +240,30 @@ def __load_transcription_payload(input_data: TranscriptionData) -> Dict[str, Any @staticmethod def __save_separated_data( - output_paths: Dict[str, Path], - episode_info_dict: Dict[str, Any], - dialogue_segments: List[Dict[str, Any]], - sound_segments: List[Dict[str, Any]], + output_paths: Dict[str, Path], + episode_info_dict: Dict[str, Any], + dialogue_segments: List[Dict[str, Any]], + sound_segments: List[Dict[str, Any]], ) 
-> None: - clean_data = {'episode_info': episode_info_dict, 'segments': dialogue_segments} + clean_data = { + 'episode_info': episode_info_dict, + 'segments': dialogue_segments, + } sound_data = {'episode_info': episode_info_dict, 'segments': sound_segments} FileOperations.atomic_write_json(output_paths['clean_json'], clean_data) FileOperations.atomic_write_json(output_paths['sound_json'], sound_data) - FileOperations.atomic_write_json(output_paths['clean_segmented'], clean_data) - FileOperations.atomic_write_json(output_paths['sound_segmented'], sound_data) - - @staticmethod - def __construct_cached_result( - output_paths: Dict[str, Path], - input_data: TranscriptionData, - ) -> TranscriptionData: - return TranscriptionData( - path=output_paths['clean_json'], - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - language=input_data.language, - model=input_data.model, - format='json', + FileOperations.atomic_write_json( + output_paths['clean_segmented'], clean_data, + ) + FileOperations.atomic_write_json( + output_paths['sound_segmented'], sound_data, ) @staticmethod def __construct_result_artifact( - output_paths: Dict[str, Path], - input_data: TranscriptionData, + output_paths: Dict[str, Path], + input_data: TranscriptionData, ) -> TranscriptionData: return TranscriptionData( path=output_paths['clean_json'], @@ -235,47 +278,24 @@ def __construct_result_artifact( def __clean_segment_text(segment: Dict[str, Any]) -> Dict[str, Any]: cleaned = segment.copy() text = cleaned.get('text', '') - text = re.sub('\\s+', ' ', text) + text = re.sub(r'\s+', ' ', text) cleaned['text'] = text.strip() words = cleaned.get(WordKeys.WORDS, []) if words: - non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] + non_spacing = [ + w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING + ] if non_spacing: - cleaned[WordKeys.START] = min((w.get(WordKeys.START, 0) for w in non_spacing)) - cleaned[WordKeys.END] = 
max((w.get(WordKeys.END, 0) for w in non_spacing)) + cleaned[WordKeys.START] = min( + (w.get(WordKeys.START, 0) for w in non_spacing), + ) + cleaned[WordKeys.END] = max( + (w.get(WordKeys.END, 0) for w in non_spacing), + ) return cleaned - @staticmethod - def __finalize_sequence( - seq_type: str, - words: List[Dict[str, Any]], - start: float, - dialogue_parts: List[Dict[str, Any]], - sound_parts: List[Dict[str, Any]], - ) -> None: - non_spacing = [w for w in words if w.get(WordKeys.TYPE) != WordTypeValues.SPACING] - if not non_spacing: - return - - text = ''.join((w.get(WordKeys.TEXT, '') for w in words)) - end = words[-1].get(WordKeys.END, start) - - new_segment = { - 'id': 0, - 'text': text, - WordKeys.START: start, - WordKeys.END: end, - WordKeys.WORDS: words, - } - - if seq_type == 'sound': - new_segment['sound_type'] = 'sound' - sound_parts.append(new_segment) - else: - dialogue_parts.append(new_segment) - @staticmethod def __format_srt_time(seconds: float) -> str: hours = int(seconds // 3600) @@ -285,7 +305,9 @@ def __format_srt_time(seconds: float) -> str: return f'{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}' @staticmethod - def __generate_srt_file(segments: List[Dict[str, Any]], srt_path: Path) -> None: + def __generate_srt_file( + segments: List[Dict[str, Any]], srt_path: Path, + ) -> None: with StepTempFile(srt_path) as temp_path: with open(temp_path, 'w', encoding='utf-8') as f: for idx, seg in enumerate(segments, 1): @@ -310,8 +332,8 @@ def __generate_txt_file(json_path: Path, txt_path: Path) -> None: for seg in segments: text = seg.get('text', '').strip() - text = re.sub('\\([^)]*\\)', '', text) - text = re.sub('\\s+', ' ', text).strip() + text = re.sub(r'\([^)]*\)', '', text) + text = re.sub(r'\s+', ' ', text).strip() if text: text_lines.append(text) @@ -320,7 +342,9 @@ def __generate_txt_file(json_path: Path, txt_path: Path) -> None: f.write(' '.join(text_lines)) @staticmethod - def __renumber_segments(segments: List[Dict[str, Any]]) -> 
List[Dict[str, Any]]: + def __renumber_segments( + segments: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: for i, seg in enumerate(segments): seg['id'] = i return segments diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index 219fbea92..3e0d84529 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -11,20 +11,9 @@ from preprocessor.core.output_descriptors import FileOutput -class ArchiveGenerationStep(PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig]): - def get_output_descriptors(self) -> List[FileOutput]: - return [ - FileOutput( - pattern="{season}/{episode}.zip", - subdir="archives", - min_size_bytes=1024*100, - ), - ] - - @property - def name(self) -> str: - return 'archive_generation' - +class ArchiveGenerationStep( + PipelineStep[ProcessedEpisode, ArchiveArtifact, ArchiveConfig], +): @property def supports_batch_processing(self) -> bool: return True @@ -36,35 +25,38 @@ def execute_batch( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def execute( - self, input_data: ProcessedEpisode, context: ExecutionContext, + def _process( + self, input_data: ProcessedEpisode, context: ExecutionContext, ) -> ArchiveArtifact: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached archive'): - return self.__construct_archive_artifact(input_data, output_path) - - context.logger.info(f'Generating archive for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - context.mark_step_completed(self.name, input_data.episode_id) + output_path = self._get_cache_path(input_data, context) + # Archive generation logic would go here return self.__construct_archive_artifact(input_data, output_path) - def __resolve_output_path( - self, input_data: ProcessedEpisode, context: ExecutionContext, + 
def _get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.zip", + subdir="archives", + min_size_bytes=1024 * 100, + ), + ] + + def _get_cache_path( + self, input_data: ProcessedEpisode, context: ExecutionContext, ) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode': input_data.episode_info.episode_code(), - }, - ) + return self._get_standard_cache_path(input_data, context) + + def _load_from_cache( + self, + cache_path: Path, + input_data: ProcessedEpisode, + context: ExecutionContext, + ) -> ArchiveArtifact: + return self.__construct_archive_artifact(input_data, cache_path) @staticmethod def __construct_archive_artifact( - input_data: ProcessedEpisode, output_path: Path, + input_data: ProcessedEpisode, output_path: Path, ) -> ArchiveArtifact: return ArchiveArtifact( episode_id=input_data.episode_id, diff --git a/preprocessor/steps/scraping/episode_scraper_step.py b/preprocessor/steps/scraping/episode_scraper_step.py index d1aa3dae1..d6766d9f3 100644 --- a/preprocessor/steps/scraping/episode_scraper_step.py +++ b/preprocessor/steps/scraping/episode_scraper_step.py @@ -1,12 +1,6 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - Type, -) +from typing import Type from preprocessor.config.step_configs import EpisodeScraperConfig -from preprocessor.core.context import ExecutionContext from preprocessor.services.scraping.base_scraper_step import BaseScraperStep from preprocessor.services.scraping.episode_scraper import EpisodeScraper @@ -21,8 +15,3 @@ def _get_scraper_class(self) -> Type[EpisodeScraper]: def _get_metadata_type_name(self) -> str: return "Episodes" - - def _build_scraper_args(self, output_path: Path, context: ExecutionContext) -> Dict[str, Any]: - args = super()._build_scraper_args(output_path, context) - args["merge_sources"] = self.config.merge_sources - return args diff --git 
a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index f2104afa3..d1dc158f1 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -1,8 +1,5 @@ from pathlib import Path -from typing import ( - Optional, - Tuple, -) +from typing import Tuple from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.step_configs import CharacterReferenceConfig @@ -23,21 +20,36 @@ def name(self) -> str: def is_global(self) -> bool: return True - def execute( + def _get_cache_path( self, input_data: SourceVideo, context: ExecutionContext, - ) -> Optional[SourceVideo]: + ) -> Path: + _, output_dir = self.__resolve_paths(context) + return output_dir + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: + context.logger.info(f"Character references already exist in: {cache_path}") + return input_data + + def _process( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: characters_path, output_dir = self.__resolve_paths(context) self.__validate_characters_file(characters_path) - - if self.__should_skip_processing(output_dir, context): - context.logger.info(f"Character references already exist in: {output_dir}") - return input_data - self.__download_character_references(characters_path, output_dir, context) - return input_data - def __resolve_paths(self, context: ExecutionContext) -> Tuple[Path, Path]: + @staticmethod + def _should_use_cache( + cache_path: Path, _input_data: SourceVideo, context: ExecutionContext, + ) -> bool: + if context.force_rerun: + return False + return cache_path.exists() and any(cache_path.iterdir()) + + @staticmethod + def __resolve_paths(context: ExecutionContext) -> Tuple[Path, Path]: base_dir = get_base_output_dir(context.series_name) characters_path = base_dir / 
f"{context.series_name}_characters.json" output_dir = base_dir / "character_faces" @@ -70,12 +82,6 @@ def __download_character_references( context.logger.info(f"Character references saved to: {output_dir}") - @staticmethod - def __should_skip_processing(output_dir: Path, context: ExecutionContext) -> bool: - if context.force_rerun: - return False - return output_dir.exists() and any(output_dir.iterdir()) - @staticmethod def __validate_characters_file(characters_path: Path) -> None: if not characters_path.exists(): diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 9fd4cd5b1..6572e9ca9 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -20,15 +20,6 @@ class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): - def get_output_descriptors(self) -> List[FileOutput]: - return [ - FileOutput( - pattern="{season}/{episode}.ndjson", - subdir="elastic_documents", - min_size_bytes=100, - ), - ] - @property def name(self) -> str: return 'document_generation' @@ -44,56 +35,125 @@ def execute_batch( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def execute( - self, input_data: Artifact, context: ExecutionContext, + def _process( + self, input_data: Artifact, context: ExecutionContext, ) -> ElasticDocuments: episode_info, episode_id = self.__extract_episode_info(input_data) - output_dir = self.__resolve_output_dir(episode_info, context) + output_path = self._get_cache_path(input_data, context) + + data = self.__gather_input_data(episode_info, context) + total_docs = self.__generate_documents( + data, output_path, episode_info, context, + ) - if self._check_cache_validity(output_dir, context, episode_id, 'cached'): - return self.__construct_elastic_documents(episode_id, episode_info, output_dir, 0) + return self.__construct_elastic_documents( + episode_id, 
episode_info, output_path, total_docs, + ) - context.logger.info(f'Generating Elasticsearch documents for {episode_id}') - context.mark_step_started(self.name, episode_id) + def _get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.ndjson", + subdir="elastic_documents", + min_size_bytes=100, + ), + ] - data = self.__gather_input_data(episode_info, context) - total_docs = self.__generate_documents(data, episode_info, context) + def _get_cache_path( + self, input_data: Artifact, context: ExecutionContext, + ) -> Path: + episode_info, _ = self.__extract_episode_info(input_data) + return self._resolve_output_path( + 0, + context, + { + 'season': episode_info.season_code(), + 'episode': episode_info.episode_code(), + }, + ) - context.mark_step_completed(self.name, episode_id) - return self.__construct_elastic_documents(episode_id, episode_info, output_dir, total_docs) + def _load_from_cache( + self, cache_path: Path, input_data: Artifact, context: ExecutionContext, + ) -> ElasticDocuments: + episode_info, episode_id = self.__extract_episode_info(input_data) + return self.__construct_elastic_documents( + episode_id, episode_info, cache_path, 0, + ) def __generate_documents( - self, - data: Dict[str, Any], - episode_info: Any, - context: ExecutionContext, + self, + data: Dict[str, Any], + output_path: Path, + episode_info: Any, + context: ExecutionContext, ) -> int: total_docs = 0 if self.config.generate_segments and 'transcription' in data: - _, count = self.__generate_segments_jsonl(data, episode_info, context) - total_docs += count + total_docs += self.__generate_segments_jsonl( + data, output_path, episode_info, context, + ) return total_docs def __generate_segments_jsonl( - self, data: Dict[str, Any], episode_info: Any, context: ExecutionContext, - ) -> Tuple[Path, int]: - output_path = self.__resolve_segments_output_path(episode_info, context) + self, + data: Dict[str, Any], + output_path: Path, + episode_info: 
Any, + context: ExecutionContext, + ) -> int: segments = data['transcription'].get('segments', []) episode_metadata = self.__build_episode_metadata(episode_info, context) video_bot_path = self.__build_video_bot_path(episode_info, context) - count = self.__write_segments_to_jsonl( - segments, output_path, episode_info, episode_metadata, video_bot_path, + return self.__write_segments_to_jsonl( + segments, + output_path, + episode_info, + episode_metadata, + video_bot_path, ) - return output_path, count + + def __gather_input_data( + self, episode_info: Any, context: ExecutionContext, + ) -> Dict[str, Any]: + data: Dict[str, Any] = {} + + clean_path = self.__resolve_input_path( + episode_info, + context, + 'transcriptions/clean', + '_clean_transcription.json', + ) + if clean_path.exists(): + data['transcription'] = FileOperations.load_json(clean_path) + + text_emb_path = self.__resolve_input_path( + episode_info, + context, + 'embeddings', + '_embeddings_text.json', + ) + if text_emb_path.exists(): + data['text_embeddings'] = FileOperations.load_json(text_emb_path) + + scene_path = self.__resolve_input_path( + episode_info, + context, + 'scene_timestamps', + '_scenes.json', + ) + if scene_path.exists(): + data['scenes'] = FileOperations.load_json(scene_path) + + return data @staticmethod def __write_segments_to_jsonl( - segments: List[Dict[str, Any]], - output_path: Path, - episode_info: Any, - episode_metadata: Dict[str, Any], - video_bot_path: str, + segments: List[Dict[str, Any]], + output_path: Path, + episode_info: Any, + episode_metadata: Dict[str, Any], + video_bot_path: str, ) -> int: count = 0 with StepTempFile(output_path) as temp_path: @@ -122,78 +182,44 @@ def __extract_episode_info(input_data: Artifact) -> Tuple[Any, str]: episode_id = getattr(input_data, 'episode_id') return episode_info, episode_id - def __resolve_output_dir(self, episode_info: Any, context: ExecutionContext) -> Path: - output_path = self._resolve_output_path( - 0, - context, - { - 
'season': episode_info.season_code(), - 'episode': episode_info.episode_code(), - }, - ) - return output_path.parent - - def __resolve_segments_output_path(self, episode_info: Any, context: ExecutionContext) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': episode_info.season_code(), - 'episode': episode_info.episode_code(), - }, - ) - @staticmethod def __build_video_bot_path(episode_info: Any, context: ExecutionContext) -> str: filename = f'{context.series_name}_{episode_info.episode_code()}.mp4' - return f'bot/{context.series_name.upper()}-WIDEO/{episode_info.season_code()}/{filename}' + return ( + f'bot/{context.series_name.upper()}-WIDEO/' + f'{episode_info.season_code()}/{filename}' + ) @staticmethod def __construct_elastic_documents( - episode_id: str, episode_info: Any, output_dir: Path, document_count: int, + episode_id: str, + episode_info: Any, + output_path: Path, + document_count: int, ) -> ElasticDocuments: return ElasticDocuments( episode_id=episode_id, episode_info=episode_info, - path=output_dir, + path=output_path, document_count=document_count, ) @staticmethod - def __build_episode_metadata(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: + def __build_episode_metadata( + episode_info: Any, context: ExecutionContext, + ) -> Dict[str, Any]: return { 'season': episode_info.season, 'episode_number': episode_info.relative_episode, 'series_name': context.series_name, } - @staticmethod - def __gather_input_data(episode_info: Any, context: ExecutionContext) -> Dict[str, Any]: - data: Dict[str, Any] = {} - - clean_path = DocumentGeneratorStep.__resolve_input_path( - episode_info, context, 'transcriptions/clean', '_clean_transcription.json', - ) - if clean_path.exists(): - data['transcription'] = FileOperations.load_json(clean_path) - - text_emb_path = DocumentGeneratorStep.__resolve_input_path( - episode_info, context, 'embeddings', '_embeddings_text.json', - ) - if text_emb_path.exists(): - 
data['text_embeddings'] = FileOperations.load_json(text_emb_path) - - scene_path = DocumentGeneratorStep.__resolve_input_path( - episode_info, context, 'scene_timestamps', '_scenes.json', - ) - if scene_path.exists(): - data['scenes'] = FileOperations.load_json(scene_path) - - return data - @staticmethod def __resolve_input_path( - episode_info: Any, context: ExecutionContext, folder: str, suffix: str, + episode_info: Any, + context: ExecutionContext, + folder: str, + suffix: str, ) -> Path: filename = f'{context.series_name}_{episode_info.episode_code()}{suffix}' return context.get_output_path(episode_info, folder, filename) diff --git a/preprocessor/steps/search/indexing_step.py b/preprocessor/steps/search/indexing_step.py index 564b222c3..9e5b81911 100644 --- a/preprocessor/steps/search/indexing_step.py +++ b/preprocessor/steps/search/indexing_step.py @@ -18,7 +18,9 @@ from preprocessor.services.search.elasticsearch import ElasticsearchWrapper -class ElasticsearchIndexerStep(PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig]): +class ElasticsearchIndexerStep( + PipelineStep[List[ElasticDocuments], IndexingResult, ElasticsearchConfig], +): def __init__(self, config: ElasticsearchConfig) -> None: super().__init__(config) self.__es: Optional[ElasticsearchWrapper] = None @@ -29,33 +31,26 @@ def name(self) -> str: @property def is_global(self) -> bool: - """Indexing is a global step - processes all episodes at once.""" return True @property def supports_batch_processing(self) -> bool: return True + @property + def uses_caching(self) -> bool: + return False + def setup_resources(self, context: ExecutionContext) -> None: if self.__es is None: - context.logger.info(f'Initializing Elasticsearch client: {self.config.host}') + context.logger.info( + f'Initializing Elasticsearch client: {self.config.host}', + ) self.__es = ElasticsearchWrapper( host=self.config.host, index_name=self.config.index_name, ) - def execute_batch( - self, input_data: 
List[List[ElasticDocuments]], context: ExecutionContext, - ) -> List[IndexingResult]: - context.logger.info(f"Batch indexing {len(input_data)} document collections") - - results = [] - for docs in input_data: - result = asyncio.run(self.__execute_async(docs, context)) - results.append(result) - - return results - def teardown_resources(self, context: ExecutionContext) -> None: if self.__es: asyncio.run(self.__es.close()) @@ -67,12 +62,28 @@ def cleanup(self) -> None: asyncio.run(self.__es.close()) self.__es = None - def execute( + def execute_batch( + self, + input_data: List[List[ElasticDocuments]], + context: ExecutionContext, + ) -> List[IndexingResult]: + context.logger.info( + f"Batch indexing {len(input_data)} document collections", + ) + results = [] + for docs in input_data: + # Reusing _process logic via direct async call wrapper if needed, + # or calling execute which routes to _process + result = self.execute(docs, context) + results.append(result) + return results + + def _process( self, input_data: List[ElasticDocuments], context: ExecutionContext, ) -> IndexingResult: - return asyncio.run(self.__execute_async(input_data, context)) + return asyncio.run(self.__process_async(input_data, context)) - async def __execute_async( + async def __process_async( self, input_data: List[ElasticDocuments], context: ExecutionContext, @@ -81,11 +92,13 @@ async def __execute_async( return self.__construct_empty_result(context) docs_by_type = self.__group_documents_by_type(input_data) - total_indexed = await self.__process_all_document_types(docs_by_type, context) + total_indexed = await self.__index_grouped_documents( + docs_by_type, context, + ) return self.__construct_indexing_result(total_indexed) - async def __process_all_document_types( + async def __index_grouped_documents( self, docs_by_type: Dict[str, List[Path]], context: ExecutionContext, @@ -93,10 +106,14 @@ async def __process_all_document_types( total_indexed: int = 0 for doc_type, paths in 
docs_by_type.items(): try: - indexed_count = await self.__process_document_type(doc_type, paths, context) + indexed_count = await self.__process_document_type( + doc_type, paths, context, + ) total_indexed += indexed_count except Exception as e: - context.logger.error(f'Elasticsearch indexing failed for {doc_type}: {e}') + context.logger.error( + f'Elasticsearch indexing failed for {doc_type}: {e}', + ) raise return total_indexed @@ -113,7 +130,9 @@ async def __process_document_type( await self.__setup_index(doc_type) documents = self.__load_documents_from_paths(paths) - return await self.__execute_bulk_indexing(documents, index_name, context) + return await self.__execute_bulk_indexing( + documents, index_name, context, + ) async def __prepare_elasticsearch_client(self, index_name: str) -> None: if self.__es is None or self.__es.index_name != index_name: @@ -129,7 +148,9 @@ async def __setup_index(self, doc_type: str) -> None: if not self.config.append: await self.__es.delete_index() - mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type(doc_type) + mapping: Optional[Dict[str, Any]] = self.__get_mapping_for_type( + doc_type, + ) if mapping: await self.__es.create_index(mapping) @@ -158,12 +179,16 @@ def __construct_indexing_result(self, document_count: int) -> IndexingResult: success=True, ) - def __construct_empty_result(self, context: ExecutionContext) -> IndexingResult: + def __construct_empty_result( + self, context: ExecutionContext, + ) -> IndexingResult: context.logger.warning('No documents to index.') return self.__construct_indexing_result(0) @staticmethod - def __group_documents_by_type(input_data: List[ElasticDocuments]) -> Dict[str, List[Path]]: + def __group_documents_by_type( + input_data: List[ElasticDocuments], + ) -> Dict[str, List[Path]]: docs_by_type: Dict[str, List[Path]] = {} for doc_artifact in input_data: doc_type: str = doc_artifact.path.parent.name diff --git a/preprocessor/steps/text/analysis_step.py 
b/preprocessor/steps/text/analysis_step.py index b44dbc21d..6f840f111 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -22,23 +22,10 @@ class TextAnalysisStep(PipelineStep[TranscriptionData, TextAnalysisResults, TextAnalysisConfig]): - @property - def name(self) -> str: - return 'text_analysis' - @property def supports_batch_processing(self) -> bool: return True - def get_output_descriptors(self) -> List[OutputDescriptor]: - return [ - JsonFileOutput( - pattern="{season}/{episode}.json", - subdir="text_analysis", - min_size_bytes=50, - ), - ] - def execute_batch( self, input_data: List[TranscriptionData], context: ExecutionContext, ) -> List[TextAnalysisResults]: @@ -46,34 +33,47 @@ def execute_batch( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def execute( - self, input_data: TranscriptionData, context: ExecutionContext, + def _process( + self, input_data: TranscriptionData, context: ExecutionContext, ) -> TextAnalysisResults: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): - return self.__load_cached_result(output_path, input_data) - - context.logger.info(f'Analyzing text for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) + output_path = self._get_cache_path(input_data, context) txt_path = self.__resolve_text_file_path(input_data) stats = self.__analyze_text_statistics(txt_path) result_data = self.__build_result_payload(stats, txt_path, input_data) - self.__save_analysis_results(output_path, result_data) - context.mark_step_completed(self.name, input_data.episode_id) + FileOperations.atomic_write_json(output_path, result_data) return self.__construct_analysis_results(input_data, output_path, result_data) + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + 
subdir="text_analysis", + min_size_bytes=50, + ), + ] + + def _get_cache_path( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> Path: + return self._get_standard_cache_path(input_data, context) + + def _load_from_cache( + self, cache_path: Path, input_data: TranscriptionData, context: ExecutionContext, + ) -> TextAnalysisResults: + stats_data = FileOperations.load_json(cache_path) + return self.__construct_analysis_results(input_data, cache_path, stats_data) + def __analyze_text_statistics(self, txt_path: Path) -> TextStatistics: return TextStatistics.from_file(txt_path, language=self.config.language) def __build_result_payload( - self, - stats: TextStatistics, - txt_path: Path, - input_data: TranscriptionData, + self, + stats: TextStatistics, + txt_path: Path, + input_data: TranscriptionData, ) -> Dict[str, Any]: return { 'metadata': { @@ -85,18 +85,6 @@ def __build_result_payload( **stats.to_dict(), } - def __resolve_output_path( - self, input_data: TranscriptionData, context: ExecutionContext, - ) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode': input_data.episode_info.episode_code(), - }, - ) - @staticmethod def __resolve_text_file_path(input_data: TranscriptionData) -> Path: txt_path = input_data.path @@ -108,25 +96,11 @@ def __resolve_text_file_path(input_data: TranscriptionData) -> Path: return txt_path - @staticmethod - def __load_cached_result( - output_path: Path, - input_data: TranscriptionData, - ) -> TextAnalysisResults: - stats_data = FileOperations.load_json(output_path) - return TextAnalysisStep.__construct_analysis_results( - input_data, output_path, stats_data, - ) - - @staticmethod - def __save_analysis_results(output_path: Path, result_data: Dict[str, Any]) -> None: - FileOperations.atomic_write_json(output_path, result_data) - @staticmethod def __construct_analysis_results( - input_data: TranscriptionData, - output_path: Path, - result_data: 
Dict[str, Any], + input_data: TranscriptionData, + output_path: Path, + result_data: Dict[str, Any], ) -> TextAnalysisResults: return TextAnalysisResults( episode_id=input_data.episode_id, diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index cded2e49b..56cc521cc 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -1,3 +1,4 @@ +# pylint: disable=duplicate-code from pathlib import Path import re from typing import ( @@ -21,25 +22,11 @@ from preprocessor.services.search.embedding_model import EmbeddingModelWrapper -# pylint: disable=duplicate-code # Pattern shared with vision/embeddings_step - different data types (text vs frames) class TextEmbeddingStep(PipelineStep[TranscriptionData, EmbeddingCollection, TextEmbeddingConfig]): def __init__(self, config: TextEmbeddingConfig) -> None: super().__init__(config) self.__model: Optional[EmbeddingModelWrapper] = None - def get_output_descriptors(self) -> List[FileOutput]: - return [ - FileOutput( - pattern="{season}/{episode}.npy", - subdir="embeddings/text", - min_size_bytes=1024, - ), - ] - - @property - def name(self) -> str: - return 'text_embedding' - @property def supports_batch_processing(self) -> bool: return True @@ -53,11 +40,6 @@ def setup_resources(self, context: ExecutionContext) -> None: self.config.batch_size, ) - def execute_batch( - self, input_data: List[TranscriptionData], context: ExecutionContext, - ) -> List[EmbeddingCollection]: - return self._execute_sequential(input_data, context, self.__execute_single) - def teardown_resources(self, context: ExecutionContext) -> None: if self.__model: self.__model = None @@ -67,58 +49,59 @@ def cleanup(self) -> None: if self.__model: self.__model = None - def execute( - self, - input_data: TranscriptionData, - context: ExecutionContext, - ) -> EmbeddingCollection: - output_path = self.__resolve_output_path( - context, - input_data.episode_info.season, - 
input_data.episode_info.episode, - ) + def execute_batch( + self, input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached text embeddings'): - return self.__load_cached_result(output_path, input_data) + def _process( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) segments = self.__extract_valid_segments(input_data, context) if not segments: - return self.__construct_embedding_collection(input_data, output_path, 0) + return self.__construct_embedding_collection( + input_data, output_path, 0, + ) self.__prepare_embedding_model() context.logger.info(f'Generating text embeddings for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) results = self.__process_text_embeddings(segments) self.__save_embedding_results(results, output_path, input_data) - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_embedding_collection(input_data, output_path, len(results)) + return self.__construct_embedding_collection( + input_data, output_path, len(results), + ) - def __execute_single( + def _get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/text", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( self, input_data: TranscriptionData, context: ExecutionContext, - ) -> EmbeddingCollection: - output_path = self.__resolve_output_path( + ) -> Path: + return self._resolve_output_path( + 0, context, - input_data.episode_info.season, - input_data.episode_info.episode, + self.__create_path_variables(input_data), ) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached text embeddings'): - return 
self.__load_cached_result(output_path, input_data) - - segments = self.__extract_valid_segments(input_data, context) - if not segments: - return self.__construct_embedding_collection(input_data, output_path, 0) - - context.logger.info(f'Generating text embeddings for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - results = self.__process_text_embeddings(segments) - self.__save_embedding_results(results, output_path, input_data) - - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_embedding_collection(input_data, output_path, len(results)) + def _load_from_cache( + self, cache_path: Path, input_data: TranscriptionData, context: ExecutionContext, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return self.__construct_embedding_collection( + input_data, + cache_path, + len(emb_data.get('text_embeddings', [])), + ) def __prepare_embedding_model(self) -> None: if self.__model is None: @@ -128,7 +111,9 @@ def __prepare_embedding_model(self) -> None: self.config.batch_size, ) - def __process_text_embeddings(self, segments: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + def __process_text_embeddings( + self, segments: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: full_text: str = ' '.join([seg.get('text', '') for seg in segments]) sentences: List[str] = self.__split_into_sentences(full_text) text_chunks, chunk_metadata = self.__create_text_chunks(sentences, segments) @@ -141,10 +126,14 @@ def __create_text_chunks( ) -> Tuple[List[str], List[Dict[str, Any]]]: text_chunks: List[str] = [] chunk_metadata: List[Dict[str, Any]] = [] - step: int = self.config.text_sentences_per_chunk - self.config.text_chunk_overlap + step: int = ( + self.config.text_sentences_per_chunk - self.config.text_chunk_overlap + ) for i in range(0, len(sentences), step): - chunk_sentences: List[str] = sentences[i:i + self.config.text_sentences_per_chunk] + 
chunk_sentences: List[str] = sentences[ + i : i + self.config.text_sentences_per_chunk + ] if not chunk_sentences: continue @@ -172,9 +161,14 @@ def __batch_encode_chunks( ) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] + if not self.__model: + raise RuntimeError("Embedding model not initialized") + for i in range(0, len(text_chunks), self.config.batch_size): - batch_texts: List[str] = text_chunks[i:i + self.config.batch_size] - batch_meta: List[Dict[str, Any]] = chunk_metadata[i:i + self.config.batch_size] + batch_texts: List[str] = text_chunks[i : i + self.config.batch_size] + batch_meta: List[Dict[str, Any]] = chunk_metadata[ + i : i + self.config.batch_size + ] batch_embeddings: List[List[float]] = self.__model.encode_text(batch_texts) for meta, embedding in zip(batch_meta, batch_embeddings): @@ -215,57 +209,44 @@ def __construct_embedding_collection( embedding_type='text', ) - def __resolve_output_path( - self, - context: ExecutionContext, - season: int, - episode: int, - ) -> Path: - return self._resolve_output_path( - 0, - context, - {"season": season, "episode": episode}, - ) - @staticmethod - def __load_cached_result( - output_path: Path, - input_data: TranscriptionData, - ) -> EmbeddingCollection: - emb_data: Dict[str, Any] = FileOperations.load_json(output_path) - return MetadataBuilder.create_embedding_collection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - model_name="cached_model", - embedding_count=len(emb_data.get('text_embeddings', [])), - embedding_type='text', - ) + def __create_path_variables(input_data: TranscriptionData) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } @staticmethod def __extract_valid_segments( input_data: TranscriptionData, context: ExecutionContext, ) -> List[Dict[str, Any]]: - transcription: Dict[str, Any] = TextEmbeddingStep.__load_clean_transcription(input_data) + 
transcription: Dict[str, Any] = TextEmbeddingStep.__load_clean_transcription( + input_data, + ) segments: List[Dict[str, Any]] = transcription.get('segments', []) if not segments: - context.logger.warning(f'No text segments for embedding in {input_data.episode_id}') + context.logger.warning( + f'No text segments for embedding in {input_data.episode_id}', + ) return segments @staticmethod def __load_clean_transcription(input_data: TranscriptionData) -> Dict[str, Any]: raw_path: Path = input_data.path clean_path: Path = ( - raw_path.parent.parent / 'clean' / - raw_path.name.replace('.json', '_clean_transcription.json') + raw_path.parent.parent + / 'clean' + / raw_path.name.replace('.json', '_clean_transcription.json') ) if clean_path.exists(): return FileOperations.load_json(clean_path) return FileOperations.load_json(raw_path) @staticmethod - def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> int: + def __find_segment_at_position( + segments: List[Dict[str, Any]], char_pos: int, + ) -> int: cumulative_length: int = 0 for idx, seg in enumerate(segments): seg_length: int = len(seg.get('text', '')) + 1 @@ -276,8 +257,8 @@ def __find_segment_at_position(segments: List[Dict[str, Any]], char_pos: int) -> @staticmethod def __split_into_sentences(text: str) -> List[str]: - normalized_text: str = re.sub('\\.{2,}', '.', text) - sentences: List[str] = re.split('([.!?]+(?:\\s+|$))', normalized_text) + normalized_text: str = re.sub(r'\.{2,}', '.', text) + sentences: List[str] = re.split(r'([.!?]+(?:\s+|$))', normalized_text) result: List[str] = [] for i in range(0, len(sentences) - 1, 2): s: str = (sentences[i] + sentences[i + 1]).strip() diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 947a48bd5..890408d8b 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -19,117 +19,108 @@ from preprocessor.services.transcription.whisper 
import Whisper -class TranscriptionStep(PipelineStep[AudioArtifact, TranscriptionData, WhisperTranscriptionConfig]): +class TranscriptionStep( + PipelineStep[AudioArtifact, TranscriptionData, WhisperTranscriptionConfig], +): def __init__(self, config: WhisperTranscriptionConfig) -> None: super().__init__(config) self.__whisper: Optional[Whisper] = None - def get_output_descriptors(self) -> List[JsonFileOutput]: - return [ - JsonFileOutput( - pattern="{season}/{episode}.json", - subdir="transcriptions", - min_size_bytes=50, - ), - ] - - @property - def name(self) -> str: - return 'transcription' - @property def supports_batch_processing(self) -> bool: return True def setup_resources(self, context: ExecutionContext) -> None: if self.__whisper is None: - context.logger.info(f'Loading Whisper model: {self.config.model}') - self.__whisper = Whisper( - model=self.config.model, - language=self.config.language, - device=self.config.device, - beam_size=self.config.beam_size, - ) - - def execute_batch( - self, input_data: List[AudioArtifact], context: ExecutionContext, - ) -> List[TranscriptionData]: - return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.__execute_single, - ) + self.__load_whisper(context) def teardown_resources(self, context: ExecutionContext) -> None: if self.__whisper: - self.__whisper.cleanup() - self.__whisper = None - context.logger.info('Whisper model unloaded') + self.__unload_whisper(context) def cleanup(self) -> None: - if self.__whisper: - self.__whisper.cleanup() - self.__whisper = None + self.__unload_whisper() - def execute(self, input_data: AudioArtifact, context: ExecutionContext) -> TranscriptionData: - output_path = self.__resolve_output_path( - context, - input_data.episode_info.season, - input_data.episode_info.episode, + def execute_batch( + self, input_data: List[AudioArtifact], context: ExecutionContext, + ) -> List[TranscriptionData]: + return self._execute_with_threadpool( + input_data, 
context, self.config.max_parallel_episodes, self.execute, ) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): - return self.__construct_cached_result(output_path, input_data) + def _process( + self, input_data: AudioArtifact, context: ExecutionContext, + ) -> TranscriptionData: + output_path = self._get_cache_path(input_data, context) - self.__prepare_whisper_model() - context.logger.info( - f'Transcribing {input_data.episode_id} using Whisper {self.config.model}', - ) - context.mark_step_started(self.name, input_data.episode_id) + if self.__whisper is None: + self.__load_whisper(context) - result = self.__process_audio_transcription(input_data, output_path, context) + result = self.__transcribe_and_save(input_data, output_path, context) - context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_result_artifact(output_path, input_data, result) - def __execute_single( - self, input_data: AudioArtifact, context: ExecutionContext, - ) -> TranscriptionData: - output_path = self.__resolve_output_path( - context, - input_data.episode_info.season, - input_data.episode_info.episode, - ) + def _get_output_descriptors(self) -> List[JsonFileOutput]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="transcriptions", + min_size_bytes=50, + ), + ] - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached transcription'): - return self.__construct_cached_result(output_path, input_data) + def _get_cache_path( + self, input_data: AudioArtifact, context: ExecutionContext, + ) -> Path: + return self._get_standard_cache_path(input_data, context) - context.logger.info( - f'Transcribing {input_data.episode_id} using Whisper {self.config.model}', + def _load_from_cache( + self, + cache_path: Path, + input_data: AudioArtifact, + context: ExecutionContext, + ) -> TranscriptionData: + return TranscriptionData( + episode_id=input_data.episode_id, + 
episode_info=input_data.episode_info, + path=cache_path, + language=self.config.language, + model=self.config.model, + format='json', ) - context.mark_step_started(self.name, input_data.episode_id) - result = self.__process_audio_transcription(input_data, output_path, context) + def __load_whisper(self, context: Optional[ExecutionContext] = None) -> None: + if context: + context.logger.info(f'Loading Whisper model: {self.config.model}') - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_result_artifact(output_path, input_data, result) + self.__whisper = Whisper( + model=self.config.model, + language=self.config.language, + device=self.config.device, + beam_size=self.config.beam_size, + ) - def __prepare_whisper_model(self) -> None: - if self.__whisper is None: - self.__whisper = Whisper( - model=self.config.model, - language=self.config.language, - device=self.config.device, - beam_size=self.config.beam_size, - ) + def __unload_whisper(self, context: Optional[ExecutionContext] = None) -> None: + if self.__whisper: + self.__whisper.cleanup() + self.__whisper = None + if context: + context.logger.info('Whisper model unloaded') - def __process_audio_transcription( + def __transcribe_and_save( self, input_data: AudioArtifact, output_path: Path, context: ExecutionContext, ) -> Dict[str, Any]: try: + if self.__whisper is None: + raise RuntimeError("Whisper model not initialized") + result: Dict[str, Any] = self.__whisper.transcribe(input_data.path) - result['episode_info'] = EpisodeManager.get_metadata(input_data.episode_info) + result['episode_info'] = EpisodeManager.get_metadata( + input_data.episode_info, + ) FileOperations.atomic_write_json(output_path, result) return result except Exception as e: @@ -140,18 +131,6 @@ def __process_audio_transcription( output_path.unlink() raise - def __construct_cached_result( - self, output_path: Path, input_data: AudioArtifact, - ) -> TranscriptionData: - return TranscriptionData( - 
episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - language=self.config.language, - model=self.config.model, - format='json', - ) - def __construct_result_artifact( self, output_path: Path, @@ -166,15 +145,3 @@ def __construct_result_artifact( model=self.config.model, format='json', ) - - def __resolve_output_path( - self, - context: ExecutionContext, - season: int, - episode: int, - ) -> Path: - return self._resolve_output_path( - 0, - context, - {"season": season, "episode": episode}, - ) diff --git a/preprocessor/steps/validation/validator_step.py b/preprocessor/steps/validation/validator_step.py index d909ae115..0467d25d3 100644 --- a/preprocessor/steps/validation/validator_step.py +++ b/preprocessor/steps/validation/validator_step.py @@ -19,6 +19,10 @@ def name(self) -> str: def supports_batch_processing(self) -> bool: return True + @property + def uses_caching(self) -> bool: + return False + def execute_batch( self, input_data: List[ElasticDocuments], context: ExecutionContext, ) -> List[ValidationResult]: @@ -26,7 +30,7 @@ def execute_batch( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def execute( + def _process( self, input_data: ElasticDocuments, context: ExecutionContext, diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index beaea3b21..044bd87ed 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -7,7 +7,6 @@ Any, Dict, List, - Tuple, ) from PIL import Image @@ -38,9 +37,6 @@ def __init__(self, config: FrameExportConfig) -> None: self.config.keyframe_strategy, self.config.frames_per_scene, ) - def get_output_descriptors(self) -> List[DirectoryOutput]: - return [create_frames_output()] - @property def name(self) -> str: return 'frame_export' @@ -56,24 +52,23 @@ def execute_batch( input_data, context, self.config.max_parallel_episodes, self.execute, ) - def 
execute( - self, input_data: SceneCollection, context: ExecutionContext, + def _process( + self, input_data: SceneCollection, context: ExecutionContext, ) -> FrameCollection: - episode_dir, metadata_file = self.__resolve_output_paths(input_data, context) - - if self._check_cache_validity(metadata_file, context, input_data.episode_id, 'cached'): - return self.__load_cached_result(metadata_file, episode_dir, input_data) + metadata_file = self._get_cache_path(input_data, context) + episode_dir = metadata_file.parent self.__prepare_episode_directory(episode_dir, context) frame_requests = self.__extract_frame_requests(input_data) if not frame_requests: - return self.__construct_empty_result(episode_dir, metadata_file, input_data, context) + return self.__construct_empty_result( + episode_dir, metadata_file, input_data, context, + ) context.logger.info( f'Extracting {len(frame_requests)} keyframes from {input_data.video_path.name}', ) - context.mark_step_started(self.name, input_data.episode_id) self.__process_frame_extraction( input_data.video_path, @@ -84,7 +79,6 @@ def execute( context, ) - context.mark_step_completed(self.name, input_data.episode_id) return FrameCollection( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -93,7 +87,36 @@ def execute( metadata_path=metadata_file, ) - def __extract_frame_requests(self, input_data: SceneCollection) -> List[FrameRequest]: + def _get_output_descriptors(self) -> List[DirectoryOutput]: + return [create_frames_output()] + + def _get_cache_path( + self, input_data: SceneCollection, context: ExecutionContext, + ) -> Path: + episode_dir = self._get_standard_cache_path(input_data, context) + metadata_filename = ( + f'{context.series_name}_' + f'{input_data.episode_info.episode_code()}_frame_metadata.json' + ) + return episode_dir / metadata_filename + + def _load_from_cache( + self, cache_path: Path, input_data: SceneCollection, context: ExecutionContext, + ) -> FrameCollection: + episode_dir = 
cache_path.parent + with open(cache_path, 'r', encoding='utf-8') as f: + metadata = json.load(f) + return FrameCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + directory=episode_dir, + frame_count=metadata['statistics']['total_frames'], + metadata_path=cache_path, + ) + + def __extract_frame_requests( + self, input_data: SceneCollection, + ) -> List[FrameRequest]: video_path = input_data.video_path if not video_path.exists(): raise FileNotFoundError(f'Video file not found for frame export: {video_path}') @@ -101,20 +124,28 @@ def __extract_frame_requests(self, input_data: SceneCollection) -> List[FrameReq return self.__strategy.extract_frame_requests(video_path, data) def __process_frame_extraction( - self, - video_path: Path, - frame_requests: List[FrameRequest], - episode_dir: Path, - input_data: SceneCollection, - metadata_file: Path, - context: ExecutionContext, + self, + video_path: Path, + frame_requests: List[FrameRequest], + episode_dir: Path, + input_data: SceneCollection, + metadata_file: Path, + context: ExecutionContext, ) -> None: try: self.__extract_frames( - video_path, frame_requests, episode_dir, input_data.episode_info, context, + video_path, + frame_requests, + episode_dir, + input_data.episode_info, + context, ) self.__write_metadata( - frame_requests, input_data.episode_info, video_path, context, metadata_file, + frame_requests, + input_data.episode_info, + video_path, + context, + metadata_file, ) except Exception as e: context.logger.error(f'Failed to extract frames from {video_path}: {e}') @@ -122,12 +153,12 @@ def __process_frame_extraction( raise def __extract_frames( - self, - video_file: Path, - frame_requests: List[FrameRequest], - episode_dir: Path, - episode_info, - context: ExecutionContext, + self, + video_file: Path, + frame_requests: List[FrameRequest], + episode_dir: Path, + episode_info, + context: ExecutionContext, ) -> None: video_metadata = self.__fetch_video_metadata(video_file) dar = 
self.__calculate_display_aspect_ratio(video_metadata) @@ -136,19 +167,24 @@ def __extract_frames( for req in frame_requests: frame_num = req['frame_number'] self.__extract_and_save_frame( - vr, frame_num, episode_dir, episode_info, dar, context.series_name, + vr, + frame_num, + episode_dir, + episode_info, + dar, + context.series_name, ) del vr def __extract_and_save_frame( - self, - vr: decord.VideoReader, - frame_num: int, - episode_dir: Path, - episode_info, - dar: float, - series_name: str, + self, + vr: decord.VideoReader, + frame_num: int, + episode_dir: Path, + episode_info, + dar: float, + series_name: str, ) -> None: frame_np = vr[frame_num].asnumpy() frame_pil = Image.fromarray(frame_np) @@ -161,13 +197,17 @@ def __extract_and_save_frame( with StepTempFile(final_path) as temp_path: resized.save(temp_path, quality=90) - def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Image.Image: + def __resize_frame( + self, frame: Image.Image, display_aspect_ratio: float, + ) -> Image.Image: target_width = self.config.resolution.width target_height = self.config.resolution.height target_aspect = target_width / target_height if abs(display_aspect_ratio - target_aspect) < 0.01: - return frame.resize((target_width, target_height), Image.Resampling.LANCZOS) + return frame.resize( + (target_width, target_height), Image.Resampling.LANCZOS, + ) if display_aspect_ratio > target_aspect: new_height = target_height @@ -185,12 +225,12 @@ def __resize_frame(self, frame: Image.Image, display_aspect_ratio: float) -> Ima return result def __write_metadata( - self, - frame_requests: List[FrameRequest], - episode_info, - source_video: Path, - context: ExecutionContext, - metadata_file: Path, + self, + frame_requests: List[FrameRequest], + episode_info, + source_video: Path, + context: ExecutionContext, + metadata_file: Path, ) -> None: frame_types_count: Dict[str, int] = {} frames_with_paths: List[Dict[str, Any]] = [] @@ -230,60 +270,35 @@ def 
__write_metadata( 'frame_types': frame_types_count, 'total_scenes': len(scene_numbers), 'timestamp_range': { - 'start': min((f.get('timestamp', 0) for f in frame_requests), default=0), - 'end': max((f.get('timestamp', 0) for f in frame_requests), default=0), + 'start': min( + (f.get('timestamp', 0) for f in frame_requests), default=0, + ), + 'end': max( + (f.get('timestamp', 0) for f in frame_requests), default=0, + ), }, }, 'frames': frames_with_paths, } FileOperations.atomic_write_json(metadata_file, metadata, indent=2) - def __resolve_output_paths( - self, - input_data: SceneCollection, - context: ExecutionContext, - ) -> Tuple[Path, Path]: - episode_dir = self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode': input_data.episode_info.episode_code(), - }, - ) - metadata_filename = f'{context.series_name}_{input_data.episode_info.episode_code()}_frame_metadata.json' - metadata_file = episode_dir / metadata_filename - return episode_dir, metadata_file - @staticmethod - def __load_cached_result( - metadata_file: Path, - episode_dir: Path, - input_data: SceneCollection, - ) -> FrameCollection: - with open(metadata_file, 'r', encoding='utf-8') as f: - metadata = json.load(f) - return FrameCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - directory=episode_dir, - frame_count=metadata['statistics']['total_frames'], - metadata_path=metadata_file, - ) - - @staticmethod - def __prepare_episode_directory(episode_dir: Path, context: ExecutionContext) -> None: + def __prepare_episode_directory( + episode_dir: Path, context: ExecutionContext, + ) -> None: if episode_dir.exists(): - context.logger.info(f'Cleaning incomplete frames from previous run: {episode_dir}') + context.logger.info( + f'Cleaning incomplete frames from previous run: {episode_dir}', + ) shutil.rmtree(episode_dir, ignore_errors=True) episode_dir.mkdir(parents=True, exist_ok=True) @staticmethod def 
__construct_empty_result( - episode_dir: Path, - metadata_file: Path, - input_data: SceneCollection, - context: ExecutionContext, + episode_dir: Path, + metadata_file: Path, + input_data: SceneCollection, + context: ExecutionContext, ) -> FrameCollection: context.logger.warning(f'No frames to extract for {input_data.episode_id}') return FrameCollection( @@ -298,7 +313,8 @@ def __construct_empty_result( def __fetch_video_metadata(video_path: Path) -> Dict[str, Any]: cmd = [ 'ffprobe', '-v', 'error', '-select_streams', 'v:0', - '-show_entries', 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', + '-show_entries', + 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', '-of', 'json', str(video_path), ] result = subprocess.run(cmd, capture_output=True, text=True, check=True) diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index 3311adac7..82412058d 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -34,28 +34,12 @@ def name(self) -> str: def supports_batch_processing(self) -> bool: return True - def get_output_descriptors(self) -> List[OutputDescriptor]: - return [ - JsonFileOutput( - pattern="{season}/{episode}.json", - subdir="scene_detections", - min_size_bytes=10, - ), - ] - def setup_resources(self, context: ExecutionContext) -> None: if not self.__model_loaded: context.logger.info('Loading TransNetV2 model...') self.__transnet.load_model() self.__model_loaded = True - def execute_batch( - self, input_data: List[TranscodedVideo], context: ExecutionContext, - ) -> List[SceneCollection]: - return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.__execute_single, - ) - def teardown_resources(self, context: ExecutionContext) -> None: if self.__model_loaded: self.__transnet.cleanup() @@ -67,41 +51,48 @@ def cleanup(self) -> None: self.__transnet.cleanup() 
self.__model_loaded = False - def execute( - self, input_data: TranscodedVideo, context: ExecutionContext, - ) -> SceneCollection: - output_path = self.__resolve_output_path(input_data, context) + def execute_batch( + self, input_data: List[TranscodedVideo], context: ExecutionContext, + ) -> List[SceneCollection]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): - return self.__load_cached_result(output_path, input_data) + def _process( + self, input_data: TranscodedVideo, context: ExecutionContext, + ) -> SceneCollection: + output_path = self._get_cache_path(input_data, context) self.__prepare_detection_environment(context) - - context.logger.info(f'Detecting scenes in {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - scenes = self.__detect_scenes(input_data.path) - self.__save_detection_results(scenes, input_data.path, output_path) - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_scene_collection(output_path, input_data, scenes) - - def __execute_single( - self, input_data: TranscodedVideo, context: ExecutionContext, - ) -> SceneCollection: - output_path = self.__resolve_output_path(input_data, context) + # Retrieve video info needed for the output payload + video_info = self.__transnet.get_video_info(input_data.path) + self.__save_detection_results(scenes, video_info, output_path) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): - return self.__load_cached_result(output_path, input_data) + return self.__construct_scene_collection(output_path, input_data, scenes) - context.logger.info(f'Detecting scenes in {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + 
pattern="{season}/{episode}.json", + subdir="scene_detections", + min_size_bytes=10, + ), + ] - scenes = self.__detect_scenes(input_data.path) - self.__save_detection_results(scenes, input_data.path, output_path) + def _get_cache_path( + self, input_data: TranscodedVideo, context: ExecutionContext, + ) -> Path: + return self._get_standard_cache_path(input_data, context) - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_scene_collection(output_path, input_data, scenes) + def _load_from_cache( + self, cache_path: Path, input_data: TranscodedVideo, context: ExecutionContext, + ) -> SceneCollection: + scenes_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return self.__construct_scene_collection( + cache_path, input_data, scenes_data.get('scenes', []), + ) def __prepare_detection_environment(self, context: ExecutionContext) -> None: if not self.__model_loaded: @@ -117,19 +108,18 @@ def __detect_scenes(self, video_path: Path) -> List[Dict[str, Any]]: ) def __save_detection_results( - self, - scenes: List[Dict[str, Any]], - video_path: Path, - output_path: Path, + self, + scenes: List[Dict[str, Any]], + video_info: Dict[str, Any], + output_path: Path, ) -> None: - video_info = self.__transnet.get_video_info(video_path) output_data = self.__build_results_payload(scenes, video_info) FileOperations.atomic_write_json(output_path, output_data) def __build_results_payload( - self, - scenes: List[Dict[str, Any]], - video_info: Dict[str, Any], + self, + scenes: List[Dict[str, Any]], + video_info: Dict[str, Any], ) -> Dict[str, Any]: return { 'total_scenes': len(scenes), @@ -142,27 +132,11 @@ def __build_results_payload( 'scenes': scenes, } - def __load_cached_result( - self, - output_path: Path, - input_data: TranscodedVideo, - ) -> SceneCollection: - scenes_data = FileOperations.load_json(output_path) - return SceneCollection( - path=output_path, - video_path=input_data.path, - episode_id=input_data.episode_id, - 
episode_info=input_data.episode_info, - scenes=scenes_data.get('scenes', []), - threshold=self.config.threshold, - min_scene_len=self.config.min_scene_len, - ) - def __construct_scene_collection( - self, - output_path: Path, - input_data: TranscodedVideo, - scenes: List[Dict[str, Any]], + self, + output_path: Path, + input_data: TranscodedVideo, + scenes: List[Dict[str, Any]], ) -> SceneCollection: return SceneCollection( path=output_path, @@ -173,17 +147,3 @@ def __construct_scene_collection( threshold=self.config.threshold, min_scene_len=self.config.min_scene_len, ) - - def __resolve_output_path( - self, - input_data: TranscodedVideo, - context: ExecutionContext, - ) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode': input_data.episode_info.episode_code(), - }, - ) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 949d321e1..2157a80fb 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -22,21 +22,12 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeConfig]): - __CODEC_EFFICIENCY = { + __CODEC_EFFICIENCY: Dict[str, float] = { 'h264': 1.0, 'avc': 1.0, 'hevc': 2.0, 'h265': 2.0, 'vp9': 2.85, 'av1': 4.0, } - __command_logged = False - - def get_output_descriptors(self) -> List[FileOutput]: - return [ - FileOutput( - pattern="{season}/{episode}.mp4", - subdir="transcoded_videos", - min_size_bytes=1024*1024, - ), - ] + __command_logged: bool = False @property def name(self) -> str: @@ -46,16 +37,15 @@ def name(self) -> str: def supports_batch_processing(self) -> bool: return True - def execute_batch(self, input_data: List[SourceVideo], context: ExecutionContext) -> List[TranscodedVideo]: + def execute_batch( + self, input_data: List[SourceVideo], context: ExecutionContext, + ) -> List[TranscodedVideo]: return self._execute_with_threadpool( 
input_data, context, self.config.max_parallel_episodes, self.execute, ) - def execute(self, input_data: SourceVideo, context: ExecutionContext) -> TranscodedVideo: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'output exists'): - return self.__construct_result_artifact(output_path, input_data) + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> TranscodedVideo: + output_path = self._get_cache_path(input_data, context) probe_data = FFmpegWrapper.probe_video(input_data.path) params = self.__create_transcode_params(input_data, output_path, probe_data, context) @@ -63,16 +53,46 @@ def execute(self, input_data: SourceVideo, context: ExecutionContext) -> Transco self.__log_transcode_details(context, input_data, params, probe_data) self.__execute_ffmpeg_process(context, params, input_data.episode_id) - context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_result_artifact(output_path, input_data) + def _get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{series_name}_{episode}.mp4", + subdir="transcoded_videos", + min_size_bytes=1024 * 1024, + ), + ] + + def _get_cache_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode': input_data.episode_info.episode_code(), + 'series_name': context.series_name, + }, + ) + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> TranscodedVideo: + return self.__construct_result_artifact(cache_path, input_data) + def __create_transcode_params( - self, input_data: SourceVideo, output_path: Path, probe_data: Dict[str, Any], context: ExecutionContext, + self, + input_data: SourceVideo, + output_path: Path, + probe_data: Dict[str, Any], + context: 
ExecutionContext, ) -> TranscodeParams: target_fps = self.__resolve_target_framerate() is_upscaling, src_px, target_px = self.__analyze_resolution_scaling(probe_data) - bitrates = self.__compute_all_bitrate_settings(probe_data, context, is_upscaling, src_px, target_px) + bitrates = self.__compute_all_bitrate_settings( + probe_data, context, is_upscaling, src_px, target_px, + ) return TranscodeParams( input_path=input_data.path, @@ -103,8 +123,12 @@ def __analyze_resolution_scaling(self, probe_data: Dict[str, Any]) -> Tuple[bool return src_px < target_px, src_px, target_px def __compute_all_bitrate_settings( - self, probe_data: Dict[str, Any], context: ExecutionContext, - is_up: bool, src_px: int, target_px: int, + self, + probe_data: Dict[str, Any], + context: ExecutionContext, + is_up: bool, + src_px: int, + target_px: int, ) -> Dict[str, float]: src_v = FFmpegWrapper.get_video_bitrate(probe_data) target_max = self.config.video_bitrate_mbps @@ -116,7 +140,7 @@ def __compute_all_bitrate_settings( ratio = target_px / src_px exp = self.__calculate_scaling_exponent(ratio, is_up) - scaled_raw = norm_v * (ratio ** exp) + scaled_raw = norm_v * (ratio**exp) scaled_min = self.__apply_min_upscale_constraint(scaled_raw, target_max, is_up) final_v = min(scaled_min, target_max) @@ -127,8 +151,7 @@ def __compute_all_bitrate_settings( return self.__scale_bitrate_limits(final_v / target_max) def __get_normalized_bitrate( - self, src_v: float, probe: Dict[str, Any], is_up: bool, - context: ExecutionContext, + self, src_v: float, probe: Dict[str, Any], is_up: bool, context: ExecutionContext, ) -> float: if not is_up: return src_v @@ -140,7 +163,8 @@ def __get_normalized_bitrate( if mult != 1.0: norm = src_v * mult context.logger.info( - f'Codec: {src_codec.upper()}->{tgt_codec.upper()} ({mult}x) | {src_v:.2f}->{norm:.2f} Mbps', + f'Codec: {src_codec.upper()}->{tgt_codec.upper()} ({mult}x) | ' + f'{src_v:.2f}->{norm:.2f} Mbps', ) return norm return src_v @@ -166,16 +190,8 @@ def 
__build_fallback_bitrates(self, target_max: float) -> Dict[str, float]: "buf": self.config.calculate_bufsize_mbps(), } - @staticmethod - def __calculate_scaling_exponent(ratio: float, is_up: bool) -> float: - log_r = math.log10(max(ratio, 0.01)) - if is_up: - return 0.8 + min(log_r, 1.0) * 0.35 - return 0.8 + max(log_r, -2.0) * 0.175 - def __resolve_deinterlacing_strategy( - self, input_data: SourceVideo, context: ExecutionContext, - probe: Dict[str, Any], + self, input_data: SourceVideo, context: ExecutionContext, probe: Dict[str, Any], ) -> bool: if self.config.force_deinterlace: return True @@ -194,7 +210,9 @@ def __compute_audio_bitrate(self, probe: Dict[str, Any], context: ExecutionConte return adj return tgt_a - def __execute_ffmpeg_process(self, context: ExecutionContext, params: TranscodeParams, ep_id: str) -> None: + def __execute_ffmpeg_process( + self, context: ExecutionContext, params: TranscodeParams, ep_id: str, + ) -> None: with StepTempFile(params.output_path) as temp_path: temp_params = replace(params, output_path=temp_path) context.mark_step_started(self.name, ep_id, [str(temp_path)]) @@ -203,10 +221,38 @@ def __execute_ffmpeg_process(self, context: ExecutionContext, params: TranscodeP context.logger.info('=' * 20 + ' FFmpeg ' + '=' * 20) FFmpegWrapper.transcode(temp_params) + def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> TranscodedVideo: + return TranscodedVideo( + path=path, + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', + codec=self.config.codec, + ) + + @staticmethod + def __should_log_command() -> bool: + if not VideoTranscoderStep.__command_logged: + VideoTranscoderStep.__command_logged = True + return True + return False + + @staticmethod + def __calculate_scaling_exponent(ratio: float, is_up: bool) -> float: + log_r = math.log10(max(ratio, 0.01)) + if is_up: + return 0.8 + min(log_r, 1.0) * 0.35 + 
return 0.8 + max(log_r, -2.0) * 0.175 + @staticmethod def __normalize_codec_name(codec: str) -> str: name = codec.lower() - mapping = {'h264': ('h264', 'avc'), 'hevc': ('h265', 'hevc'), 'vp9': ('vp9',), 'av1': ('av1',)} + mapping = { + 'h264': ('h264', 'avc'), + 'hevc': ('h265', 'hevc'), + 'vp9': ('vp9',), + 'av1': ('av1',), + } for norm, patterns in mapping.items(): if any(p in name for p in patterns): return norm @@ -214,51 +260,45 @@ def __normalize_codec_name(codec: str) -> str: @staticmethod def __get_codec_efficiency_multiplier(src: str, tgt: str) -> float: - return VideoTranscoderStep.__CODEC_EFFICIENCY.get(src, 1.0) / VideoTranscoderStep.__CODEC_EFFICIENCY.get( - tgt, - 1.0, - ) - - def __resolve_output_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode': input_data.episode_info.episode_code(), - }, - ) - - def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> TranscodedVideo: - return TranscodedVideo( - path=path, episode_id=input_data.episode_id, episode_info=input_data.episode_info, - resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', codec=self.config.codec, - ) + eff = VideoTranscoderStep.__CODEC_EFFICIENCY + return eff.get(src, 1.0) / eff.get(tgt, 1.0) @staticmethod - def __log_bitrate_workflow(ctx, src, norm, raw, s_min, final, limit, ratio, is_up): + def __log_bitrate_workflow( + ctx: ExecutionContext, + src: float, + norm: float, + raw: float, + s_min: float, + final: float, + limit: float, + ratio: float, + is_up: bool, + ) -> None: dir_label = "upscaling" if is_up else ("downscaling" if ratio < 1.0 else "same") min_msg = f' (MinBoost: {s_min:.2f})' if is_up and (s_min > raw) else '' - ctx.logger.info(f'[{dir_label}] {src:.2f}->{norm:.2f}->{raw:.2f}{min_msg} -> {final:.2f} Mbps (Max: {limit})') + ctx.logger.info( + f'[{dir_label}] 
{src:.2f}->{norm:.2f}->{raw:.2f}{min_msg} -> {final:.2f} Mbps ' + f'(Max: {limit})', + ) @staticmethod - def __log_transcode_details(ctx, input_data, params, probe): + def __log_transcode_details( + ctx: ExecutionContext, + input_data: SourceVideo, + params: TranscodeParams, + probe: Dict[str, Any], + ) -> None: w, h = FFmpegWrapper.get_resolution(probe) + up_label = "UP" if params.is_upscaling else "DOWN" ctx.logger.info( - f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{"UP" if params.is_upscaling else "DOWN"}]', + f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{up_label}]', ) @staticmethod - def __log_int_diagnostics(ctx, has_int, stats, order): + def __log_int_diagnostics(ctx: ExecutionContext, has_int: bool, stats: Dict[str, float], order: str) -> None: ctx.logger.info(f"Interlacing: {has_int} ({stats['ratio'] * 100:.1f}%) | {order}") @staticmethod def __resolve_target_framerate() -> float: return 24.0 - - @staticmethod - def __should_log_command() -> bool: - if not VideoTranscoderStep.__command_logged: - VideoTranscoderStep.__command_logged = True - return True - return False diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index e81e8c4dd..9415ede06 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -23,7 +23,8 @@ class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): - def get_output_descriptors(self) -> List[OutputDescriptor]: + @staticmethod + def get_output_descriptors() -> List[OutputDescriptor]: """Define output file descriptors for character detection step.""" return [ JsonFileOutput( @@ -61,7 +62,7 @@ def execute_batch( self, input_data: List[FrameCollection], context: ExecutionContext, ) -> List[DetectionResults]: return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, 
self.__execute_single, + input_data, context, self.config.max_parallel_episodes, self.execute, ) def teardown_resources(self, context: ExecutionContext) -> None: @@ -70,44 +71,35 @@ def teardown_resources(self, context: ExecutionContext) -> None: self.__face_app = None self.__character_vectors = {} - def __execute_single( + def _get_cache_path( self, input_data: FrameCollection, context: ExecutionContext, - ) -> DetectionResults: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached character detections'): - return self.__load_cached_result(output_path, input_data) - - context.logger.info(f'Detecting characters in {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - frame_files = self.__extract_frame_files(input_data) - if not frame_files: - return self.__construct_empty_result(output_path, input_data, context) - - results = self.__process_character_detection(frame_files) - self.__save_detection_results(results, output_path, input_data, context, frame_files) + ) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + }, + ) - context.mark_step_completed(self.name, input_data.episode_id) + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> DetectionResults: + detection_data: Dict[str, Any] = FileOperations.load_json(cache_path) return DetectionResults( episode_id=input_data.episode_id, episode_info=input_data.episode_info, - path=output_path, + path=cache_path, detection_type='character', - detection_count=len(results), + detection_count=len(detection_data.get('detections', [])), ) - def execute( + def _process( self, input_data: FrameCollection, context: ExecutionContext, ) -> DetectionResults: - output_path = self.__resolve_output_path(input_data, context) 
- - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached character detections'): - return self.__load_cached_result(output_path, input_data) - + output_path = self._get_cache_path(input_data, context) self.__prepare_detection_environment(context) - context.logger.info(f'Detecting characters in {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) frame_files = self.__extract_frame_files(input_data) if not frame_files: @@ -116,7 +108,6 @@ def execute( results = self.__process_character_detection(frame_files) self.__save_detection_results(results, output_path, input_data, context, frame_files) - context.mark_step_completed(self.name, input_data.episode_id) return DetectionResults( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -178,27 +169,6 @@ def __save_detection_results( } FileOperations.atomic_write_json(output_path, output_data) - def __resolve_output_path(self, input_data: FrameCollection, context: ExecutionContext) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': input_data.episode_info.episode_code(), - }, - ) - - @staticmethod - def __load_cached_result(output_path: Path, input_data: FrameCollection) -> DetectionResults: - detection_data: Dict[str, Any] = FileOperations.load_json(output_path) - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - detection_type='character', - detection_count=len(detection_data.get('detections', [])), - ) - @staticmethod def __extract_frame_files(input_data: FrameCollection) -> List[Path]: return sorted([ diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index af3964440..e796a6cec 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -1,3 +1,4 @@ +# pylint: 
disable=duplicate-code from pathlib import Path from typing import ( Any, @@ -21,25 +22,11 @@ from preprocessor.services.search.embedding_model import EmbeddingModelWrapper -# pylint: disable=duplicate-code # Pattern shared with text/embeddings_step - different data types (frames vs text) class VideoEmbeddingStep(PipelineStep[FrameCollection, EmbeddingCollection, VideoEmbeddingConfig]): def __init__(self, config: VideoEmbeddingConfig) -> None: super().__init__(config) self.__model: Optional[EmbeddingModelWrapper] = None - def get_output_descriptors(self) -> List[FileOutput]: - return [ - FileOutput( - pattern="{season}/{episode}.npy", - subdir="embeddings/vision", - min_size_bytes=1024, - ), - ] - - @property - def name(self) -> str: - return 'video_embedding' - @property def supports_batch_processing(self) -> bool: return True @@ -50,11 +37,6 @@ def setup_resources(self, context: ExecutionContext) -> None: self.__model = EmbeddingModelWrapper(self.config.model_name, self.config.device) self.__model.load_model() - def execute_batch( - self, input_data: List[FrameCollection], context: ExecutionContext, - ) -> List[EmbeddingCollection]: - return self._execute_sequential(input_data, context, self.__execute_single) - def teardown_resources(self, context: ExecutionContext) -> None: if self.__model: self.__model.cleanup() @@ -66,62 +48,63 @@ def cleanup(self) -> None: self.__model.cleanup() self.__model = None - def execute( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> EmbeddingCollection: - output_path = self.__resolve_output_path( - context, - input_data.episode_info.season, - input_data.episode_info.episode, - ) + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached video embeddings'): - return 
self.__load_cached_result(output_path, input_data) + def _process( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) frame_requests = self.__extract_frame_requests(input_data, context) if not frame_requests: - return self.__construct_embedding_collection(input_data, output_path, 0) + return self.__construct_embedding_collection( + input_data, output_path, 0, self.config.model_name, + ) self.__prepare_embedding_model(context) context.logger.info( f'Generating video embeddings for {len(frame_requests)} frames in {input_data.episode_id}', ) - context.mark_step_started(self.name, input_data.episode_id) image_hashes = self.__fetch_image_hashes(input_data, context) results = self.__generate_embeddings(frame_requests, input_data, image_hashes) self.__save_embedding_results(results, output_path, input_data, image_hashes) - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_embedding_collection(input_data, output_path, len(results)) + return self.__construct_embedding_collection( + input_data, output_path, len(results), self.config.model_name, + ) - def __execute_single( + def _get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/vision", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( self, input_data: FrameCollection, context: ExecutionContext, - ) -> EmbeddingCollection: - output_path = self.__resolve_output_path( + ) -> Path: + return self._resolve_output_path( + 0, context, - input_data.episode_info.season, - input_data.episode_info.episode, + self.__create_path_variables(input_data), ) - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached video embeddings'): - return self.__load_cached_result(output_path, input_data) - - frame_requests = self.__extract_frame_requests(input_data, context) - if not frame_requests: - 
return self.__construct_embedding_collection(input_data, output_path, 0) - - context.logger.info( - f'Generating video embeddings for {len(frame_requests)} frames in {input_data.episode_id}', + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> EmbeddingCollection: + emb_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return self.__construct_embedding_collection( + input_data, + cache_path, + len(emb_data.get('video_embeddings', [])), + self.config.model_name, ) - context.mark_step_started(self.name, input_data.episode_id) - - image_hashes = self.__fetch_image_hashes(input_data, context) - results = self.__generate_embeddings(frame_requests, input_data, image_hashes) - self.__save_embedding_results(results, output_path, input_data, image_hashes) - - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_embedding_collection(input_data, output_path, len(results)) def __prepare_embedding_model(self, context: ExecutionContext) -> None: if self.__model is None: @@ -130,17 +113,22 @@ def __prepare_embedding_model(self, context: ExecutionContext) -> None: self.__model.load_model() def __generate_embeddings( - self, - frame_requests: List[Dict[str, Any]], - input_data: FrameCollection, - image_hashes: Dict[int, str], + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, + image_hashes: Dict[int, str], ) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] batch_size: int = self.config.batch_size + if not self.__model: + raise RuntimeError("Embedding model not initialized") + for i in range(0, len(frame_requests), batch_size): - batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] - image_paths: List[str] = [str(input_data.directory / f['frame_path']) for f in batch] + batch: List[Dict[str, Any]] = frame_requests[i : i + batch_size] + image_paths: List[str] = [ + str(input_data.directory / f['frame_path']) for f in batch + ] 
batch_embeddings: List[np.ndarray] = self.__model.encode_images(image_paths) for request, embedding in zip(batch, batch_embeddings): @@ -152,74 +140,21 @@ def __generate_embeddings( return results - def __load_cached_result( - self, - output_path: Path, - input_data: FrameCollection, - ) -> EmbeddingCollection: - emb_data: Dict[str, Any] = FileOperations.load_json(output_path) - return self.__construct_embedding_collection( - input_data, - output_path, - len(emb_data.get('video_embeddings', [])), - ) - - def __construct_embedding_collection( - self, - input_data: FrameCollection, - output_path: Path, - embedding_count: int, - ) -> EmbeddingCollection: - # Similar pattern exists in text/embeddings_step.py but with different input type (FrameCollection vs TranscriptionData) - # and embedding_type ('video' vs 'text'). Not truly duplicated - both use the same MetadataBuilder method. - return MetadataBuilder.create_embedding_collection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - model_name=self.config.model_name, - embedding_count=embedding_count, - embedding_type='video', - ) - @staticmethod - def __save_embedding_results( - results: List[Dict[str, Any]], - output_path: Path, - input_data: FrameCollection, - image_hashes: Dict[int, str], - ) -> None: - statistics = { - 'total_embeddings': len(results), - 'embedding_dimension': len(results[0]['embedding']) if results else 0, - 'frames_with_hash': len(image_hashes), + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), } - output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( - episode_info=input_data.episode_info, - processing_params={}, - statistics=statistics, - results_key='video_embeddings', - results_data=results, - ) - FileOperations.atomic_write_json(output_path, output_data) - - def 
__resolve_output_path( - self, - context: ExecutionContext, - season: int, - episode: int, - ) -> Path: - return self._resolve_output_path( - 0, - context, - {"season": season, "episode": episode}, - ) @staticmethod def __extract_frame_requests( - input_data: FrameCollection, - context: ExecutionContext, + input_data: FrameCollection, + context: ExecutionContext, ) -> List[Dict[str, Any]]: - frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) + frame_metadata: Dict[str, Any] = FileOperations.load_json( + input_data.metadata_path, + ) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) if not frame_requests: context.logger.warning(f'No frames for embedding in {input_data.episode_id}') @@ -227,18 +162,62 @@ def __extract_frame_requests( @staticmethod def __fetch_image_hashes( - input_data: FrameCollection, context: ExecutionContext, + input_data: FrameCollection, context: ExecutionContext, ) -> Dict[int, str]: - filename_base = f'{context.series_name}_{input_data.episode_info.episode_code()}' + filename_base = ( + f'{context.series_name}_{input_data.episode_info.episode_code()}' + ) hash_filename: str = f'{filename_base}_image_hashes.json' - hash_path: Path = context.get_output_path(input_data.episode_info, 'image_hashes', hash_filename) + hash_path: Path = context.get_output_path( + input_data.episode_info, 'image_hashes', hash_filename, + ) if not hash_path.exists(): return {} try: data: Dict[str, Any] = FileOperations.load_json(hash_path) - return {h['frame_number']: h['perceptual_hash'] for h in data.get('hashes', [])} + return { + h['frame_number']: h['perceptual_hash'] + for h in data.get('hashes', []) + } except Exception as e: context.logger.warning(f'Could not load image hashes from {hash_path}: {e}') return {} + + @staticmethod + def __save_embedding_results( + results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + image_hashes: Dict[int, str], + ) -> None: + statistics = { + 
'total_embeddings': len(results), + 'embedding_dimension': len(results[0]['embedding']) if results else 0, + 'frames_with_hash': len(image_hashes), + } + output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( + episode_info=input_data.episode_info, + processing_params={}, + statistics=statistics, + results_key='video_embeddings', + results_data=results, + ) + FileOperations.atomic_write_json(output_path, output_data) + + @staticmethod + def __construct_embedding_collection( + input_data: FrameCollection, + output_path: Path, + embedding_count: int, + model_name: str, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=model_name, + embedding_count=embedding_count, + embedding_type='video', + ) diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index c77813f6e..ce292e0cb 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -1,3 +1,4 @@ +# pylint: disable=duplicate-code from pathlib import Path from typing import ( Any, @@ -27,39 +28,10 @@ class EmotionDetectionStep(PipelineStep[FrameCollection, EmotionData, EmotionDetectionConfig]): - def get_output_descriptors(self) -> List[OutputDescriptor]: - """Define output file descriptors for emotion detection step.""" - return [ - JsonFileOutput( - subdir="detections/emotions", - pattern="{season}/{episode}.json", - min_size_bytes=10, - ), - ] - - def __resolve_output_path(self, input_data: FrameCollection, context: ExecutionContext) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': input_data.episode_info.episode_code(), - }, - ) - - - def __init__(self, config: EmotionDetectionConfig) -> None: super().__init__(config) self.__model: 
Optional[HSEmotionRecognizer] = None - @property - def name(self) -> str: - return 'emotion_detection' - - def cleanup(self) -> None: - self.__model = None - @property def supports_batch_processing(self) -> bool: return True @@ -69,26 +41,26 @@ def setup_resources(self, context: ExecutionContext) -> None: context.logger.info('Loading HSEmotion model...') self.__model = EmotionDetector.init_model(context.logger) + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + context.logger.info('HSEmotion model unloaded') + self.__model = None + + def cleanup(self) -> None: + self.__model = None + def execute_batch( self, input_data: List[FrameCollection], context: ExecutionContext, ) -> List[EmotionData]: return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.__execute_single, + input_data, context, self.config.max_parallel_episodes, self.execute, ) - def teardown_resources(self, context: ExecutionContext) -> None: - if self.__model: - context.logger.info('HSEmotion model unloaded') - self.__model = None - - def __execute_single( + def _process( self, input_data: FrameCollection, context: ExecutionContext, ) -> EmotionData: input_path = self.__resolve_input_path(input_data, context) - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached emotion detection'): - return self.__construct_emotion_data(input_data, output_path) + output_path = self._get_cache_path(input_data, context) if not input_path.exists(): context.logger.warning( @@ -96,40 +68,36 @@ def __execute_single( ) return self.__construct_emotion_data(input_data, output_path) - context.logger.info(f'Detecting emotions for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) + self.__prepare_emotion_model(context) detections_data = FileOperations.load_json(input_path) 
self.__process_and_update_emotions(detections_data, input_data, context) FileOperations.atomic_write_json(output_path, detections_data) - context.mark_step_completed(self.name, input_data.episode_id) return self.__construct_emotion_data(input_data, output_path) - def execute(self, input_data: FrameCollection, context: ExecutionContext) -> EmotionData: - input_path = self.__resolve_input_path(input_data, context) - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached emotion detection'): - return self.__construct_emotion_data(input_data, output_path) - - if not input_path.exists(): - context.logger.warning( - f'No character detections found for emotion analysis: {input_path}', - ) - return self.__construct_emotion_data(input_data, output_path) - - context.logger.info(f'Detecting emotions for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - self.__prepare_emotion_model(context) + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + subdir="detections/emotions", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] - detections_data = FileOperations.load_json(input_path) - self.__process_and_update_emotions(detections_data, input_data, context) - FileOperations.atomic_write_json(output_path, detections_data) + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_emotion_data(input_data, output_path) + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> EmotionData: + return self.__construct_emotion_data(input_data, cache_path) def __prepare_emotion_model(self, context: ExecutionContext) -> 
None: if self.__model is None: @@ -158,8 +126,16 @@ def __process_and_update_emotions( self.__apply_emotion_results(detections, emotion_results, face_metadata, context) + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + + @staticmethod def __resolve_input_path( - self, input_data: FrameCollection, context: ExecutionContext, + input_data: FrameCollection, context: ExecutionContext, ) -> Path: season_code = f'S{input_data.episode_info.season:02d}' episode_code = input_data.episode_info.episode_code() diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py index 45216e0c0..958025b36 100644 --- a/preprocessor/steps/vision/face_clustering_step.py +++ b/preprocessor/steps/vision/face_clustering_step.py @@ -1,5 +1,9 @@ +# pylint: disable=duplicate-code from pathlib import Path -from typing import List +from typing import ( + Dict, + List, +) from preprocessor.config.step_configs import FaceClusteringConfig from preprocessor.core.artifacts import ( @@ -15,92 +19,80 @@ class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): - def get_output_descriptors(self) -> List[OutputDescriptor]: - """Define output file descriptors for face clustering step.""" - return [ - JsonFileOutput( - subdir="clusters/faces", - pattern="{season}/{episode}.json", - min_size_bytes=10, - ), - ] - - def __init__(self, config: FaceClusteringConfig) -> None: super().__init__(config) self.__model = None - @property - def name(self) -> str: - return 'face_clustering' - - def cleanup(self) -> None: - self.__model = None - @property def supports_batch_processing(self) -> bool: return True def setup_resources(self, context: ExecutionContext) -> None: if self.__model is None: - context.logger.info('Loading Face Clustering model...') + 
self.__load_model(context) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__unload_model(context) + + def cleanup(self) -> None: + self.__model = None def execute_batch( self, input_data: List[FrameCollection], context: ExecutionContext, ) -> List[ClusterData]: return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.__execute_single, + input_data, context, self.config.max_parallel_episodes, self.execute, ) - def teardown_resources(self, context: ExecutionContext) -> None: - if self.__model: - context.logger.info('Face Clustering model unloaded') - self.__model = None - - def __execute_single( + def _process( self, input_data: FrameCollection, context: ExecutionContext, ) -> ClusterData: - """Execute single episode (batch processing variant without lazy loading).""" - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached face clustering'): - return self.__construct_cluster_data(input_data, output_path) - - context.logger.info(f'Clustering faces for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - context.mark_step_completed(self.name, input_data.episode_id) + output_path = self._get_cache_path(input_data, context) return self.__construct_cluster_data(input_data, output_path) - def execute( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> ClusterData: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached face clustering'): - return self.__construct_cluster_data(input_data, output_path) - - context.logger.info(f'Clustering faces for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - context.mark_step_completed(self.name, input_data.episode_id) - return 
self.__construct_cluster_data(input_data, output_path) + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + subdir="clusters/faces", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] - def __resolve_output_path( + def _get_cache_path( self, input_data: FrameCollection, context: ExecutionContext, ) -> Path: return self._resolve_output_path( 0, context, - { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': input_data.episode_info.episode_code(), - }, + self.__create_path_variables(input_data), ) + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> ClusterData: + return self.__construct_cluster_data(input_data, cache_path) + + @staticmethod + def __load_model(context: ExecutionContext) -> None: + context.logger.info('Loading Face Clustering model...') + # Model loading logic implementation + + def __unload_model(self, context: ExecutionContext) -> None: + context.logger.info('Face Clustering model unloaded') + self.__model = None + + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + @staticmethod def __construct_cluster_data( - input_data: FrameCollection, output_path: Path, + input_data: FrameCollection, output_path: Path, ) -> ClusterData: return ClusterData( episode_id=input_data.episode_id, diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index a20ee0609..07d288c15 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -27,17 +27,6 @@ class ImageHashStep(PipelineStep[FrameCollection, ImageHashCollection, ImageHashConfig]): - def get_output_descriptors(self) -> List[OutputDescriptor]: - """Define output file descriptors for image hashing step.""" 
- return [ - JsonFileOutput( - subdir="hashes", - pattern="{season}/{episode}.json", - min_size_bytes=50, - ), - ] - - def __init__(self, config: ImageHashConfig) -> None: super().__init__(config) self.__hasher: Optional[PerceptualHasher] = None @@ -61,13 +50,10 @@ def cleanup(self) -> None: self.__hasher = None self.__cleanup_memory() - def execute( - self, input_data: FrameCollection, context: ExecutionContext, + def _process( + self, input_data: FrameCollection, context: ExecutionContext, ) -> ImageHashCollection: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached'): - return self.__load_cached_result(output_path, input_data) + output_path = self._get_cache_path(input_data, context) frame_metadata, frame_requests = self.__load_frame_metadata(input_data, context) if not frame_requests: @@ -78,12 +64,12 @@ def execute( context.logger.info( f'Computing hashes for {len(frame_requests)} frames in {input_data.episode_id}', ) - context.mark_step_started(self.name, input_data.episode_id) hash_results = self.__compute_hashes(frame_requests, input_data) - self.__save_hash_results(hash_results, output_path, input_data, context, frame_metadata) + self.__save_hash_results( + hash_results, output_path, input_data, context, frame_metadata, + ) - context.mark_step_completed(self.name, input_data.episode_id) self.__cleanup_memory() return ImageHashCollection( @@ -93,21 +79,50 @@ def execute( hash_count=len(hash_results), ) + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + subdir="hashes", + pattern="{season}/{episode}.json", + min_size_bytes=50, + ), + ] + + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, 
context: ExecutionContext, + ) -> ImageHashCollection: + hash_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return ImageHashCollection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + hash_count=len(hash_data.get('hashes', [])), + ) + def __prepare_hasher(self, context: ExecutionContext) -> None: if self.__hasher is None: context.logger.info(f'Loading image hasher on {self.config.device}...') self.__hasher = PerceptualHasher() def __compute_hashes( - self, - frame_requests: List[Dict[str, Any]], - input_data: FrameCollection, + self, + frame_requests: List[Dict[str, Any]], + input_data: FrameCollection, ) -> List[Dict[str, Any]]: hash_results: List[Dict[str, Any]] = [] batch_size: int = self.config.batch_size for i in range(0, len(frame_requests), batch_size): - batch: List[Dict[str, Any]] = frame_requests[i:i + batch_size] + batch: List[Dict[str, Any]] = frame_requests[i : i + batch_size] pil_images = FrameLoader.load_from_requests(input_data.directory, batch) phashes: List[str] = self.__hasher.compute_phash_batch(pil_images) @@ -122,32 +137,21 @@ def __compute_hashes( return hash_results - def __resolve_output_path(self, input_data: FrameCollection, context: ExecutionContext) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': input_data.episode_info.episode_code(), - }, - ) - @staticmethod - def __load_cached_result(output_path: Path, input_data: FrameCollection) -> ImageHashCollection: - hash_data: Dict[str, Any] = FileOperations.load_json(output_path) - return ImageHashCollection( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - hash_count=len(hash_data.get('hashes', [])), - ) + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), 
+ } @staticmethod def __load_frame_metadata( - input_data: FrameCollection, - context: ExecutionContext, + input_data: FrameCollection, + context: ExecutionContext, ) -> Tuple[Dict[str, Any], List[Dict[str, Any]]]: - frame_metadata: Dict[str, Any] = FileOperations.load_json(input_data.metadata_path) + frame_metadata: Dict[str, Any] = FileOperations.load_json( + input_data.metadata_path, + ) frame_requests: List[Dict[str, Any]] = frame_metadata.get('frames', []) if not frame_requests: @@ -157,8 +161,8 @@ def __load_frame_metadata( @staticmethod def __construct_empty_result( - output_path: Path, - input_data: FrameCollection, + output_path: Path, + input_data: FrameCollection, ) -> ImageHashCollection: return ImageHashCollection( episode_id=input_data.episode_id, @@ -169,11 +173,11 @@ def __construct_empty_result( @staticmethod def __save_hash_results( - hash_results: List[Dict[str, Any]], - output_path: Path, - input_data: FrameCollection, - context: ExecutionContext, - frame_metadata: Dict[str, Any], + hash_results: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_metadata: Dict[str, Any], ) -> None: output_data: Dict[str, Any] = { 'episode_id': input_data.episode_id, diff --git a/preprocessor/steps/vision/object_detection_step.py b/preprocessor/steps/vision/object_detection_step.py index f7fc3c757..624fdf124 100644 --- a/preprocessor/steps/vision/object_detection_step.py +++ b/preprocessor/steps/vision/object_detection_step.py @@ -1,5 +1,9 @@ +# pylint: disable=duplicate-code from pathlib import Path -from typing import List +from typing import ( + Dict, + List, +) from preprocessor.config.step_configs import ObjectDetectionConfig from preprocessor.core.artifacts import ( @@ -14,89 +18,81 @@ ) -class ObjectDetectionStep(PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig]): - def get_output_descriptors(self) -> List[OutputDescriptor]: - """Define output file descriptors for object 
detection step.""" - return [ - JsonFileOutput( - subdir="detections/objects", - pattern="{season}/{episode}.json", - min_size_bytes=10, - ), - ] - - +class ObjectDetectionStep( + PipelineStep[FrameCollection, ObjectDetectionData, ObjectDetectionConfig], +): def __init__(self, config: ObjectDetectionConfig) -> None: super().__init__(config) self.__model = None - @property - def name(self) -> str: - return 'object_detection' - - def cleanup(self) -> None: - self.__model = None - @property def supports_batch_processing(self) -> bool: return True def setup_resources(self, context: ExecutionContext) -> None: if self.__model is None: - context.logger.info('Loading Object Detection model...') + self.__load_model(context) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__unload_model(context) + + def cleanup(self) -> None: + self.__model = None def execute_batch( self, input_data: List[FrameCollection], context: ExecutionContext, ) -> List[ObjectDetectionData]: return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.__execute_single, + input_data, context, self.config.max_parallel_episodes, self.execute, ) - def teardown_resources(self, context: ExecutionContext) -> None: - if self.__model: - context.logger.info('Object Detection model unloaded') - self.__model = None - - def __execute_single( + def _process( self, input_data: FrameCollection, context: ExecutionContext, ) -> ObjectDetectionData: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached object detection'): - return self.__construct_object_data(input_data, output_path) - - context.logger.info(f'Detecting objects for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - context.mark_step_completed(self.name, input_data.episode_id) + output_path = self._get_cache_path(input_data, context) + 
# Main processing logic would go here return self.__construct_object_data(input_data, output_path) - def execute( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> ObjectDetectionData: - output_path = self.__resolve_output_path(input_data, context) - - if self._check_cache_validity(output_path, context, input_data.episode_id, 'cached object detection'): - return self.__construct_object_data(input_data, output_path) - - context.logger.info(f'Detecting objects for {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) - - context.mark_step_completed(self.name, input_data.episode_id) - return self.__construct_object_data(input_data, output_path) + def _get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + subdir="detections/objects", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] - def __resolve_output_path( + def _get_cache_path( self, input_data: FrameCollection, context: ExecutionContext, ) -> Path: return self._resolve_output_path( 0, context, - { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': input_data.episode_info.episode_code(), - }, + self.__create_path_variables(input_data), ) + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> ObjectDetectionData: + return self.__construct_object_data(input_data, cache_path) + + @staticmethod + def __load_model(context: ExecutionContext) -> None: + context.logger.info('Loading Object Detection model...') + # Model loading logic implementation + + def __unload_model(self, context: ExecutionContext) -> None: + context.logger.info('Object Detection model unloaded') + self.__model = None + + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + @staticmethod def 
__construct_object_data( input_data: FrameCollection, output_path: Path, From 0ffaa8deaf1265b24aefa5da5dceeca2a637f178 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 12:56:46 +0100 Subject: [PATCH 42/89] Add source_video_path and fix threadpool order Propagate the original source video path through artifacts and steps, and fix threadpool execution ordering. - Add source_video_path field to TranscodedVideo and SceneCollection so downstream steps can access the original file. - SceneDetectorStep now populates source_video_path when producing SceneCollection. - VideoTranscoderStep sets source_video_path on TranscodedVideo outputs. - FrameExporterStep uses source_video_path (with fallback to video_path) and updates log message and frame extraction to reference the original file. - TranscriptionStep updated to accept TranscodedVideo instead of AudioArtifact and adjusted type signatures/usages accordingly. - PipelineStep threadpool executor now maps futures to input artifacts and returns results in the same order as the original input list to preserve ordering. These changes ensure downstream steps operate on the correct source file and that parallel execution preserves input ordering. 
--- preprocessor/core/artifacts.py | 2 ++ preprocessor/core/base_step.py | 11 ++++++----- preprocessor/steps/text/transcription_step.py | 16 ++++++++-------- preprocessor/steps/video/frame_export_step.py | 6 +++--- preprocessor/steps/video/scene_detection_step.py | 1 + preprocessor/steps/video/transcoding_step.py | 1 + 6 files changed, 21 insertions(+), 16 deletions(-) diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index ed19b122e..242f14379 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -36,6 +36,7 @@ class TranscodedVideo(EpisodeArtifact): codec: str path: Path resolution: str + source_video_path: Path @dataclass(frozen=True) @@ -45,6 +46,7 @@ class SceneCollection(EpisodeArtifact): scenes: List[Dict[str, Any]] threshold: float video_path: Path + source_video_path: Path @dataclass(frozen=True) diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 1a7a2f6a5..866cdb7cb 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -156,17 +156,18 @@ def _execute_with_threadpool( ) with ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = { + futures_to_input = { executor.submit(executor_fn, artifact, context): artifact for artifact in input_data } - results = [] - for future in as_completed(futures): + results_dict: Dict[int, OutputT] = {} + for future in as_completed(futures_to_input): + input_artifact = futures_to_input[future] result = future.result() - results.append(result) + results_dict[id(input_artifact)] = result - return results + return [results_dict[id(artifact)] for artifact in input_data] @staticmethod def _execute_sequential( diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 890408d8b..90a3f534f 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -8,7 +8,7 @@ from preprocessor.config.step_configs 
import WhisperTranscriptionConfig from preprocessor.core.artifacts import ( - AudioArtifact, + TranscodedVideo, TranscriptionData, ) from preprocessor.core.base_step import PipelineStep @@ -20,7 +20,7 @@ class TranscriptionStep( - PipelineStep[AudioArtifact, TranscriptionData, WhisperTranscriptionConfig], + PipelineStep[TranscodedVideo, TranscriptionData, WhisperTranscriptionConfig], ): def __init__(self, config: WhisperTranscriptionConfig) -> None: super().__init__(config) @@ -42,14 +42,14 @@ def cleanup(self) -> None: self.__unload_whisper() def execute_batch( - self, input_data: List[AudioArtifact], context: ExecutionContext, + self, input_data: List[TranscodedVideo], context: ExecutionContext, ) -> List[TranscriptionData]: return self._execute_with_threadpool( input_data, context, self.config.max_parallel_episodes, self.execute, ) def _process( - self, input_data: AudioArtifact, context: ExecutionContext, + self, input_data: TranscodedVideo, context: ExecutionContext, ) -> TranscriptionData: output_path = self._get_cache_path(input_data, context) @@ -70,14 +70,14 @@ def _get_output_descriptors(self) -> List[JsonFileOutput]: ] def _get_cache_path( - self, input_data: AudioArtifact, context: ExecutionContext, + self, input_data: TranscodedVideo, context: ExecutionContext, ) -> Path: return self._get_standard_cache_path(input_data, context) def _load_from_cache( self, cache_path: Path, - input_data: AudioArtifact, + input_data: TranscodedVideo, context: ExecutionContext, ) -> TranscriptionData: return TranscriptionData( @@ -109,7 +109,7 @@ def __unload_whisper(self, context: Optional[ExecutionContext] = None) -> None: def __transcribe_and_save( self, - input_data: AudioArtifact, + input_data: TranscodedVideo, output_path: Path, context: ExecutionContext, ) -> Dict[str, Any]: @@ -134,7 +134,7 @@ def __transcribe_and_save( def __construct_result_artifact( self, output_path: Path, - input_data: AudioArtifact, + input_data: TranscodedVideo, result: Dict[str, Any], ) 
-> TranscriptionData: return TranscriptionData( diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 044bd87ed..74aaea9fa 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -67,11 +67,11 @@ def _process( ) context.logger.info( - f'Extracting {len(frame_requests)} keyframes from {input_data.video_path.name}', + f'Extracting {len(frame_requests)} keyframes from {input_data.source_video_path.name}', ) self.__process_frame_extraction( - input_data.video_path, + input_data.source_video_path, frame_requests, episode_dir, input_data, @@ -117,7 +117,7 @@ def _load_from_cache( def __extract_frame_requests( self, input_data: SceneCollection, ) -> List[FrameRequest]: - video_path = input_data.video_path + video_path = getattr(input_data, 'source_video_path', input_data.video_path) if not video_path.exists(): raise FileNotFoundError(f'Video file not found for frame export: {video_path}') data = {'scene_timestamps': {'scenes': input_data.scenes}} diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index 82412058d..41176a8da 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -141,6 +141,7 @@ def __construct_scene_collection( return SceneCollection( path=output_path, video_path=input_data.path, + source_video_path=getattr(input_data, 'source_video_path', input_data.path), episode_id=input_data.episode_id, episode_info=input_data.episode_info, scenes=scenes, diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 2157a80fb..d507eae3d 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -228,6 +228,7 @@ def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> Tr episode_info=input_data.episode_info, 
resolution=f'{self.config.resolution.width}x{self.config.resolution.height}', codec=self.config.codec, + source_video_path=input_data.path, ) @staticmethod From b1e44f6971e185dfde61b2de7991998d4690345f Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 14:21:12 +0100 Subject: [PATCH 43/89] Add artifact registry and timestamp frames Introduce an artifact registry and refactor pipeline execution to use discovered SourceVideo artifacts instead of per-file loops. Add PipelineExecutor.__discover_source_videos, __execute_step_with_registry and __get_input_artifacts to manage step inputs/outputs. Change scene change strategy to operate on scene start/end seconds (timestamps) rather than frame numbers and FPS: remove FPS extraction, compute timestamps and durations, and produce FrameRequest entries with timestamp fields. Update FrameExporterStep to drop decord-based frame reads and extract single frames via ffmpeg (using subprocess and BytesIO), rename frame files to include millisecond timestamps, and adapt request handling and path generation accordingly. 
--- preprocessor/app/pipeline_builder.py | 93 ++++++++++++++----- .../strategies/scene_changes_strategy.py | 51 +++++----- preprocessor/steps/video/frame_export_step.py | 49 ++++++---- 3 files changed, 126 insertions(+), 67 deletions(-) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index fb201e572..ced436c35 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import ( Any, + Dict, List, ) @@ -31,6 +32,69 @@ def cleanup(self) -> None: except Exception as e: self.__context.logger.error(f"Cleanup failed for step {step.name}: {e}") + def __discover_source_videos( + self, source_path: Path, episode_manager: EpisodeManager, + ) -> List[SourceVideo]: + video_files = VideoDiscovery.discover(source_path) + self.__context.logger.info( + f"Discovered {len(video_files)} video files in {source_path}", + ) + + source_videos: List[SourceVideo] = [] + for video_file in video_files: + episode_info = episode_manager.parse_filename(video_file) + if not episode_info: + self.__context.logger.warning(f"Cannot parse: {video_file}") + continue + + episode_id = episode_manager.get_episode_id_for_state(episode_info) + source_videos.append( + SourceVideo( + path=video_file, + episode_id=episode_id, + episode_info=episode_info, + ), + ) + + return source_videos + + def __execute_step_with_registry( + self, + pipeline: "PipelineDefinition", + step_id: str, + artifact_registry: Dict[str, List[Any]], + ) -> None: + step_def = pipeline.get_step(step_id) + self.__context.logger.info(f"Step: {step_id}") + self.__context.logger.info(f"{step_def.description}") + + instance = step_def.step_class(step_def.config) + + if instance.is_global: + self.__run_global_step(instance) + else: + input_artifacts = self.__get_input_artifacts(step_def, artifact_registry) + output_artifacts = self.__run_episode_step(instance, input_artifacts) + artifact_registry[step_id] = 
output_artifacts + + self.__context.logger.info(f"Step '{step_id}' completed") + + @staticmethod + def __get_input_artifacts( + step_def, + artifact_registry: Dict[str, List[Any]], + ) -> List[Any]: + if not step_def.dependency_ids: + return artifact_registry.get('__source__', []) + + input_source_id = step_def.dependency_ids[0] + artifacts = artifact_registry.get(input_source_id, []) + + if not artifacts: + return artifact_registry.get('__source__', []) + + return artifacts + def execute_step( self, pipeline: "PipelineDefinition", @@ -57,31 +121,18 @@ def execute_steps( source_path: Path, episode_manager: EpisodeManager, ) -> None: + artifact_registry: Dict[str, List[Any]] = {} + source_artifacts = self.__discover_source_videos(source_path, episode_manager) + artifact_registry['__source__'] = source_artifacts + for step_id in step_ids: self.__context.logger.info(f"{'=' * 80}") - self.execute_step(pipeline, step_id, source_path, episode_manager) + self.__execute_step_with_registry( + pipeline, step_id, artifact_registry, + ) def run(self, source_path: Path, episode_manager: EpisodeManager) -> None: - video_files = VideoDiscovery.discover(source_path) - self.__context.logger.info( - f"Discovered {len(video_files)} video files in {source_path}", - ) - - current_artifacts: List[Any] = [] - for video_file in video_files: - episode_info = episode_manager.parse_filename(video_file) - if not episode_info: - self.__context.logger.warning(f"Cannot parse: {video_file}") - continue - - episode_id = episode_manager.get_episode_id_for_state(episode_info) - current_artifacts.append( - SourceVideo( - path=video_file, - episode_id=episode_id, - episode_info=episode_info, - ), - ) + current_artifacts = self.__discover_source_videos(source_path, episode_manager) for step in self.__steps: if step.is_global: diff --git a/preprocessor/services/video/strategies/scene_changes_strategy.py b/preprocessor/services/video/strategies/scene_changes_strategy.py index 9f299de0d..1c1ece31e 100644 
--- a/preprocessor/services/video/strategies/scene_changes_strategy.py +++ b/preprocessor/services/video/strategies/scene_changes_strategy.py @@ -24,51 +24,51 @@ def extract_frame_requests( console.print('[yellow]No scene timestamps found[/yellow]') return [] - fps = self.__extract_fps(data) - return self.__process_all_scenes(scenes, fps) + return self.__process_all_scenes(scenes) def __process_all_scenes( - self, scenes: List[Dict[str, Any]], fps: float, + self, scenes: List[Dict[str, Any]], ) -> List[FrameRequest]: frame_requests: List[FrameRequest] = [] for i, scene in enumerate(scenes): - frame_requests.extend(self.__process_single_scene(scene, i, fps)) + frame_requests.extend(self.__process_single_scene(scene, i)) return frame_requests def __process_single_scene( - self, scene: Dict[str, Any], scene_index: int, fps: float, + self, scene: Dict[str, Any], scene_index: int, ) -> List[FrameRequest]: - start_frame = scene.get('start', {}).get('frame', 0) - frame_count = scene.get('frame_count', 1) + start_seconds = scene.get('start', {}).get('seconds', 0.0) + end_seconds = scene.get('end', {}).get('seconds', start_seconds) + duration = end_seconds - start_seconds - if frame_count <= 1: + if duration <= 0.1: return [ - self.__create_request(start_frame, fps, FrameType.SCENE_SINGLE, scene_index), + self.__create_request(start_seconds, FrameType.SCENE_SINGLE, scene_index), ] return self.__generate_multi_frame_requests( - start_frame, frame_count, scene_index, fps, + start_seconds, duration, scene_index, ) def __generate_multi_frame_requests( - self, start_frame: int, frame_count: int, scene_index: int, fps: float, + self, start_seconds: float, duration: float, scene_index: int, ) -> List[FrameRequest]: requests: List[FrameRequest] = [] for frame_idx in range(self.__frames_per_scene): - frame_number = self.__calculate_frame_number( - start_frame, frame_count, frame_idx, + timestamp = self.__calculate_timestamp( + start_seconds, duration, frame_idx, ) frame_type = 
self.__determine_frame_type(frame_idx) requests.append( - self.__create_request(frame_number, fps, frame_type, scene_index), + self.__create_request(timestamp, frame_type, scene_index), ) return requests - def __calculate_frame_number( - self, start_frame: int, frame_count: int, frame_idx: int, - ) -> int: + def __calculate_timestamp( + self, start_seconds: float, duration: float, frame_idx: int, + ) -> float: position = frame_idx / (self.__frames_per_scene - 1) if self.__frames_per_scene > 1 else 0.0 - return int(start_frame + position * (frame_count - 1)) + return start_seconds + position * duration def __determine_frame_type(self, frame_idx: int) -> str: if frame_idx == 0: @@ -82,22 +82,13 @@ def __extract_scenes(data: Dict[str, Any]) -> List[Dict[str, Any]]: scene_timestamps = data.get('scene_timestamps', {}) return scene_timestamps.get('scenes', []) - @staticmethod - def __extract_fps(data: Dict[str, Any]) -> float: - scene_timestamps = data.get('scene_timestamps', {}) - video_info = scene_timestamps.get('video_info', {}) - fps = video_info.get('fps') - if fps is None: - raise ValueError('FPS not found in scene_timestamps video_info') - return fps - @staticmethod def __create_request( - frame: int, fps: float, type_name: str, scene_num: Optional[int] = None, + timestamp: float, type_name: str, scene_num: Optional[int] = None, ) -> FrameRequest: req: FrameRequest = { - 'frame_number': int(frame), - 'timestamp': float(frame / fps), + 'frame_number': 0, + 'timestamp': float(timestamp), 'type': type_name, } if scene_num is not None: diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 74aaea9fa..891d1fc32 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -1,4 +1,5 @@ from datetime import datetime +from io import BytesIO import json from pathlib import Path import shutil @@ -10,7 +11,6 @@ ) from PIL import Image -import decord from 
preprocessor.config.step_configs import FrameExportConfig from preprocessor.config.types import FrameRequest @@ -32,7 +32,6 @@ class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExportConfig]): def __init__(self, config: FrameExportConfig) -> None: super().__init__(config) - decord.bridge.set_bridge('native') self.__strategy = KeyframeStrategyFactory.create( self.config.keyframe_strategy, self.config.frames_per_scene, ) @@ -120,7 +119,9 @@ def __extract_frame_requests( video_path = getattr(input_data, 'source_video_path', input_data.video_path) if not video_path.exists(): raise FileNotFoundError(f'Video file not found for frame export: {video_path}') - data = {'scene_timestamps': {'scenes': input_data.scenes}} + data = { + 'scene_timestamps': {'scenes': input_data.scenes}, + } return self.__strategy.extract_frame_requests(video_path, data) def __process_frame_extraction( @@ -162,41 +163,57 @@ def __extract_frames( ) -> None: video_metadata = self.__fetch_video_metadata(video_file) dar = self.__calculate_display_aspect_ratio(video_metadata) - vr = decord.VideoReader(str(video_file), ctx=decord.cpu(0)) for req in frame_requests: - frame_num = req['frame_number'] + timestamp = req['timestamp'] self.__extract_and_save_frame( - vr, - frame_num, + video_file, + timestamp, episode_dir, episode_info, dar, context.series_name, ) - del vr - def __extract_and_save_frame( self, - vr: decord.VideoReader, - frame_num: int, + video_file: Path, + timestamp: float, episode_dir: Path, episode_info, dar: float, series_name: str, ) -> None: - frame_np = vr[frame_num].asnumpy() - frame_pil = Image.fromarray(frame_np) + frame_pil = self.__extract_frame_at_timestamp(video_file, timestamp) resized = self.__resize_frame(frame_pil, dar) base_filename = f'{series_name}_{episode_info.episode_code()}' - filename = f'{base_filename}_frame_{frame_num:06d}.jpg' + timestamp_ms = int(timestamp * 1000) + filename = f'{base_filename}_frame_{timestamp_ms:08d}.jpg' final_path = 
episode_dir / filename with StepTempFile(final_path) as temp_path: resized.save(temp_path, quality=90) + @staticmethod + def __extract_frame_at_timestamp(video_file: Path, timestamp: float) -> Image.Image: + cmd = [ + 'ffmpeg', + '-ss', str(timestamp), + '-i', str(video_file), + '-frames:v', '1', + '-f', 'image2pipe', + '-vcodec', 'bmp', + '-', + ] + result = subprocess.run( + cmd, + capture_output=True, + check=True, + stdin=subprocess.DEVNULL, + ) + return Image.open(BytesIO(result.stdout)) + def __resize_frame( self, frame: Image.Image, display_aspect_ratio: float, ) -> Image.Image: @@ -241,8 +258,8 @@ def __write_metadata( frame_types_count[frame_type] = frame_types_count.get(frame_type, 0) + 1 frame_with_path = frame.copy() - frame_num = frame['frame_number'] - frame_with_path['frame_path'] = f'{base_filename}_frame_{frame_num:06d}.jpg' + timestamp_ms = int(frame['timestamp'] * 1000) + frame_with_path['frame_path'] = f'{base_filename}_frame_{timestamp_ms:08d}.jpg' frames_with_paths.append(frame_with_path) scene_numbers = { From 22f7653dd685c58fb6548a0ebdaffee17559026a Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 16:19:26 +0100 Subject: [PATCH 44/89] Add state sync CLI and filesystem reconstructor Introduce a way to reconstruct processing state from existing output files and a CLI to trigger it. Added StateReconstructor (preprocessor/core/state_reconstruction.py) which scans pipeline step outputs for global and per-episode steps using an episodes JSON, producing StepCheckpoint entries. Added a `sync-state` CLI command (preprocessor/cli/cli_main.py) that loads episodes from a JSON file and invokes the reconstructor, then calls StateManager.rebuild_state to persist the rebuilt state. Make base step handling robust for global steps by treating None input as episode 'all' and using that id when checking cache and marking steps started/completed (preprocessor/core/base_step.py). 
Implemented StateManager.rebuild_state to replace the current state with reconstructed checkpoints and save it (preprocessor/core/state_manager.py). Fixed frame export behavior to explicitly save JPEGs and correct aspect-ratio handling so frames are resized and letterboxed/pillarboxed (padded) instead of incorrectly cropped (preprocessor/steps/video/frame_export_step.py). --- preprocessor/cli/cli_main.py | 62 +++++++++++- preprocessor/core/base_step.py | 13 ++- preprocessor/core/state_manager.py | 13 +++ preprocessor/core/state_reconstruction.py | 99 +++++++++++++++++++ preprocessor/steps/video/frame_export_step.py | 20 ++-- 5 files changed, 192 insertions(+), 15 deletions(-) create mode 100644 preprocessor/core/state_reconstruction.py diff --git a/preprocessor/cli/cli_main.py b/preprocessor/cli/cli_main.py index 01489dc46..efcd0dfcb 100644 --- a/preprocessor/cli/cli_main.py +++ b/preprocessor/cli/cli_main.py @@ -1,7 +1,11 @@ import asyncio +import json from pathlib import Path import sys -from typing import Tuple +from typing import ( + List, + Tuple, +) import click from click import Command @@ -24,6 +28,8 @@ ) from preprocessor.cli.skip_list_builder import SkipListBuilder from preprocessor.config.series_config import SeriesConfig +from preprocessor.core.state_reconstruction import StateReconstructor +from preprocessor.services.episodes.types import EpisodeInfo from preprocessor.services.io.path_service import PathService from preprocessor.services.search.clients.elasticsearch_queries import ElasticsearchQueries from preprocessor.services.search.clients.embedding_service import EmbeddingService @@ -80,6 +86,60 @@ def __run_all(series: str, force_rerun: bool, skip: Tuple[str, ...]) -> None: setup.logger.finalize() +def __load_episodes_from_json(episodes_file: Path) -> List[EpisodeInfo]: + with open(episodes_file, 'r', encoding='utf-8') as f: + episodes_data = json.load(f) + + episodes_list = [] + for season_data in episodes_data.get('seasons', []): + season_num = 
season_data['season_number'] + for ep_data in season_data.get('episodes', []): + episode_info = EpisodeInfo( + season=season_num, + relative_episode=ep_data['episode_in_season'], + absolute_episode=ep_data['overall_episode_number'], + title=ep_data.get('title', ''), + premiere_date=ep_data.get('premiere_date'), + ) + episodes_list.append(episode_info) + + return episodes_list + + +@cli.command(name="sync-state") +@click.option("--series", required=True, help="Series name (e.g., ranczo)") +def __sync_state(series: str) -> None: + pipeline = build_pipeline(series) + setup = setup_pipeline_context(series, "sync_state", force_rerun=False, with_episode_manager=True) + + try: + episodes_file = setup.context.base_output_dir / f'{series}_episodes.json' + + if not episodes_file.exists(): + setup.logger.error(f'Episodes file not found: {episodes_file}') + setup.logger.error('Run scraping steps first to generate episodes.json') + sys.exit(1) + + setup.logger.info(f'Loading episodes from {episodes_file}') + episodes_list = __load_episodes_from_json(episodes_file) + setup.logger.info(f'Found {len(episodes_list)} episodes') + + completed_steps = StateReconstructor.scan_filesystem( + pipeline=pipeline, + episodes_list=episodes_list, + base_output_dir=setup.context.base_output_dir, + series_name=series, + ) + + setup.context.state_manager.rebuild_state(completed_steps) + setup.logger.info('State synchronization completed!') + except Exception as e: + setup.logger.error(f'Failed to sync state: {e}') + raise + finally: + setup.logger.finalize() + + def __create_step_command(step_id: str, step_description: str) -> Command: @click.command(name=step_id.replace("_", "-"), help=f"{step_description}") @click.option("--series", required=True, help="Series name (e.g., ranczo)") diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 866cdb7cb..07c799743 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -208,25 +208,28 @@ def 
__execute_managed_flow( def __should_restore_from_cache( self, cache_path: Path, input_data: InputT, context: ExecutionContext, ) -> bool: + episode_id = 'all' if input_data is None else input_data.episode_id return self._check_cache_validity( - cache_path, context, input_data.episode_id, 'cached', + cache_path, context, episode_id, 'cached', ) def __restore_result( self, cache_path: Path, input_data: InputT, context: ExecutionContext, ) -> OutputT: - context.logger.info(f'Loading {input_data.episode_id} from cache') + episode_id = 'all' if input_data is None else input_data.episode_id + context.logger.info(f'Loading {episode_id} from cache') return self._load_from_cache(cache_path, input_data, context) def __compute_new_result( self, input_data: InputT, context: ExecutionContext, ) -> OutputT: - context.logger.info(f'Processing {input_data.episode_id}') - context.mark_step_started(self.name, input_data.episode_id) + episode_id = 'all' if input_data is None else input_data.episode_id + context.logger.info(f'Processing {episode_id}') + context.mark_step_started(self.name, episode_id) result = self._process(input_data, context) - context.mark_step_completed(self.name, input_data.episode_id) + context.mark_step_completed(self.name, episode_id) return result def _check_cache_validity( diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index 52373f50c..a6c2b4360 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -172,3 +172,16 @@ def __save_state(self) -> None: self.__state.last_checkpoint = datetime.now().isoformat() with open(self.__state_file, 'w', encoding='utf-8') as f: json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) + + def rebuild_state(self, completed_steps: List[StepCheckpoint]) -> ProcessingState: + now = datetime.now().isoformat() + self.__state = ProcessingState( + series_name=self.__series_name, + started_at=now, + last_checkpoint=now, + 
completed_steps=completed_steps, + in_progress=[], + ) + self.__save_state() + console.print(f'[green]State rebuilt with {len(completed_steps)} completed steps[/green]') + return self.__state diff --git a/preprocessor/core/state_reconstruction.py b/preprocessor/core/state_reconstruction.py new file mode 100644 index 000000000..4a2b7f85e --- /dev/null +++ b/preprocessor/core/state_reconstruction.py @@ -0,0 +1,99 @@ +from datetime import datetime +from pathlib import Path +from typing import ( + Dict, + List, +) + +from preprocessor.app.pipeline import PipelineDefinition +from preprocessor.core.state_manager import StepCheckpoint +from preprocessor.services.episodes.types import EpisodeInfo +from preprocessor.services.ui.console import console + + +class StateReconstructor: + @staticmethod + def scan_filesystem( + pipeline: PipelineDefinition, + episodes_list: List[EpisodeInfo], + base_output_dir: Path, + series_name: str, + ) -> List[StepCheckpoint]: + console.print('[cyan]Reconstructing state from filesystem...[/cyan]') + + now = datetime.now().isoformat() + completed_steps: List[StepCheckpoint] = [] + + total_checked = 0 + total_completed = 0 + + for step_id, step_def in pipeline.get_all_steps().items(): + step_instance = step_def.step_class(step_def.config) + step_name = step_instance.name + + if step_instance.is_global: + if StateReconstructor.__check_global_step_outputs(step_instance, base_output_dir): + checkpoint = StepCheckpoint( + step=step_name, + episode='all', + completed_at=now, + ) + completed_steps.append(checkpoint) + total_completed += 1 + console.print(f'[green]✓ {step_id} ({step_name}) - global[/green]') + else: + console.print(f'[yellow]✗ {step_id} ({step_name}) - global - outputs missing[/yellow]') + total_checked += 1 + else: + for episode_info in episodes_list: + episode_id = f'S{episode_info.season:02d}E{episode_info.relative_episode:02d}' + context_vars = { + 'season': episode_info.season_code(), + 'episode': episode_info.episode_code(), + 
'series_name': series_name, + } + + if StateReconstructor.__check_episode_step_outputs( + step_instance, base_output_dir, context_vars, + ): + checkpoint = StepCheckpoint( + step=step_name, + episode=episode_id, + completed_at=now, + ) + completed_steps.append(checkpoint) + total_completed += 1 + total_checked += 1 + + console.print('\n[green]Filesystem scan complete:[/green]') + console.print(f' Checked: {total_checked} step-episode combinations') + console.print(f' Found completed: {total_completed}') + console.print(f' Missing: {total_checked - total_completed}') + + return completed_steps + + @staticmethod + def __check_global_step_outputs(step_instance, base_output_dir: Path) -> bool: + descriptors = step_instance._get_output_descriptors() + if not descriptors: + return True + + return all( + descriptor.validate(base_output_dir).is_valid + for descriptor in descriptors + ) + + @staticmethod + def __check_episode_step_outputs( + step_instance, + base_output_dir: Path, + context_vars: Dict[str, str], + ) -> bool: + descriptors = step_instance._get_output_descriptors() + if not descriptors: + return True + + return all( + descriptor.validate(base_output_dir, context_vars).is_valid + for descriptor in descriptors + ) diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 891d1fc32..362f819a7 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -193,7 +193,7 @@ def __extract_and_save_frame( final_path = episode_dir / filename with StepTempFile(final_path) as temp_path: - resized.save(temp_path, quality=90) + resized.save(temp_path, format='JPEG', quality=90) @staticmethod def __extract_frame_at_timestamp(video_file: Path, timestamp: float) -> Image.Image: @@ -227,18 +227,20 @@ def __resize_frame( ) if display_aspect_ratio > target_aspect: - new_height = target_height - new_width = int(target_height * display_aspect_ratio) + new_width = target_width 
+ new_height = int(target_width / display_aspect_ratio) resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) - x_crop = (new_width - target_width) // 2 - return resized.crop((x_crop, 0, x_crop + target_width, target_height)) + result = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + y_offset = (target_height - new_height) // 2 + result.paste(resized, (0, y_offset)) + return result - new_width = target_width - new_height = int(target_width / display_aspect_ratio) + new_height = target_height + new_width = int(target_height * display_aspect_ratio) resized = frame.resize((new_width, new_height), Image.Resampling.LANCZOS) result = Image.new('RGB', (target_width, target_height), (0, 0, 0)) - y_offset = (target_height - new_height) // 2 - result.paste(resized, (0, y_offset)) + x_offset = (target_width - new_width) // 2 + result.paste(resized, (x_offset, 0)) return result def __write_metadata( From 2232aef5f7d79d4adee83ffa2e98eb152265006e Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Feb 2026 16:23:30 +0100 Subject: [PATCH 45/89] Make get_output_descriptors public Rename PipelineStep._get_output_descriptors to get_output_descriptors and update all overrides and call sites. Updated references in preprocessor/core/base_step.py and preprocessor/core/state_reconstruction.py, and renamed method implementations across packaging, search, text, video, and vision steps so each step exposes the descriptor method as part of the public API. This change exposes output descriptors for external use (e.g., state reconstruction) and normalizes the API name across the codebase. 
--- preprocessor/core/base_step.py | 6 +++--- preprocessor/core/state_reconstruction.py | 4 ++-- preprocessor/steps/packaging/archives_step.py | 2 +- preprocessor/steps/search/document_generation_step.py | 2 +- preprocessor/steps/text/analysis_step.py | 2 +- preprocessor/steps/text/embeddings_step.py | 2 +- preprocessor/steps/text/transcription_step.py | 2 +- preprocessor/steps/video/frame_export_step.py | 2 +- preprocessor/steps/video/scene_detection_step.py | 2 +- preprocessor/steps/video/transcoding_step.py | 2 +- preprocessor/steps/vision/embeddings_step.py | 2 +- preprocessor/steps/vision/emotion_detection_step.py | 2 +- preprocessor/steps/vision/face_clustering_step.py | 2 +- preprocessor/steps/vision/image_hashing_step.py | 2 +- preprocessor/steps/vision/object_detection_step.py | 2 +- 15 files changed, 18 insertions(+), 18 deletions(-) diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 07c799743..7662c9877 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -99,7 +99,7 @@ def _process(self, input_data: InputT, context: ExecutionContext) -> OutputT: f'{self.__class__.__name__} must implement _process()', ) - def _get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [] def _get_cache_path(self, input_data: InputT, context: ExecutionContext) -> Path: @@ -120,7 +120,7 @@ def _resolve_output_path( context: ExecutionContext, context_vars: Optional[Dict[str, str]] = None, ) -> Path: - descriptors = self._get_output_descriptors() + descriptors = self.get_output_descriptors() if not descriptors or descriptor_index >= len(descriptors): raise ValueError( f'Step {self.name} has no output descriptor at index {descriptor_index}', @@ -251,7 +251,7 @@ def __validate_all_descriptors( context_vars: Optional[Dict[str, str]], episode_id: str, ) -> bool: - descriptors = self._get_output_descriptors() + descriptors = 
self.get_output_descriptors() if not descriptors: return True diff --git a/preprocessor/core/state_reconstruction.py b/preprocessor/core/state_reconstruction.py index 4a2b7f85e..2f25e8caa 100644 --- a/preprocessor/core/state_reconstruction.py +++ b/preprocessor/core/state_reconstruction.py @@ -74,7 +74,7 @@ def scan_filesystem( @staticmethod def __check_global_step_outputs(step_instance, base_output_dir: Path) -> bool: - descriptors = step_instance._get_output_descriptors() + descriptors = step_instance.get_output_descriptors() if not descriptors: return True @@ -89,7 +89,7 @@ def __check_episode_step_outputs( base_output_dir: Path, context_vars: Dict[str, str], ) -> bool: - descriptors = step_instance._get_output_descriptors() + descriptors = step_instance.get_output_descriptors() if not descriptors: return True diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index 3e0d84529..ed475798b 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -32,7 +32,7 @@ def _process( # Archive generation logic would go here return self.__construct_archive_artifact(input_data, output_path) - def _get_output_descriptors(self) -> List[FileOutput]: + def get_output_descriptors(self) -> List[FileOutput]: return [ FileOutput( pattern="{season}/{episode}.zip", diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 6572e9ca9..862cfb4e1 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -50,7 +50,7 @@ def _process( episode_id, episode_info, output_path, total_docs, ) - def _get_output_descriptors(self) -> List[FileOutput]: + def get_output_descriptors(self) -> List[FileOutput]: return [ FileOutput( pattern="{season}/{episode}.ndjson", diff --git a/preprocessor/steps/text/analysis_step.py 
b/preprocessor/steps/text/analysis_step.py index 6f840f111..7dfc9c37c 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -46,7 +46,7 @@ def _process( return self.__construct_analysis_results(input_data, output_path, result_data) - def _get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( pattern="{season}/{episode}.json", diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index 56cc521cc..47b15b0d5 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -75,7 +75,7 @@ def _process( input_data, output_path, len(results), ) - def _get_output_descriptors(self) -> List[FileOutput]: + def get_output_descriptors(self) -> List[FileOutput]: return [ FileOutput( pattern="{season}/{episode}.json", diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 90a3f534f..c90d62f09 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -60,7 +60,7 @@ def _process( return self.__construct_result_artifact(output_path, input_data, result) - def _get_output_descriptors(self) -> List[JsonFileOutput]: + def get_output_descriptors(self) -> List[JsonFileOutput]: return [ JsonFileOutput( pattern="{season}/{episode}.json", diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 362f819a7..d4dbfb00a 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -86,7 +86,7 @@ def _process( metadata_path=metadata_file, ) - def _get_output_descriptors(self) -> List[DirectoryOutput]: + def get_output_descriptors(self) -> List[DirectoryOutput]: return [create_frames_output()] def _get_cache_path( diff --git 
a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index 41176a8da..f80494b80 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -72,7 +72,7 @@ def _process( return self.__construct_scene_collection(output_path, input_data, scenes) - def _get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( pattern="{season}/{episode}.json", diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index d507eae3d..3983d0fae 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -55,7 +55,7 @@ def _process(self, input_data: SourceVideo, context: ExecutionContext) -> Transc return self.__construct_result_artifact(output_path, input_data) - def _get_output_descriptors(self) -> List[FileOutput]: + def get_output_descriptors(self) -> List[FileOutput]: return [ FileOutput( pattern="{season}/{series_name}_{episode}.mp4", diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index e796a6cec..932c046dc 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -77,7 +77,7 @@ def _process( input_data, output_path, len(results), self.config.model_name, ) - def _get_output_descriptors(self) -> List[FileOutput]: + def get_output_descriptors(self) -> List[FileOutput]: return [ FileOutput( pattern="{season}/{episode}.json", diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index ce292e0cb..26d31d96d 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -76,7 +76,7 @@ def _process( return self.__construct_emotion_data(input_data, output_path) - def 
_get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( subdir="detections/emotions", diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py index 958025b36..71160d70d 100644 --- a/preprocessor/steps/vision/face_clustering_step.py +++ b/preprocessor/steps/vision/face_clustering_step.py @@ -51,7 +51,7 @@ def _process( output_path = self._get_cache_path(input_data, context) return self.__construct_cluster_data(input_data, output_path) - def _get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( subdir="clusters/faces", diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index 07d288c15..eba641aef 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -79,7 +79,7 @@ def _process( hash_count=len(hash_results), ) - def _get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( subdir="hashes", diff --git a/preprocessor/steps/vision/object_detection_step.py b/preprocessor/steps/vision/object_detection_step.py index 624fdf124..4549a55a0 100644 --- a/preprocessor/steps/vision/object_detection_step.py +++ b/preprocessor/steps/vision/object_detection_step.py @@ -54,7 +54,7 @@ def _process( # Main processing logic would go here return self.__construct_object_data(input_data, output_path) - def _get_output_descriptors(self) -> List[OutputDescriptor]: + def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( subdir="detections/objects", From 61fe0814230df5c5e9bf39788867ca1582d5abf1 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 17 Feb 2026 07:47:18 +0100 Subject: 
[PATCH 46/89] Make output subdir optional and snap to keyframes Make OutputDescriptor.subdir optional and adjust path resolution so steps can resolve outputs without hardcoded subdirs. Remove many hardcoded step name properties and normalize step identifiers (update series skip_steps examples). Add FrameExportConfig.scene_change_offset_seconds and lower default export resolution to 720p. Wire the offset through KeyframeStrategyFactory -> SceneChangesStrategy. Enhance FrameExporterStep: use video_path consistently, find I-frames via ffprobe, snap requested timestamps to nearest keyframe (with debug logging), handle KeyboardInterrupts, and improve error messages. Add KeyboardInterrupt handling in the threadpool loop to cancel pending futures. Misc: change transcoder default framerate to 25.0 and refactor CharacterDetectorStep (reorder methods, move output descriptors and cache helpers, cleanup and batch execution handling). --- preprocessor/app/pipeline_factory.py | 32 ----- preprocessor/config/step_configs.py | 3 +- preprocessor/core/base_step.py | 19 ++- preprocessor/core/output_descriptors.py | 11 +- preprocessor/series_configs/ranczo.json | 8 +- preprocessor/series_configs/template.json | 4 +- .../strategies/scene_changes_strategy.py | 5 +- .../video/strategies/strategy_factory.py | 9 +- .../steps/scraping/character_scraper_step.py | 4 - .../steps/scraping/episode_scraper_step.py | 4 - .../scraping/reference_processor_step.py | 4 - .../steps/search/document_generation_step.py | 4 - preprocessor/steps/search/indexing_step.py | 4 - .../steps/validation/validator_step.py | 4 - preprocessor/steps/video/frame_export_step.py | 83 +++++++++++-- .../steps/video/scene_detection_step.py | 4 - preprocessor/steps/video/transcoding_step.py | 6 +- .../steps/vision/character_detection_step.py | 110 +++++++++--------- .../steps/vision/image_hashing_step.py | 4 - 19 files changed, 167 insertions(+), 155 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py 
b/preprocessor/app/pipeline_factory.py index 2e1b1b409..df6db9920 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -84,7 +84,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # SCRAPING PHASE # ========================================================= episodes_metadata = StepBuilder( - id="scrape_episodes", phase=SCRAPING, step_class=EpisodeScraperStep, description="Scrapes episode metadata from wiki", @@ -106,7 +105,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) characters_metadata = StepBuilder( - id="scrape_characters", phase=SCRAPING, step_class=CharacterScraperStep, description="Scrapes character data from wiki", @@ -127,7 +125,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) character_references = StepBuilder( - id="process_references", phase=SCRAPING, step_class=CharacterReferenceStep, description="Downloads and processes character reference images", @@ -162,14 +159,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) transcoded_videos = StepBuilder( - id="transcode", phase=PROCESSING, step_class=VideoTranscoderStep, description=f"Conversion to h264_nvenc {series_config.processing.transcode.resolution} with adaptive bitrate", produces=[ FileOutput( pattern="{season}/{episode}.mp4", - subdir="transcoded_videos", min_size_bytes=1024 * 1024, ), ], @@ -184,14 +179,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) scene_data = StepBuilder( - id="detect_scenes", phase=PROCESSING, step_class=SceneDetectorStep, description="Detects scene changes using TransNetV2", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="scene_detections", min_size_bytes=10, ), ], @@ -205,7 +198,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # Frame export output descriptor matches 
FrameExporterStep.get_output_descriptors() # Defined here for pipeline validation before step instantiation exported_frames = StepBuilder( - id="export_frames", phase=PROCESSING, step_class=FrameExporterStep, description="Exports frames (PNG) at scene boundaries", @@ -220,14 +212,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # PROCESSING PHASE: TEXT & AUDIO # ========================================================= transcription_data = StepBuilder( - id="transcribe", phase=PROCESSING, step_class=TranscriptionStep, description=f"Audio transcription using {series_config.processing.transcription.mode}", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="transcriptions", min_size_bytes=50, ), ], @@ -242,14 +232,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) separated_audio = StepBuilder( - id="separate_sounds", phase=PROCESSING, step_class=SoundSeparationStep, description="Separates dialogue from sound effects", produces=[ DirectoryOutput( pattern="{season}/{episode}", - subdir="separated_audio", expected_file_pattern="*.wav", min_files=1, min_size_per_file_bytes=1024, @@ -260,14 +248,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) text_stats = StepBuilder( - id="analyze_text", phase=PROCESSING, step_class=TextAnalysisStep, description="Analyzes text statistics (word frequency, sentiment)", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="text_analysis", min_size_bytes=50, ), ], @@ -282,7 +268,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[ FileOutput( pattern="{season}/{episode}.npy", - subdir="embeddings/text", min_size_bytes=1024, ), ], @@ -300,14 +285,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # PROCESSING PHASE: VISION # ========================================================= image_hashes = StepBuilder( - 
id="image_hashing", phase=PROCESSING, step_class=ImageHashStep, description="Perceptual frame hashing (phash, dhash, wavelet)", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="hashes", min_size_bytes=50, ), ], @@ -322,7 +305,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[ FileOutput( pattern="{season}/{episode}.npy", - subdir="embeddings/vision", min_size_bytes=1024, ), ], @@ -335,14 +317,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) character_detections = StepBuilder( - id="detect_characters", phase=PROCESSING, step_class=CharacterDetectorStep, description="Recognizes characters in frames using InsightFace", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="detections/characters", min_size_bytes=10, ), ], @@ -351,14 +331,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) emotion_data = StepBuilder( - id="detect_emotions", phase=PROCESSING, step_class=EmotionDetectionStep, description="Detects emotions on faces using EmoNet", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="detections/emotions", min_size_bytes=10, ), ], @@ -367,14 +345,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) face_clusters = StepBuilder( - id="cluster_faces", phase=PROCESSING, step_class=FaceClusteringStep, description="Face clustering using HDBSCAN", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="clusters/faces", min_size_bytes=10, ), ], @@ -383,14 +359,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) object_detections = StepBuilder( - id="detect_objects", phase=PROCESSING, step_class=ObjectDetectionStep, description="General object detection using D-FINE", produces=[ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="detections/objects", min_size_bytes=10, ), ], @@ -402,14 +376,12 @@ 
def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # INDEXING PHASE # ========================================================= elastic_documents = StepBuilder( - id="generate_elastic_docs", phase=INDEXING, step_class=DocumentGeneratorStep, description="Combines all data into Elasticsearch documents", produces=[ FileOutput( pattern="{season}/{episode}.ndjson", - subdir="elastic_documents", min_size_bytes=100, ), ], @@ -425,14 +397,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) episode_archives = StepBuilder( - id="generate_archives", phase=INDEXING, step_class=ArchiveGenerationStep, description="Creates ZIP archives per episode (all artifacts)", produces=[ FileOutput( pattern="{season}/{episode}.zip", - subdir="archives", min_size_bytes=1024 * 100, ), ], @@ -441,7 +411,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ) indexed_data = StepBuilder( - id="index_to_elasticsearch", phase=INDEXING, step_class=ElasticsearchIndexerStep, description="Indexes documents into Elasticsearch", @@ -465,7 +434,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[ DirectoryOutput( pattern="{season}", - subdir="validation_reports", expected_file_pattern="*.json", min_files=1, min_size_per_file_bytes=50, diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 5cdbfd716..4427c237c 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -72,7 +72,8 @@ class FrameExportConfig(BaseModel): frames_per_scene: int = Field(default=1, ge=1) keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES max_parallel_episodes: int = Field(default=4, ge=1, le=8) - resolution: Resolution = Field(default=Resolution.R1080P) + resolution: Resolution = Field(default=Resolution.R720P) + scene_change_offset_seconds: float = Field(default=0.5, ge=0) class 
TranscriptionConfig(BaseModel): diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 7662c9877..13fb80326 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -127,6 +127,11 @@ def _resolve_output_path( ) descriptor = descriptors[descriptor_index] + + if not descriptor.subdir: + formatted_pattern = descriptor.format_pattern(context_vars) + return context.base_output_dir / self.name / formatted_pattern + return descriptor.resolve_path(context.base_output_dir, context_vars) def _get_standard_cache_path( @@ -162,10 +167,16 @@ def _execute_with_threadpool( } results_dict: Dict[int, OutputT] = {} - for future in as_completed(futures_to_input): - input_artifact = futures_to_input[future] - result = future.result() - results_dict[id(input_artifact)] = result + try: + for future in as_completed(futures_to_input): + input_artifact = futures_to_input[future] + result = future.result() + results_dict[id(input_artifact)] = result + except KeyboardInterrupt: + context.logger.warning("Batch processing interrupted - cancelling remaining tasks") + for future in futures_to_input: + future.cancel() + raise return [results_dict[id(artifact)] for artifact in input_data] diff --git a/preprocessor/core/output_descriptors.py b/preprocessor/core/output_descriptors.py index 03ea248f3..f41bb40c1 100644 --- a/preprocessor/core/output_descriptors.py +++ b/preprocessor/core/output_descriptors.py @@ -21,7 +21,7 @@ class ValidationResult: class OutputDescriptor(ABC): - def __init__(self, pattern: str, subdir: str) -> None: + def __init__(self, pattern: str, subdir: str = "") -> None: self._pattern = pattern self._subdir = subdir @@ -51,7 +51,7 @@ class FileOutput(OutputDescriptor): def __init__( self, pattern: str, - subdir: str, + subdir: str = "", min_size_bytes: int = 1, expected_count: int = 1, ) -> None: @@ -100,7 +100,7 @@ class DirectoryOutput(OutputDescriptor): def __init__( self, pattern: str, - subdir: str, + subdir: str = 
"", expected_file_pattern: Optional[str] = None, min_files: int = 1, min_size_per_file_bytes: int = 1, @@ -172,7 +172,7 @@ class JsonFileOutput(FileOutput): def __init__( self, pattern: str, - subdir: str, + subdir: str = "", min_size_bytes: int = 2, schema_validator: Optional[Callable[[Dict], bool]] = None, ) -> None: @@ -231,9 +231,6 @@ def validate(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None class GlobalOutput(OutputDescriptor): - def __init__(self, pattern: str, subdir: str = '') -> None: - super().__init__(pattern, subdir) - def resolve_path(self, base_dir: Path, context_vars: Optional[Dict[str, str]] = None) -> Path: formatted_pattern = self.format_pattern(context_vars) if self._subdir: diff --git a/preprocessor/series_configs/ranczo.json b/preprocessor/series_configs/ranczo.json index 698718f53..9c1be0ef1 100644 --- a/preprocessor/series_configs/ranczo.json +++ b/preprocessor/series_configs/ranczo.json @@ -32,9 +32,9 @@ }, "series_name": "ranczo", "skip_steps": [ - "scrape_episodes", - "scrape_characters", - "process_references", - "transcribe" + "episode_scraper", + "character_scraper", + "character_reference", + "transcription" ] } diff --git a/preprocessor/series_configs/template.json b/preprocessor/series_configs/template.json index 69e9deadc..1f47d56cf 100644 --- a/preprocessor/series_configs/template.json +++ b/preprocessor/series_configs/template.json @@ -11,8 +11,8 @@ "scraping.character_references.search_engine_example": "google (je\u015bli masz SerpAPI)", "scraping.episodes.parser_mode_example": "premium (je\u015bli masz Gemini API)", "skip_steps_example": [ - "scrape_episodes", - "transcribe" + "episode_scraper", + "transcription" ] }, "_required_fields": { diff --git a/preprocessor/services/video/strategies/scene_changes_strategy.py b/preprocessor/services/video/strategies/scene_changes_strategy.py index 1c1ece31e..9ed873a31 100644 --- a/preprocessor/services/video/strategies/scene_changes_strategy.py +++ 
b/preprocessor/services/video/strategies/scene_changes_strategy.py @@ -13,8 +13,9 @@ class SceneChangesStrategy(BaseKeyframeStrategy): - def __init__(self, frames_per_scene: int) -> None: + def __init__(self, frames_per_scene: int, scene_change_offset_seconds: float = 0.5) -> None: self.__frames_per_scene = frames_per_scene + self.__offset = scene_change_offset_seconds def extract_frame_requests( self, video_path: Path, data: Dict[str, Any], @@ -37,7 +38,7 @@ def __process_all_scenes( def __process_single_scene( self, scene: Dict[str, Any], scene_index: int, ) -> List[FrameRequest]: - start_seconds = scene.get('start', {}).get('seconds', 0.0) + start_seconds = scene.get('start', {}).get('seconds', 0.0) + self.__offset end_seconds = scene.get('end', {}).get('seconds', start_seconds) duration = end_seconds - start_seconds diff --git a/preprocessor/services/video/strategies/strategy_factory.py b/preprocessor/services/video/strategies/strategy_factory.py index d902c03e0..87ac24b68 100644 --- a/preprocessor/services/video/strategies/strategy_factory.py +++ b/preprocessor/services/video/strategies/strategy_factory.py @@ -6,9 +6,14 @@ class KeyframeStrategyFactory: @staticmethod def create( - strategy_type: KeyframeStrategy, frames_per_scene: int = 1, + strategy_type: KeyframeStrategy, + frames_per_scene: int = 1, + scene_change_offset_seconds: float = 0.5, ) -> BaseKeyframeStrategy: if strategy_type == KeyframeStrategy.SCENE_CHANGES: - return SceneChangesStrategy(frames_per_scene=frames_per_scene) + return SceneChangesStrategy( + frames_per_scene=frames_per_scene, + scene_change_offset_seconds=scene_change_offset_seconds, + ) raise ValueError(f"Unknown strategy type: {strategy_type}") diff --git a/preprocessor/steps/scraping/character_scraper_step.py b/preprocessor/steps/scraping/character_scraper_step.py index 2f90caeef..274e1ba7e 100644 --- a/preprocessor/steps/scraping/character_scraper_step.py +++ b/preprocessor/steps/scraping/character_scraper_step.py @@ -6,10 +6,6 
@@ class CharacterScraperStep(BaseScraperStep[CharacterScraperConfig]): - @property - def name(self) -> str: - return "scrape_characters" - def _get_scraper_class(self) -> Type[CharacterScraper]: return CharacterScraper diff --git a/preprocessor/steps/scraping/episode_scraper_step.py b/preprocessor/steps/scraping/episode_scraper_step.py index d6766d9f3..65c491d90 100644 --- a/preprocessor/steps/scraping/episode_scraper_step.py +++ b/preprocessor/steps/scraping/episode_scraper_step.py @@ -6,10 +6,6 @@ class EpisodeScraperStep(BaseScraperStep[EpisodeScraperConfig]): - @property - def name(self) -> str: - return "scrape_episodes" - def _get_scraper_class(self) -> Type[EpisodeScraper]: return EpisodeScraper diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index d1dc158f1..018ed62c2 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -12,10 +12,6 @@ class CharacterReferenceStep( PipelineStep[SourceVideo, SourceVideo, CharacterReferenceConfig], ): - @property - def name(self) -> str: - return "process_character_references" - @property def is_global(self) -> bool: return True diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 862cfb4e1..f5adf6395 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -20,10 +20,6 @@ class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): - @property - def name(self) -> str: - return 'document_generation' - @property def supports_batch_processing(self) -> bool: return True diff --git a/preprocessor/steps/search/indexing_step.py b/preprocessor/steps/search/indexing_step.py index 9e5b81911..02cc7a60d 100644 --- a/preprocessor/steps/search/indexing_step.py +++ 
b/preprocessor/steps/search/indexing_step.py @@ -25,10 +25,6 @@ def __init__(self, config: ElasticsearchConfig) -> None: super().__init__(config) self.__es: Optional[ElasticsearchWrapper] = None - @property - def name(self) -> str: - return 'elasticsearch_indexing' - @property def is_global(self) -> bool: return True diff --git a/preprocessor/steps/validation/validator_step.py b/preprocessor/steps/validation/validator_step.py index 0467d25d3..2163407cc 100644 --- a/preprocessor/steps/validation/validator_step.py +++ b/preprocessor/steps/validation/validator_step.py @@ -11,10 +11,6 @@ class ValidationStep(PipelineStep[ElasticDocuments, ValidationResult, ValidationConfig]): - @property - def name(self) -> str: - return "validate" - @property def supports_batch_processing(self) -> bool: return True diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index d4dbfb00a..bbd9daabf 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -1,3 +1,4 @@ +import bisect from datetime import datetime from io import BytesIO import json @@ -33,13 +34,11 @@ class FrameExporterStep(PipelineStep[SceneCollection, FrameCollection, FrameExpo def __init__(self, config: FrameExportConfig) -> None: super().__init__(config) self.__strategy = KeyframeStrategyFactory.create( - self.config.keyframe_strategy, self.config.frames_per_scene, + self.config.keyframe_strategy, + self.config.frames_per_scene, + self.config.scene_change_offset_seconds, ) - @property - def name(self) -> str: - return 'frame_export' - @property def supports_batch_processing(self) -> bool: return True @@ -66,11 +65,11 @@ def _process( ) context.logger.info( - f'Extracting {len(frame_requests)} keyframes from {input_data.source_video_path.name}', + f'Extracting {len(frame_requests)} keyframes from {input_data.video_path.name}', ) self.__process_frame_extraction( - input_data.source_video_path, + 
input_data.video_path, frame_requests, episode_dir, input_data, @@ -116,7 +115,7 @@ def _load_from_cache( def __extract_frame_requests( self, input_data: SceneCollection, ) -> List[FrameRequest]: - video_path = getattr(input_data, 'source_video_path', input_data.video_path) + video_path = input_data.video_path if not video_path.exists(): raise FileNotFoundError(f'Video file not found for frame export: {video_path}') data = { @@ -148,8 +147,9 @@ def __process_frame_extraction( context, metadata_file, ) - except Exception as e: - context.logger.error(f'Failed to extract frames from {video_path}: {e}') + except (Exception, KeyboardInterrupt) as e: + error_type = "interrupted" if isinstance(e, KeyboardInterrupt) else "failed" + context.logger.error(f'Frame extraction {error_type} for {video_path}: {e}') shutil.rmtree(episode_dir, ignore_errors=True) raise @@ -164,11 +164,27 @@ def __extract_frames( video_metadata = self.__fetch_video_metadata(video_file) dar = self.__calculate_display_aspect_ratio(video_metadata) + context.logger.info(f'Finding I-frames (keyframes) in {video_file.name}') + keyframes = self.__get_all_keyframes(video_file) + context.logger.info(f'Found {len(keyframes)} I-frames') + for req in frame_requests: - timestamp = req['timestamp'] + target_timestamp = req['timestamp'] + snapped_timestamp = self.__snap_to_keyframe(keyframes, target_timestamp) + + if abs(snapped_timestamp - target_timestamp) > 0.1: + context.logger.debug( + f'Snapped {target_timestamp:.3f}s -> {snapped_timestamp:.3f}s ' + f'(I-frame, delta: {snapped_timestamp - target_timestamp:.3f}s)', + ) + + req['timestamp'] = snapped_timestamp + req['original_timestamp'] = target_timestamp + req['snapped_to_keyframe'] = True + self.__extract_and_save_frame( video_file, - timestamp, + snapped_timestamp, episode_dir, episode_info, dar, @@ -363,3 +379,46 @@ def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: sar = 1.0 return width / height * sar + + @staticmethod + def 
__get_all_keyframes(video_file: Path) -> List[float]: + cmd = [ + 'ffprobe', + '-skip_frame', 'nokey', + '-select_streams', 'v:0', + '-show_entries', 'frame=pkt_pts_time', + '-of', 'json', + str(video_file), + ] + result = subprocess.run( + cmd, + capture_output=True, + text=True, + check=True, + stdin=subprocess.DEVNULL, + ) + data: Dict[str, Any] = json.loads(result.stdout) + frames: List[Dict[str, Any]] = data.get('frames', []) + + keyframes = [ + float(frame['pkt_pts_time']) + for frame in frames + if 'pkt_pts_time' in frame + ] + + return sorted(keyframes) + + @staticmethod + def __snap_to_keyframe( + keyframes: List[float], + target_timestamp: float, + ) -> float: + if not keyframes: + return target_timestamp + + idx = bisect.bisect_left(keyframes, target_timestamp) + + if idx < len(keyframes): + return keyframes[idx] + + return keyframes[-1] diff --git a/preprocessor/steps/video/scene_detection_step.py b/preprocessor/steps/video/scene_detection_step.py index f80494b80..bd83a5cbe 100644 --- a/preprocessor/steps/video/scene_detection_step.py +++ b/preprocessor/steps/video/scene_detection_step.py @@ -26,10 +26,6 @@ def __init__(self, config: SceneDetectionConfig) -> None: self.__transnet = TransNetWrapper() self.__model_loaded = False - @property - def name(self) -> str: - return 'scene_detection' - @property def supports_batch_processing(self) -> bool: return True diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 3983d0fae..1a7baa8a7 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -29,10 +29,6 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeCo } __command_logged: bool = False - @property - def name(self) -> str: - return 'video_transcode' - @property def supports_batch_processing(self) -> bool: return True @@ -302,4 +298,4 @@ def __log_int_diagnostics(ctx: ExecutionContext, has_int: bool, stats: Dict[str, 
@staticmethod def __resolve_target_framerate() -> float: - return 24.0 + return 25.0 diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index 9415ede06..834e32e7d 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -23,31 +23,11 @@ class CharacterDetectorStep(PipelineStep[FrameCollection, DetectionResults, CharacterDetectionConfig]): - @staticmethod - def get_output_descriptors() -> List[OutputDescriptor]: - """Define output file descriptors for character detection step.""" - return [ - JsonFileOutput( - subdir="detections/characters", - pattern="{season}/{episode}.json", - min_size_bytes=10, - ), - ] - - def __init__(self, config: CharacterDetectionConfig) -> None: super().__init__(config) self.__face_app = None self.__character_vectors: Dict[str, np.ndarray] = {} - @property - def name(self) -> str: - return 'character_detection' - - def cleanup(self) -> None: - self.__face_app = None - self.__character_vectors = {} - @property def supports_batch_processing(self) -> bool: return True @@ -58,41 +38,21 @@ def setup_resources(self, context: ExecutionContext) -> None: self.__face_app = FaceDetector.init() self.__load_character_references(context) - def execute_batch( - self, input_data: List[FrameCollection], context: ExecutionContext, - ) -> List[DetectionResults]: - return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.execute, - ) - def teardown_resources(self, context: ExecutionContext) -> None: if self.__face_app: context.logger.info('Face Detection model unloaded') self.__face_app = None self.__character_vectors = {} - def _get_cache_path( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': 
input_data.episode_info.episode_code(), - }, - ) + def cleanup(self) -> None: + self.__face_app = None + self.__character_vectors = {} - def _load_from_cache( - self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, - ) -> DetectionResults: - detection_data: Dict[str, Any] = FileOperations.load_json(cache_path) - return DetectionResults( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=cache_path, - detection_type='character', - detection_count=len(detection_data.get('detections', [])), + def execute_batch( + self, input_data: List[FrameCollection], context: ExecutionContext, + ) -> List[DetectionResults]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, ) def _process( @@ -106,7 +66,9 @@ def _process( return self.__construct_empty_result(output_path, input_data, context) results = self.__process_character_detection(frame_files) - self.__save_detection_results(results, output_path, input_data, context, frame_files) + self.__save_detection_results( + results, output_path, input_data, context, frame_files, + ) return DetectionResults( episode_id=input_data.episode_id, @@ -116,6 +78,37 @@ def _process( detection_count=len(results), ) + def get_output_descriptors(self) -> List[OutputDescriptor]: + """Define output file descriptors for character detection step.""" + return [ + JsonFileOutput( + subdir="detections/characters", + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ] + + def _get_cache_path( + self, input_data: FrameCollection, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_variables(input_data), + ) + + def _load_from_cache( + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + ) -> DetectionResults: + detection_data: Dict[str, Any] = FileOperations.load_json(cache_path) + return DetectionResults( + 
episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + detection_type='character', + detection_count=len(detection_data.get('detections', [])), + ) + def __prepare_detection_environment(self, context: ExecutionContext) -> None: if self.__face_app is None: context.logger.info('Initializing face detection model...') @@ -123,9 +116,13 @@ def __prepare_detection_environment(self, context: ExecutionContext) -> None: self.__load_character_references(context) def __load_character_references(self, context: ExecutionContext) -> None: - characters_dir: Path = Path('preprocessor/output_data') / context.series_name / 'characters' + characters_dir: Path = ( + Path('preprocessor/output_data') / context.series_name / 'characters' + ) if not characters_dir.exists(): - characters_dir = Path('preprocessor/input_data') / context.series_name / 'characters' + characters_dir = ( + Path('preprocessor/input_data') / context.series_name / 'characters' + ) if characters_dir.exists(): context.logger.info(f'Loading character references from {characters_dir}') @@ -135,7 +132,9 @@ def __load_character_references(self, context: ExecutionContext) -> None: else: context.logger.warning(f'Characters directory not found: {characters_dir}') - def __process_character_detection(self, frame_files: List[Path]) -> List[Dict[str, Any]]: + def __process_character_detection( + self, frame_files: List[Path], + ) -> List[Dict[str, Any]]: results: List[Dict[str, Any]] = [] for frame_path in frame_files: detections: List[Dict[str, Any]] = FaceDetector.detect_characters_in_frame( @@ -169,6 +168,13 @@ def __save_detection_results( } FileOperations.atomic_write_json(output_path, output_data) + @staticmethod + def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: + return { + 'season': f'S{input_data.episode_info.season:02d}', + 'episode': input_data.episode_info.episode_code(), + } + @staticmethod def __extract_frame_files(input_data: FrameCollection) -> 
List[Path]: return sorted([ diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index eba641aef..aa1bfffec 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -31,10 +31,6 @@ def __init__(self, config: ImageHashConfig) -> None: super().__init__(config) self.__hasher: Optional[PerceptualHasher] = None - @property - def name(self) -> str: - return 'image_hashing' - @property def supports_batch_processing(self) -> bool: return True From 7cb1d7a4fc7592f6697fe0ab0fa35e29d354c9ec Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 17 Feb 2026 09:17:43 +0100 Subject: [PATCH 47/89] Refactor FFmpeg usage and bitrate config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Centralize ffmpeg/ffprobe logic into FFmpegWrapper and replace ad-hoc subprocess calls across services. Add utilities for probing, extracting audio/frames, normalizing audio, and getting keyframe timestamps. Rename and change bitrate configuration: min_upscale_bitrate_ratio → min_bitrate_mbps and add bitrate_boost_ratio (with defaults), update defaults.json and step configs accordingly. Update VideoTranscoderStep to simplify bitrate computation (min/max/boost behavior), fix upscaling detection, and return/log transcode command output. Audio extraction and normalization now use the wrapper; frame export and file validators use the wrapper for probing and frame extraction. Extend FrameRequest type with original_timestamp and snapped_to_keyframe, and add audio extraction config fields (sample_rate, channels, format). Overall this centralizes media handling, reduces duplicated subprocess code, and makes bitrate behavior more explicit and configurable. 
--- preprocessor/app/pipeline_factory.py | 3 +- preprocessor/config/series_config.py | 6 +- preprocessor/config/step_configs.py | 6 +- preprocessor/config/step_defaults.py | 3 +- preprocessor/config/types/frame.py | 2 + preprocessor/series_configs/defaults.json | 3 +- preprocessor/services/audio/extraction.py | 26 ++-- preprocessor/services/media/ffmpeg.py | 92 +++++++++++++- .../processors/audio_normalizer.py | 26 ++-- .../services/validation/file_validators.py | 11 +- preprocessor/steps/video/frame_export_step.py | 60 ++-------- preprocessor/steps/video/transcoding_step.py | 113 ++++++------------ 12 files changed, 166 insertions(+), 185 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index df6db9920..368b3ca9a 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -173,7 +173,8 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t max_bitrate_file_size_mb=series_config.processing.transcode.max_bitrate_file_size_mb, max_bitrate_duration_seconds=series_config.processing.transcode.max_bitrate_duration_seconds, keyframe_interval_seconds=series_config.processing.transcode.keyframe_interval_seconds, - min_upscale_bitrate_ratio=series_config.processing.transcode.min_upscale_bitrate_ratio, + min_bitrate_mbps=series_config.processing.transcode.min_bitrate_mbps, + bitrate_boost_ratio=series_config.processing.transcode.bitrate_boost_ratio, force_deinterlace=series_config.processing.transcode.force_deinterlace, ), ) diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 1d0c98919..6969f826d 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -55,11 +55,12 @@ class TranscriptionProcessingConfig: @dataclass class TranscodeProcessingConfig: + bitrate_boost_ratio: float force_deinterlace: bool keyframe_interval_seconds: float max_bitrate_duration_seconds: float 
max_bitrate_file_size_mb: float - min_upscale_bitrate_ratio: float + min_bitrate_mbps: float resolution: str @@ -152,7 +153,8 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': transcode=TranscodeProcessingConfig( max_bitrate_file_size_mb=data['processing']['transcode']['max_bitrate_file_size_mb'], max_bitrate_duration_seconds=data['processing']['transcode']['max_bitrate_duration_seconds'], - min_upscale_bitrate_ratio=data['processing']['transcode']['min_upscale_bitrate_ratio'], + min_bitrate_mbps=data['processing']['transcode']['min_bitrate_mbps'], + bitrate_boost_ratio=data['processing']['transcode']['bitrate_boost_ratio'], force_deinterlace=data['processing']['transcode']['force_deinterlace'], keyframe_interval_seconds=data['processing']['transcode']['keyframe_interval_seconds'], resolution=data['processing']['transcode']['resolution'], diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 4427c237c..367fcb67f 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -17,12 +17,13 @@ class TranscodeConfig(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) + bitrate_boost_ratio: float = Field(default=1.1, ge=1.0, le=2.0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) max_bitrate_duration_seconds: float = Field(gt=0) max_bitrate_file_size_mb: float = Field(gt=0) max_parallel_episodes: int = Field(default=3, ge=1, le=10) - min_upscale_bitrate_ratio: float = Field(default=0.52, ge=0, le=1) + min_bitrate_mbps: float = Field(default=2.0, gt=0) resolution: Resolution = Field(default=Resolution.R720P) @property @@ -141,7 +142,10 @@ class ElasticsearchConfig(BaseModel): class AudioExtractionConfig(BaseModel): + channels: int = Field(default=1, ge=1, le=2) + format: str = 'wav' max_parallel_episodes: int = Field(default=4, ge=1, le=8) + sample_rate: int = Field(default=48000, ge=8000, le=96000) class CharacterDetectionConfig(BaseModel): 
diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index aef97264b..ce985e79d 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -29,7 +29,8 @@ def get_configs(series_name: str) -> Dict[str, object]: max_bitrate_file_size_mb=50.0, max_bitrate_duration_seconds=100.0, keyframe_interval_seconds=0.5, - min_upscale_bitrate_ratio=0.52, + min_bitrate_mbps=2.0, + bitrate_boost_ratio=1.1, ), 'transcribe': WhisperTranscriptionConfig( model='large-v3-turbo', diff --git a/preprocessor/config/types/frame.py b/preprocessor/config/types/frame.py index 70ffedef9..dc2e637f7 100644 --- a/preprocessor/config/types/frame.py +++ b/preprocessor/config/types/frame.py @@ -9,3 +9,5 @@ class FrameRequest(TypedDict): timestamp: float type: str scene_number: NotRequired[int] + original_timestamp: NotRequired[float] + snapped_to_keyframe: NotRequired[bool] diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index 12eca464c..d69ee5677 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -18,11 +18,12 @@ "threshold": 0.5 }, "transcode": { + "bitrate_boost_ratio": 1.1, "force_deinterlace": false, "keyframe_interval_seconds": 0.5, "max_bitrate_duration_seconds": 100.0, "max_bitrate_file_size_mb": 50.0, - "min_upscale_bitrate_ratio": 0.52, + "min_bitrate_mbps": 2.0, "resolution": "720p" }, "transcription": { diff --git a/preprocessor/services/audio/extraction.py b/preprocessor/services/audio/extraction.py index 578fc5c0f..74868f967 100644 --- a/preprocessor/services/audio/extraction.py +++ b/preprocessor/services/audio/extraction.py @@ -1,6 +1,4 @@ from pathlib import Path -import subprocess -from typing import List from preprocessor.config.step_configs import AudioExtractionConfig from preprocessor.core.artifacts import ( @@ -9,6 +7,7 @@ ) from preprocessor.core.base_step import PipelineStep from 
preprocessor.core.context import ExecutionContext +from preprocessor.services.media.ffmpeg import FFmpegWrapper class AudioExtractionStep(PipelineStep[SourceVideo, AudioArtifact, AudioExtractionConfig]): @@ -51,27 +50,20 @@ def __is_cached( def __extract_audio( self, input_path: Path, output_path: Path, context: ExecutionContext, ) -> None: - command = self.__build_ffmpeg_command(input_path, output_path) - try: - subprocess.run(command, check=True) - except subprocess.CalledProcessError as e: + FFmpegWrapper.extract_audio( + input_path, + output_path, + codec='pcm_s16le', + sample_rate=self.config.sample_rate, + channels=self.config.channels, + ) + except Exception as e: context.logger.error(f'FFmpeg audio extraction failed: {e}') if output_path.exists(): output_path.unlink() raise - def __build_ffmpeg_command(self, input_path: Path, output_path: Path) -> List[str]: - return [ - 'ffmpeg', '-y', '-v', 'error', - '-i', str(input_path), - '-vn', # Disable video processing - '-acodec', 'pcm_s16le', - '-ar', str(self.config.sample_rate), - '-ac', str(self.config.channels), - str(output_path), - ] - def __create_artifact(self, input_data: SourceVideo, output_path: Path) -> AudioArtifact: return AudioArtifact( episode_id=input_data.episode_id, diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 6520393fe..edc478955 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -1,3 +1,4 @@ +from io import BytesIO import json from pathlib import Path import re @@ -11,6 +12,8 @@ Union, ) +from PIL import Image + from preprocessor.services.media.transcode_params import TranscodeParams @@ -163,7 +166,7 @@ def probe_video(video_path: Path) -> Dict[str, Any]: return json.loads(result.stdout) @staticmethod - def transcode(params: TranscodeParams) -> None: + def transcode(params: TranscodeParams) -> Optional[str]: width, height = params.get_resolution_tuple() vf_filter = 
FFmpegWrapper.__build_video_filter( width, height, params.deinterlace, params.is_upscaling, @@ -187,10 +190,91 @@ def transcode(params: TranscodeParams) -> None: ), ) - if params.log_command: - FFmpegWrapper.__log_ffmpeg_command(command) - + log_output = FFmpegWrapper.__log_ffmpeg_command(command) if params.log_command else None subprocess.run(command, check=True, capture_output=False) + return log_output + + @staticmethod + def get_audio_streams(video_path: Path) -> List[Dict[str, Any]]: + cmd = [ + 'ffprobe', '-v', 'error', '-select_streams', 'a', + '-show_entries', 'stream=index,bit_rate,codec_name,channels,sample_rate', + '-of', 'json', str(video_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + return json.loads(result.stdout).get('streams', []) + + @staticmethod + def extract_audio( + video_path: Path, + output_path: Path, + audio_stream_index: Optional[int] = None, + codec: str = 'pcm_s16le', + sample_rate: int = 48000, + channels: int = 1, + ) -> None: + cmd = ['ffmpeg', '-y', '-i', str(video_path)] + + if audio_stream_index is not None: + cmd.extend(['-map', f'0:{audio_stream_index}']) + + cmd.extend([ + '-acodec', codec, + '-ar', str(sample_rate), + '-ac', str(channels), + str(output_path), + ]) + + subprocess.run( + cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + + @staticmethod + def normalize_audio(input_path: Path, output_path: Path) -> None: + cmd = [ + 'ffmpeg', '-y', '-i', str(input_path), + '-af', 'dynaudnorm', + str(output_path), + ] + subprocess.run( + cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + + @staticmethod + def extract_frame_at_timestamp(video_path: Path, timestamp: float) -> Image.Image: + cmd = [ + 'ffmpeg', + '-ss', str(timestamp), + '-i', str(video_path), + '-frames:v', '1', + '-f', 'image2pipe', + '-vcodec', 'bmp', + '-', + ] + result = subprocess.run(cmd, capture_output=True, check=True) + return Image.open(BytesIO(result.stdout)) + + 
@staticmethod + def get_keyframe_timestamps(video_path: Path) -> List[float]: + cmd = [ + 'ffprobe', + '-skip_frame', 'nokey', + '-select_streams', 'v:0', + '-show_entries', 'frame=pkt_pts_time', + '-of', 'json', + str(video_path), + ] + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + data: Dict[str, Any] = json.loads(result.stdout) + frames: List[Dict[str, Any]] = data.get('frames', []) + + timestamps = [] + for frame in frames: + pts = frame.get('pkt_pts_time') + if pts: + timestamps.append(float(pts)) + + return timestamps @staticmethod def __log_ffmpeg_command(command: List[str]) -> None: diff --git a/preprocessor/services/transcription/processors/audio_normalizer.py b/preprocessor/services/transcription/processors/audio_normalizer.py index bc84b2593..570356bbc 100644 --- a/preprocessor/services/transcription/processors/audio_normalizer.py +++ b/preprocessor/services/transcription/processors/audio_normalizer.py @@ -1,6 +1,4 @@ -import json from pathlib import Path -import subprocess from typing import ( List, Optional, @@ -8,6 +6,7 @@ from preprocessor.services.core.base_processor import BaseProcessor from preprocessor.services.core.logging import ErrorHandlingLogger +from preprocessor.services.media.ffmpeg import FFmpegWrapper class AudioNormalizer: @@ -52,12 +51,7 @@ def __process_video(self, video: Path) -> None: self.__logger.error(f'Error processing video {video}: {e}') def __get_best_audio_stream(self, video: Path) -> Optional[int]: - cmd = [ - 'ffprobe', '-v', 'error', '-select_streams', 'a', - '-show_entries', 'stream=index,bit_rate', '-of', 'json', str(video), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - streams = json.loads(result.stdout).get('streams', []) + streams = FFmpegWrapper.get_audio_streams(video) if not streams: self.__logger.error(f'No audio streams found in file: {video}') @@ -67,19 +61,13 @@ def __get_best_audio_stream(self, video: Path) -> Optional[int]: return 
best_stream['index'] def __execute_normalization_pipeline(self, video: Path, audio_idx: int, output: Path) -> None: - self.__extract_audio(video, audio_idx, output) + FFmpegWrapper.extract_audio( + video, output, audio_stream_index=audio_idx, + codec='pcm_s16le', sample_rate=48000, channels=1, + ) tmp_output = output.with_name(output.stem + '_temp.wav') - normalize_cmd = ['ffmpeg', '-y', '-i', str(output), '-af', 'dynaudnorm', str(tmp_output)] - subprocess.run(normalize_cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + FFmpegWrapper.normalize_audio(output, tmp_output) tmp_output.replace(output) self.__logger.info(f'Normalization complete: {output.name}') - - @staticmethod - def __extract_audio(video: Path, audio_idx: int, output: Path) -> None: - cmd = [ - 'ffmpeg', '-y', '-i', str(video), '-map', f'0:{audio_idx}', - '-acodec', 'pcm_s16le', '-ar', '48000', '-ac', '1', str(output), - ] - subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) diff --git a/preprocessor/services/validation/file_validators.py b/preprocessor/services/validation/file_validators.py index e57729061..0452f44fe 100644 --- a/preprocessor/services/validation/file_validators.py +++ b/preprocessor/services/validation/file_validators.py @@ -1,7 +1,6 @@ from dataclasses import dataclass import json from pathlib import Path -import subprocess from typing import ( Any, Dict, @@ -16,6 +15,7 @@ FfprobeStreamKeys, ValidationMetadataKeys, ) +from preprocessor.services.media.ffmpeg import FFmpegWrapper @dataclass @@ -112,11 +112,4 @@ def __verify_existence(path: Path) -> Optional[ValidationResult]: @staticmethod def __run_ffprobe(path: Path) -> Dict[str, Any]: - res = subprocess.run( - [ - 'ffprobe', '-v', 'error', '-select_streams', 'v:0', '-show_entries', - 'stream=codec_name,width,height,duration:format=duration,size', '-of', 'json', str(path), - ], - capture_output=True, text=True, check=True, - ) - return json.loads(res.stdout) + return 
FFmpegWrapper.probe_video(path) diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index bbd9daabf..207ef13b0 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -1,10 +1,8 @@ import bisect from datetime import datetime -from io import BytesIO import json from pathlib import Path import shutil -import subprocess from typing import ( Any, Dict, @@ -27,6 +25,7 @@ ) from preprocessor.core.temp_files import StepTempFile from preprocessor.services.io.files import FileOperations +from preprocessor.services.media.ffmpeg import FFmpegWrapper from preprocessor.services.video.strategies.strategy_factory import KeyframeStrategyFactory @@ -213,22 +212,7 @@ def __extract_and_save_frame( @staticmethod def __extract_frame_at_timestamp(video_file: Path, timestamp: float) -> Image.Image: - cmd = [ - 'ffmpeg', - '-ss', str(timestamp), - '-i', str(video_file), - '-frames:v', '1', - '-f', 'image2pipe', - '-vcodec', 'bmp', - '-', - ] - result = subprocess.run( - cmd, - capture_output=True, - check=True, - stdin=subprocess.DEVNULL, - ) - return Image.open(BytesIO(result.stdout)) + return FFmpegWrapper.extract_frame_at_timestamp(video_file, timestamp) def __resize_frame( self, frame: Image.Image, display_aspect_ratio: float, @@ -346,19 +330,13 @@ def __construct_empty_result( @staticmethod def __fetch_video_metadata(video_path: Path) -> Dict[str, Any]: - cmd = [ - 'ffprobe', '-v', 'error', '-select_streams', 'v:0', - '-show_entries', - 'stream=width,height,sample_aspect_ratio,display_aspect_ratio', - '-of', 'json', str(video_path), - ] - result = subprocess.run(cmd, capture_output=True, text=True, check=True) - probe_data: Dict[str, Any] = json.loads(result.stdout) + probe_data = FFmpegWrapper.probe_video(video_path) streams: List[Dict[str, Any]] = probe_data.get('streams', []) - if not streams: + video_streams = [s for s in streams if s.get('codec_type') == 'video'] + if 
not video_streams: raise ValueError(f'No video streams found in {video_path}') - return streams[0] + return video_streams[0] @staticmethod def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: @@ -382,31 +360,7 @@ def __calculate_display_aspect_ratio(metadata: Dict[str, Any]) -> float: @staticmethod def __get_all_keyframes(video_file: Path) -> List[float]: - cmd = [ - 'ffprobe', - '-skip_frame', 'nokey', - '-select_streams', 'v:0', - '-show_entries', 'frame=pkt_pts_time', - '-of', 'json', - str(video_file), - ] - result = subprocess.run( - cmd, - capture_output=True, - text=True, - check=True, - stdin=subprocess.DEVNULL, - ) - data: Dict[str, Any] = json.loads(result.stdout) - frames: List[Dict[str, Any]] = data.get('frames', []) - - keyframes = [ - float(frame['pkt_pts_time']) - for frame in frames - if 'pkt_pts_time' in frame - ] - - return sorted(keyframes) + return sorted(FFmpegWrapper.get_keyframe_timestamps(video_file)) @staticmethod def __snap_to_keyframe( diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 1a7baa8a7..aaa6c642f 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -1,11 +1,9 @@ from dataclasses import replace -import math from pathlib import Path from typing import ( Any, Dict, List, - Tuple, ) from preprocessor.config.step_configs import TranscodeConfig @@ -27,6 +25,7 @@ class VideoTranscoderStep(PipelineStep[SourceVideo, TranscodedVideo, TranscodeCo 'hevc': 2.0, 'h265': 2.0, 'vp9': 2.85, 'av1': 4.0, } + __TARGET_FRAMERATE: float = 25.0 __command_logged: bool = False @property @@ -83,12 +82,9 @@ def __create_transcode_params( probe_data: Dict[str, Any], context: ExecutionContext, ) -> TranscodeParams: - target_fps = self.__resolve_target_framerate() - is_upscaling, src_px, target_px = self.__analyze_resolution_scaling(probe_data) - - bitrates = self.__compute_all_bitrate_settings( - probe_data, context, 
is_upscaling, src_px, target_px, - ) + target_fps = self.__TARGET_FRAMERATE + bitrates = self.__compute_all_bitrate_settings(probe_data, context) + is_upscaling = self.__is_upscaling(probe_data) return TranscodeParams( input_path=input_data.path, @@ -108,50 +104,47 @@ def __create_transcode_params( log_command=self.__should_log_command(), ) - def __analyze_resolution_scaling(self, probe_data: Dict[str, Any]) -> Tuple[bool, int, int]: + def __is_upscaling(self, probe_data: Dict[str, Any]) -> bool: w, h = FFmpegWrapper.get_resolution(probe_data) sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) - eff_w = int(w * sar_num / sar_denom) src_px = eff_w * h target_px = self.config.resolution.width * self.config.resolution.height - - return src_px < target_px, src_px, target_px + return src_px < target_px def __compute_all_bitrate_settings( - self, - probe_data: Dict[str, Any], - context: ExecutionContext, - is_up: bool, - src_px: int, - target_px: int, + self, probe_data: Dict[str, Any], context: ExecutionContext, ) -> Dict[str, float]: - src_v = FFmpegWrapper.get_video_bitrate(probe_data) - target_max = self.config.video_bitrate_mbps - - if not src_v: - return self.__build_fallback_bitrates(target_max) - - norm_v = self.__get_normalized_bitrate(src_v, probe_data, is_up, context) - ratio = target_px / src_px - exp = self.__calculate_scaling_exponent(ratio, is_up) - - scaled_raw = norm_v * (ratio**exp) - scaled_min = self.__apply_min_upscale_constraint(scaled_raw, target_max, is_up) - final_v = min(scaled_min, target_max) - - self.__log_bitrate_workflow( - context, src_v, norm_v, scaled_raw, scaled_min, final_v, target_max, ratio, is_up, + src_bitrate = FFmpegWrapper.get_video_bitrate(probe_data) + min_bitrate = self.config.min_bitrate_mbps + max_bitrate = self.config.video_bitrate_mbps + + if not src_bitrate: + return self.__build_fallback_bitrates(max_bitrate) + + normalized_bitrate = self.__get_normalized_bitrate(src_bitrate, probe_data, context) + + 
if normalized_bitrate < min_bitrate: + final_bitrate = min_bitrate + adjustment = f"boosted to minimum ({min_bitrate} Mbps)" + elif normalized_bitrate > max_bitrate: + final_bitrate = max_bitrate + adjustment = f"capped to maximum ({max_bitrate} Mbps)" + else: + final_bitrate = normalized_bitrate * self.config.bitrate_boost_ratio + boost_percent = (self.config.bitrate_boost_ratio - 1.0) * 100 + adjustment = f"boosted by {boost_percent:.0f}%" + + context.logger.info( + f'Bitrate: {src_bitrate:.2f} → {normalized_bitrate:.2f} → {final_bitrate:.2f} Mbps ' + f'({adjustment})', ) - return self.__scale_bitrate_limits(final_v / target_max) + return self.__scale_bitrate_limits(final_bitrate / max_bitrate) def __get_normalized_bitrate( - self, src_v: float, probe: Dict[str, Any], is_up: bool, context: ExecutionContext, + self, src_v: float, probe: Dict[str, Any], context: ExecutionContext, ) -> float: - if not is_up: - return src_v - src_codec = self.__normalize_codec_name(FFmpegWrapper.get_video_codec(probe)) tgt_codec = self.__normalize_codec_name(self.config.codec) mult = self.__get_codec_efficiency_multiplier(src_codec, tgt_codec) @@ -159,17 +152,12 @@ def __get_normalized_bitrate( if mult != 1.0: norm = src_v * mult context.logger.info( - f'Codec: {src_codec.upper()}->{tgt_codec.upper()} ({mult}x) | ' + f'Codec: {src_codec.upper()}->{tgt_codec.upper()} ({mult:.2f}x) | ' f'{src_v:.2f}->{norm:.2f} Mbps', ) return norm return src_v - def __apply_min_upscale_constraint(self, scaled: float, target_max: float, is_up: bool) -> float: - if not is_up: - return scaled - return max(scaled, target_max * self.config.min_upscale_bitrate_ratio) - def __scale_bitrate_limits(self, scale: float) -> Dict[str, float]: return { "video": round(self.config.video_bitrate_mbps * scale, 2), @@ -213,9 +201,10 @@ def __execute_ffmpeg_process( temp_params = replace(params, output_path=temp_path) context.mark_step_started(self.name, ep_id, [str(temp_path)]) - if temp_params.log_command: + 
command_log = FFmpegWrapper.transcode(temp_params) + if command_log: context.logger.info('=' * 20 + ' FFmpeg ' + '=' * 20) - FFmpegWrapper.transcode(temp_params) + context.logger.info(command_log) def __construct_result_artifact(self, path: Path, input_data: SourceVideo) -> TranscodedVideo: return TranscodedVideo( @@ -234,13 +223,6 @@ def __should_log_command() -> bool: return True return False - @staticmethod - def __calculate_scaling_exponent(ratio: float, is_up: bool) -> float: - log_r = math.log10(max(ratio, 0.01)) - if is_up: - return 0.8 + min(log_r, 1.0) * 0.35 - return 0.8 + max(log_r, -2.0) * 0.175 - @staticmethod def __normalize_codec_name(codec: str) -> str: name = codec.lower() @@ -260,25 +242,6 @@ def __get_codec_efficiency_multiplier(src: str, tgt: str) -> float: eff = VideoTranscoderStep.__CODEC_EFFICIENCY return eff.get(src, 1.0) / eff.get(tgt, 1.0) - @staticmethod - def __log_bitrate_workflow( - ctx: ExecutionContext, - src: float, - norm: float, - raw: float, - s_min: float, - final: float, - limit: float, - ratio: float, - is_up: bool, - ) -> None: - dir_label = "upscaling" if is_up else ("downscaling" if ratio < 1.0 else "same") - min_msg = f' (MinBoost: {s_min:.2f})' if is_up and (s_min > raw) else '' - ctx.logger.info( - f'[{dir_label}] {src:.2f}->{norm:.2f}->{raw:.2f}{min_msg} -> {final:.2f} Mbps ' - f'(Max: {limit})', - ) - @staticmethod def __log_transcode_details( ctx: ExecutionContext, @@ -295,7 +258,3 @@ def __log_transcode_details( @staticmethod def __log_int_diagnostics(ctx: ExecutionContext, has_int: bool, stats: Dict[str, float], order: str) -> None: ctx.logger.info(f"Interlacing: {has_int} ({stats['ratio'] * 100:.1f}%) | {order}") - - @staticmethod - def __resolve_target_framerate() -> float: - return 25.0 From 692385a6ceed512da4d9e219b8b2c24b32e3f41e Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 17 Feb 2026 09:24:34 +0100 Subject: [PATCH 48/89] Improve ffmpeg logging and add batch 
info log Change __log_ffmpeg_command to return a single-line command string instead of printing multi-line output, and remove the redundant '-stats' flag from the ffmpeg invocation. Also add an informational log in VideoTranscoderStep.execute_batch that reports the total number of videos and how many will be processed in parallel (computed as min(max_parallel_episodes, total)). These changes make command logging easier to integrate into structured logs and add visibility into batch transcoding concurrency. --- preprocessor/services/media/ffmpeg.py | 12 +++--------- preprocessor/steps/video/transcoding_step.py | 5 +++++ 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index edc478955..91b0c5352 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -277,14 +277,8 @@ def get_keyframe_timestamps(video_path: Path) -> List[float]: return timestamps @staticmethod - def __log_ffmpeg_command(command: List[str]) -> None: - print('ffmpeg \\') - for i, arg in enumerate(command[1:], 1): - if i == len(command) - 1: - print(f' {arg}') - else: - print(f' {arg} \\') - print() + def __log_ffmpeg_command(command: List[str]) -> str: + return ' '.join(command) @staticmethod def __build_audio_and_output_params( @@ -306,7 +300,7 @@ def __build_base_command( input_path: Path, codec: str, preset: str, target_fps: Optional[float], ) -> List[str]: command = [ - 'ffmpeg', '-v', 'error', '-stats', '-hide_banner', '-y', + 'ffmpeg', '-v', 'error', '-hide_banner', '-y', '-sws_flags', 'accurate_rnd+full_chroma_int+full_chroma_inp', '-i', str(input_path), '-c:v', codec, diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index aaa6c642f..e47af4823 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -35,6 +35,11 @@ def supports_batch_processing(self) -> 
bool: def execute_batch( self, input_data: List[SourceVideo], context: ExecutionContext, ) -> List[TranscodedVideo]: + total = len(input_data) + parallel = min(self.config.max_parallel_episodes, total) + context.logger.info( + f'Transcoding {total} videos (processing {parallel} in parallel)', + ) return self._execute_with_threadpool( input_data, context, self.config.max_parallel_episodes, self.execute, ) From b737e88c508e37bf0553d419adc54c8df646a921 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 17 Feb 2026 10:56:04 +0100 Subject: [PATCH 49/89] Silence ffmpeg and improve interlace logs Redirect ffmpeg subprocess stdout/stderr to DEVNULL to suppress noisy output. Enhance interlacing diagnostics in the transcoding step: log when force_deinterlace is enabled, report the detected interlace ratio and field order, and provide distinct messages for detected vs. not-detected cases. Remove the obsolete __log_int_diagnostics helper. --- preprocessor/services/media/ffmpeg.py | 4 +++- preprocessor/steps/video/transcoding_step.py | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 91b0c5352..9fea6fdc1 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -191,7 +191,9 @@ def transcode(params: TranscodeParams) -> Optional[str]: ) log_output = FFmpegWrapper.__log_ffmpeg_command(command) if params.log_command else None - subprocess.run(command, check=True, capture_output=False) + subprocess.run( + command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) return log_output @staticmethod diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index e47af4823..259f1ff7a 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -183,11 +183,22 @@ def 
__resolve_deinterlacing_strategy( self, input_data: SourceVideo, context: ExecutionContext, probe: Dict[str, Any], ) -> bool: if self.config.force_deinterlace: + context.logger.info('Deinterlacing: FORCED') return True has_int, stats = FFmpegWrapper.detect_interlacing(input_data.path) if not stats: return False - self.__log_int_diagnostics(context, has_int, stats, FFmpegWrapper.get_field_order(probe)) + + field_order = FFmpegWrapper.get_field_order(probe) + ratio_pct = stats['ratio'] * 100 + + if has_int: + context.logger.info( + f"Interlacing detected ({ratio_pct:.1f}%) | {field_order} → APPLYING deinterlace filter", + ) + else: + context.logger.info(f"Interlacing: No ({ratio_pct:.1f}%) | {field_order}") + return has_int def __compute_audio_bitrate(self, probe: Dict[str, Any], context: ExecutionContext) -> int: @@ -259,7 +270,3 @@ def __log_transcode_details( ctx.logger.info( f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{up_label}]', ) - - @staticmethod - def __log_int_diagnostics(ctx: ExecutionContext, has_int: bool, stats: Dict[str, float], order: str) -> None: - ctx.logger.info(f"Interlacing: {has_int} ({stats['ratio'] * 100:.1f}%) | {order}") From bc15db42d7fbda9097d5ed92426bf6af7eb7e7cf Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 17 Feb 2026 13:38:18 +0100 Subject: [PATCH 50/89] Remove ES index mappings and add local types Remove embedded Elasticsearch index mapping constants from ElasticSearchManager and switch reindex service to use the external ElasticSearchManager implementation (preprocessor.search.elastic_manager). Update ReindexService to pass ES host/user/password from settings when connecting. Replace previous type imports with concrete TypedDict definitions in bot/types.py to centralize and expand local API types (segments, episodes, scenes, detections, embeddings, etc.). These changes decouple index mappings from this repo and provide explicit local type definitions for static typing. 
--- bot/search/elastic_search_manager.py | 122 ------------ bot/services/reindex/reindex_service.py | 6 +- bot/types.py | 246 ++++++++++++++++++------ 3 files changed, 196 insertions(+), 178 deletions(-) diff --git a/bot/search/elastic_search_manager.py b/bot/search/elastic_search_manager.py index 6b8d205e1..10d3aac14 100644 --- a/bot/search/elastic_search_manager.py +++ b/bot/search/elastic_search_manager.py @@ -27,128 +27,6 @@ class ElasticSearchManager: - SEGMENTS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "segment_id": {"type": "integer"}, - "text": {"type": "text"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "speaker": {"type": "keyword"}, - "video_path": {"type": "keyword"}, - "scene_info": {"type": "object"}, - }, - }, - } - - TEXT_EMBEDDINGS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "embedding_id": {"type": "integer"}, - "segment_range": {"type": "integer"}, - "text": {"type": "text"}, - "text_embedding": { - "type": "dense_vector", - "dims": 4096, - "index": True, - "similarity": "cosine", - }, - }, - }, - } - - VIDEO_EMBEDDINGS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "frame_number": {"type": "integer"}, - "timestamp": {"type": "float"}, - "frame_type": {"type": "keyword"}, - "video_path": {"type": "keyword"}, - "video_embedding": { - "type": "dense_vector", - "dims": 4096, - "index": True, - "similarity": "cosine", - }, - }, - }, - } - - EPISODE_NAMES_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "title": {"type": "text"}, - "title_embedding": { - "type": "dense_vector", - "dims": 4096, - "index": True, - "similarity": "cosine", - }, - }, - }, - } - - 
FULL_EPISODE_EMBEDDINGS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "full_transcript": {"type": "text"}, - "full_episode_embedding": { - "type": "dense_vector", - "dims": 4096, - "index": True, - "similarity": "cosine", - }, - }, - }, - } - - SOUND_EVENTS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "segment_id": {"type": "integer"}, - "text": {"type": "text"}, - "sound_type": {"type": "keyword"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "video_path": {"type": "keyword"}, - "scene_info": {"type": "object"}, - }, - }, - } - - SOUND_EVENT_EMBEDDINGS_INDEX_MAPPING = { - "mappings": { - "properties": { - "episode_id": {"type": "keyword"}, - "episode_metadata": {"type": "object"}, - "embedding_id": {"type": "integer"}, - "segment_range": {"type": "object"}, - "text": {"type": "text"}, - "sound_types": {"type": "keyword"}, - "start_time": {"type": "float"}, - "end_time": {"type": "float"}, - "sound_event_embedding": { - "type": "dense_vector", - "dims": 4096, - "index": True, - "similarity": "cosine", - }, - }, - }, - } - @staticmethod async def connect_to_elasticsearch(logger: logging.Logger) -> AsyncElasticsearch: es = AsyncElasticsearch( diff --git a/bot/services/reindex/reindex_service.py b/bot/services/reindex/reindex_service.py index c06b688b9..1c8d67b45 100644 --- a/bot/services/reindex/reindex_service.py +++ b/bot/services/reindex/reindex_service.py @@ -18,10 +18,11 @@ async_bulk, ) -from bot.search.elastic_search_manager import ElasticSearchManager from bot.services.reindex.series_scanner import SeriesScanner from bot.services.reindex.video_path_transformer import VideoPathTransformer from bot.services.reindex.zip_extractor import ZipExtractor +from bot.settings import settings +from preprocessor.search.elastic_manager import ElasticSearchManager # pylint: 
disable=no-name-in-module @dataclass @@ -187,6 +188,9 @@ async def reindex_series( async def __init_elasticsearch(self) -> None: if self.__es_manager is None: self.__es_manager = await ElasticSearchManager.connect_to_elasticsearch( + settings.ES_HOST, + settings.ES_USER, + settings.ES_PASS.get_secret_value(), self.__logger, ) diff --git a/bot/types.py b/bot/types.py index a1520ef5a..c8bb18c44 100644 --- a/bot/types.py +++ b/bot/types.py @@ -1,57 +1,193 @@ -from preprocessor.config.types import ( - BaseSegment, - CharacterDetectionInFrame, - ClipSegment, - Detection, - ElasticsearchAggregations, - ElasticsearchHit, - ElasticsearchHits, - ElasticsearchResponse, - ElasticsearchSegment, - EpisodeBucket, - EpisodeInfo, - EpisodeMetadata, - FrameRequest, - HashResult, - ObjectDetectionInFrame, - SceneDict, - SceneTimestamp, - SceneTimestampPoint, - SceneTimestampsData, - SearchSegment, - SeasonBucket, - SeasonInfo, - SeasonInfoDict, - SegmentWithScore, - TranscriptionContext, - VideoMetadata, +from typing import ( + Any, + Dict, + List, + NotRequired, + TypedDict, + Union, ) -__all__ = [ - "BaseSegment", - "CharacterDetectionInFrame", - "ClipSegment", - "Detection", - "ElasticsearchAggregations", - "ElasticsearchHit", - "ElasticsearchHits", - "ElasticsearchResponse", - "ElasticsearchSegment", - "EpisodeBucket", - "EpisodeInfo", - "EpisodeMetadata", - "FrameRequest", - "HashResult", - "ObjectDetectionInFrame", - "SceneDict", - "SceneTimestamp", - "SceneTimestampPoint", - "SceneTimestampsData", - "SeasonBucket", - "SeasonInfo", - "SeasonInfoDict", - "SearchSegment", - "SegmentWithScore", - "TranscriptionContext", - "VideoMetadata", -] + +class EpisodeInfo(TypedDict): + episode_number: int + title: str + premiere_date: str + viewership: Union[str, int, float] + + +class EpisodeMetadata(TypedDict): + season: int + episode_number: int + title: str + premiere_date: str + viewership: Union[str, int, float] + series_name: str + + +class SeasonInfo(TypedDict): + pass + + 
+SeasonInfoDict = Dict[str, int] + + +class BaseSegment(TypedDict): + id: int + text: str + start: float + end: float + + +class SegmentWithTimes(TypedDict): + segment_id: int + text: str + start_time: float + end_time: float + episode_metadata: EpisodeMetadata + video_path: NotRequired[str] + + +class SegmentWithScore(SegmentWithTimes): + _score: float + + +class ElasticsearchSegment(TypedDict): + segment_id: NotRequired[int] + id: NotRequired[int] + text: str + start_time: NotRequired[float] + start: NotRequired[float] + end_time: NotRequired[float] + end: NotRequired[float] + episode_metadata: NotRequired[EpisodeMetadata] + episode_info: NotRequired[EpisodeMetadata] + video_path: NotRequired[str] + _score: NotRequired[float] + + +class TranscriptionContext(TypedDict): + target: ElasticsearchSegment + context: List[BaseSegment] + overall_start_time: float + overall_end_time: float + + +class ClipSegment(TypedDict): + video_path: Union[str, Any] + start_time: float + end_time: float + + +class SearchSegment(TypedDict): + season: int + episode_number: int + title: str + start_time: float + end_time: float + + +class ElasticsearchHit(TypedDict): + _source: ElasticsearchSegment + _score: float + + +class ElasticsearchHits(TypedDict): + hits: List[ElasticsearchHit] + total: Dict[str, Any] + max_score: float + + +class ElasticsearchResponse(TypedDict): + hits: ElasticsearchHits + aggregations: NotRequired[Dict[str, Any]] + took: int + timed_out: bool + + +class EpisodeBucket(TypedDict): + key: int + doc_count: int + episode_metadata: Dict[str, Any] + + +class SeasonBucket(TypedDict): + key: int + doc_count: int + unique_episodes: Dict[str, int] + + +class ElasticsearchAggregations(TypedDict): + seasons: Dict[str, Union[List[SeasonBucket], int]] + unique_episodes: Dict[str, Union[List[EpisodeBucket], int]] + buckets: NotRequired[List[Union[SeasonBucket, EpisodeBucket]]] + + +class SceneDict(TypedDict): + scene_number: int + start_frame: int + end_frame: int + 
start_time: float + end_time: float + fps: float + + +class FrameRequest(TypedDict): + frame: int + time: float + type: str + scene_number: NotRequired[int] + + +class HashResult(TypedDict): + frame_number: int + timestamp: float + hash: str + file_path: NotRequired[str] + + +class Detection(TypedDict): + bbox: List[int] + confidence: float + class_id: NotRequired[int] + class_name: NotRequired[str] + name: NotRequired[str] + + +class VideoMetadata(TypedDict): + width: int + height: int + fps: float + duration: float + codec: NotRequired[str] + bitrate: NotRequired[int] + + +class SceneTimestampPoint(TypedDict): + frame: int + seconds: float + + +class SceneTimestamp(TypedDict): + scene_number: int + start: SceneTimestampPoint + end: SceneTimestampPoint + + +class SceneTimestampsData(TypedDict): + scenes: List[SceneTimestamp] + total_scenes: NotRequired[int] + fps: NotRequired[float] + + +class CharacterDetectionInFrame(TypedDict): + name: str + confidence: float + bbox: List[int] + embedding: NotRequired[List[float]] + + +class ObjectDetectionInFrame(TypedDict): + class_name: str + class_id: int + confidence: float + bbox: List[int] From 410c27a8135ab21614a601d88069a024717b6190 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:17:24 +0100 Subject: [PATCH 51/89] Parallelize frame export, improve ffmpeg Add .gitattributes and a few robustness and performance improvements: - Add .gitattributes to normalize EOLs. - Add config field max_parallel_frames (default 4) to control frame extraction concurrency. - Frame export: deduplicate frame timestamps by millisecond and use ThreadPoolExecutor(max_workers=config.max_parallel_frames) to extract frames in parallel, waiting for all tasks to finish. - StateManager: write state atomically using StepTempFile to avoid partial/corrupt state files. 
- FFmpeg wrapper: raise an error when no frame data is returned for a requested timestamp; request pts_time from ffprobe and prefer pts_time over pkt_pts_time when available to improve timestamp accuracy. These changes improve reliability of state persistence, robustness of frame extraction, and speed up exporting multiple frames. --- .gitattributes | 6 ++++ preprocessor/config/step_configs.py | 1 + preprocessor/core/state_manager.py | 6 ++-- preprocessor/services/media/ffmpeg.py | 6 ++-- preprocessor/steps/video/frame_export_step.py | 32 ++++++++++++++----- 5 files changed, 39 insertions(+), 12 deletions(-) create mode 100644 .gitattributes diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 000000000..4e43979a9 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +* text=auto + +*.sh text eol=lf +*.py text eol=lf +Dockerfile text eol=lf +*.dockerignore text eol=lf diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 367fcb67f..ed2af2649 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -73,6 +73,7 @@ class FrameExportConfig(BaseModel): frames_per_scene: int = Field(default=1, ge=1) keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES max_parallel_episodes: int = Field(default=4, ge=1, le=8) + max_parallel_frames: int = Field(default=4, ge=1, le=16) resolution: Resolution = Field(default=Resolution.R720P) scene_change_offset_seconds: float = Field(default=0.5, ge=0) diff --git a/preprocessor/core/state_manager.py b/preprocessor/core/state_manager.py index a6c2b4360..61be585fa 100644 --- a/preprocessor/core/state_manager.py +++ b/preprocessor/core/state_manager.py @@ -14,6 +14,7 @@ Optional, ) +from preprocessor.core.temp_files import StepTempFile from preprocessor.services.ui.console import console @@ -170,8 +171,9 @@ def __save_state(self) -> None: return self.__state.last_checkpoint = datetime.now().isoformat() - with open(self.__state_file, 'w', 
encoding='utf-8') as f: - json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) + with StepTempFile(self.__state_file) as temp_path: + with open(temp_path, 'w', encoding='utf-8') as f: + json.dump(self.__state.to_dict(), f, indent=2, ensure_ascii=False) def rebuild_state(self, completed_steps: List[StepCheckpoint]) -> ProcessingState: now = datetime.now().isoformat() diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 9fea6fdc1..5aff5db1c 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -254,6 +254,8 @@ def extract_frame_at_timestamp(video_path: Path, timestamp: float) -> Image.Imag '-', ] result = subprocess.run(cmd, capture_output=True, check=True) + if not result.stdout: + raise ValueError(f'No frame data extracted at timestamp {timestamp}s from {video_path}') return Image.open(BytesIO(result.stdout)) @staticmethod @@ -262,7 +264,7 @@ def get_keyframe_timestamps(video_path: Path) -> List[float]: 'ffprobe', '-skip_frame', 'nokey', '-select_streams', 'v:0', - '-show_entries', 'frame=pkt_pts_time', + '-show_entries', 'frame=pts_time,pkt_pts_time', '-of', 'json', str(video_path), ] @@ -272,7 +274,7 @@ def get_keyframe_timestamps(video_path: Path) -> List[float]: timestamps = [] for frame in frames: - pts = frame.get('pkt_pts_time') + pts = frame.get('pts_time') or frame.get('pkt_pts_time') if pts: timestamps.append(float(pts)) diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index 207ef13b0..b354bdb9b 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -1,4 +1,5 @@ import bisect +from concurrent.futures import ThreadPoolExecutor from datetime import datetime import json from pathlib import Path @@ -181,14 +182,29 @@ def __extract_frames( req['original_timestamp'] = target_timestamp req['snapped_to_keyframe'] = True - 
self.__extract_and_save_frame( - video_file, - snapped_timestamp, - episode_dir, - episode_info, - dar, - context.series_name, - ) + seen_ms: set[int] = set() + unique_timestamps: List[float] = [] + for req in frame_requests: + ts_ms = int(req['timestamp'] * 1000) + if ts_ms not in seen_ms: + seen_ms.add(ts_ms) + unique_timestamps.append(req['timestamp']) + + with ThreadPoolExecutor(max_workers=self.config.max_parallel_frames) as executor: + futures = [ + executor.submit( + self.__extract_and_save_frame, + video_file, + timestamp, + episode_dir, + episode_info, + dar, + context.series_name, + ) + for timestamp in unique_timestamps + ] + for future in futures: + future.result() def __extract_and_save_frame( self, From 4a55ea1a17afee92fe0aae5c127ade3d1991af9b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 18 Feb 2026 14:32:32 +0100 Subject: [PATCH 52/89] Reduce default frames_per_scene to 1 Update preprocessor/series_configs/defaults.json to change the default frame_export setting frames_per_scene from 3 to 1. This reduces the number of exported frames per scene, lowering output size and speeding up downstream processing. 
--- preprocessor/series_configs/defaults.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index d69ee5677..af45df889 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -11,7 +11,7 @@ "pipeline_mode": "full", "processing": { "frame_export": { - "frames_per_scene": 3 + "frames_per_scene": 1 }, "scene_detection": { "min_scene_len": 10, From 86915b9abb1fa14bff9826c7238241bf39401198 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 19 Feb 2026 10:53:17 +0100 Subject: [PATCH 53/89] Add segment filter steps and refactor transcription Introduce a reusable SegmentFilterStep and two concrete filters (TextCleaningStep and SoundEventsStep) to remove/identify sound-event segments. Wire these steps into the pipeline and register their outputs. Refactor transcription flow: replace WhisperTranscriptionConfig with a unified TranscriptionConfig (adds mode, beam_size, temperature), add a TranscriptionEngine abstraction with cleanup(), and switch TranscriptionStep to create/use engines (WhisperEngine or ElevenLabsEngine) rather than a concrete Whisper class. WhisperEngine now accepts beam_size and temperature and passes them to the model. Improve transcription output handling: change JSON output path layout, generate/save additional formats (simple JSON, SRT, TXT) next to the produced JSON, and ensure they are created when loading from cache. Convert SrtGenerator.convert_to_srt_format to a staticmethod and fix timestamp formatting calls. MultiFormatGenerator raw output path uses episode_code instead of episode_num. Other changes: add TextStatistics.from_text and minor refactors in TextAnalysisStep to analyze in-memory text instead of files; add SegmentFilterConfig/TextCleaningConfig/SoundEventsConfig and defaults update to use TranscriptionConfig. 
Update series config example to set transcription.mode to "11labs". --- preprocessor/app/pipeline_factory.py | 59 +++++--- preprocessor/config/step_configs.py | 20 ++- preprocessor/config/step_defaults.py | 5 +- preprocessor/core/base_step.py | 7 +- preprocessor/series_configs/kiepscy.json | 3 + preprocessor/services/text/text_statistics.py | 8 +- .../services/transcription/__init__.py | 2 - .../transcription/engines/base_engine.py | 3 + .../transcription/engines/whisper_engine.py | 9 +- .../generators/multi_format_generator.py | 3 +- .../transcription/generators/srt_generator.py | 7 +- .../processors/normalized_audio_processor.py | 2 +- .../transcription/sound_classification.py | 2 +- .../services/transcription/whisper.py | 81 ---------- preprocessor/steps/text/__init__.py | 11 +- preprocessor/steps/text/analysis_step.py | 26 ++-- .../steps/text/segment_filter_step.py | 143 ++++++++++++++++++ preprocessor/steps/text/sound_events_step.py | 23 +++ preprocessor/steps/text/text_cleaning_step.py | 23 +++ preprocessor/steps/text/transcription_step.py | 96 ++++++++---- 20 files changed, 360 insertions(+), 173 deletions(-) delete mode 100644 preprocessor/services/transcription/whisper.py create mode 100644 preprocessor/steps/text/segment_filter_step.py create mode 100644 preprocessor/steps/text/sound_events_step.py create mode 100644 preprocessor/steps/text/text_cleaning_step.py diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 368b3ca9a..4ee4426b8 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -5,7 +5,6 @@ Phase, StepBuilder, ) -from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.series_config import SeriesConfig from preprocessor.config.step_configs import ( ArchiveConfig, @@ -22,13 +21,15 @@ ObjectDetectionConfig, ResolutionAnalysisConfig, SceneDetectionConfig, + SoundEventsConfig, SoundSeparationConfig, TextAnalysisConfig, + 
TextCleaningConfig, TextEmbeddingConfig, TranscodeConfig, + TranscriptionConfig, ValidationConfig, VideoEmbeddingConfig, - WhisperTranscriptionConfig, ) from preprocessor.core.output_descriptors import ( DirectoryOutput, @@ -47,6 +48,8 @@ from preprocessor.steps.search.indexing_step import ElasticsearchIndexerStep from preprocessor.steps.text.analysis_step import TextAnalysisStep from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.sound_events_step import SoundEventsStep +from preprocessor.steps.text.text_cleaning_step import TextCleaningStep from preprocessor.steps.text.transcription_step import TranscriptionStep from preprocessor.steps.validation.validator_step import ValidationStep from preprocessor.steps.video.frame_export_step import FrameExporterStep @@ -66,18 +69,7 @@ VALIDATION = Phase("VALIDATION", color="magenta") -def _get_output_path_from_descriptor(step: StepBuilder, series_name: str, descriptor_idx: int = 0) -> str: - """Get resolved output path from step's OutputDescriptor.""" - descriptors = step.get_output_descriptors() - if not descriptors or descriptor_idx >= len(descriptors): - raise ValueError(f'Step {step.id} has no descriptor at index {descriptor_idx}') - - descriptor = descriptors[descriptor_idx] - base_dir = get_base_output_dir(series_name) - return str(descriptor.resolve_path(base_dir, {'series': series_name})) - - -def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals # Pipeline factory creates 21 step objects - each step needs clear naming for readability +def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals series_config = SeriesConfig.load(series_name) # ========================================================= @@ -218,20 +210,47 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t description=f"Audio transcription using {series_config.processing.transcription.mode}", produces=[ 
JsonFileOutput( - pattern="{season}/{episode}.json", + pattern="{season}/{episode}/{episode}.json", min_size_bytes=50, ), ], needs=[transcoded_videos], - config=WhisperTranscriptionConfig( + config=TranscriptionConfig( + mode=series_config.processing.transcription.mode, model=series_config.processing.transcription.model, language=series_config.processing.transcription.language, device=series_config.processing.transcription.device, - beam_size=10, - temperature=0.0, ), ) + text_cleaning = StepBuilder( + phase=PROCESSING, + step_class=TextCleaningStep, + description="Removes sound events from transcription segments", + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ], + needs=[transcription_data], + config=TextCleaningConfig(), + ) + + sound_events = StepBuilder( + phase=PROCESSING, + step_class=SoundEventsStep, + description="Extracts sound event segments from transcription", + produces=[ + JsonFileOutput( + pattern="{season}/{episode}.json", + min_size_bytes=10, + ), + ], + needs=[transcription_data], + config=SoundEventsConfig(), + ) + separated_audio = StepBuilder( phase=PROCESSING, step_class=SoundSeparationStep, @@ -258,7 +277,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t min_size_bytes=50, ), ], - needs=[transcription_data], + needs=[text_cleaning], config=TextAnalysisConfig(language=series_config.processing.transcription.language), ) @@ -459,6 +478,8 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(exported_frames) pipeline.register(transcription_data) + pipeline.register(text_cleaning) + pipeline.register(sound_events) pipeline.register(separated_audio) pipeline.register(text_stats) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index ed2af2649..fc3a635ca 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -79,17 +79,11 @@ class 
FrameExportConfig(BaseModel): class TranscriptionConfig(BaseModel): - language: str = 'pl' - max_parallel_episodes: int = Field(default=2, ge=1, le=4) - model: str = 'large-v3' - output_formats: List[str] = ['json', 'srt', 'txt'] - - -class WhisperTranscriptionConfig(BaseModel): beam_size: int = Field(default=10, ge=1) device: str = 'cuda' language: str = 'pl' max_parallel_episodes: int = Field(default=2, ge=1, le=4) + mode: str = 'whisper' model: str = 'large-v3-turbo' temperature: float = Field(default=0.0, ge=0.0, le=1.0) @@ -99,6 +93,18 @@ class TextAnalysisConfig(BaseModel): max_parallel_episodes: int = Field(default=8, ge=1, le=16) +class SegmentFilterConfig(BaseModel): + max_parallel_episodes: int = Field(default=8, ge=1, le=16) + + +class TextCleaningConfig(SegmentFilterConfig): + pass + + +class SoundEventsConfig(SegmentFilterConfig): + pass + + class TextEmbeddingConfig(BaseModel): batch_size: int = Field(default=8, ge=1) device: str = 'cuda' diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index ce985e79d..36000a766 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -15,8 +15,8 @@ TextAnalysisConfig, TextEmbeddingConfig, TranscodeConfig, + TranscriptionConfig, VideoEmbeddingConfig, - WhisperTranscriptionConfig, ) from preprocessor.services.media.resolution import Resolution @@ -32,7 +32,8 @@ def get_configs(series_name: str) -> Dict[str, object]: min_bitrate_mbps=2.0, bitrate_boost_ratio=1.1, ), - 'transcribe': WhisperTranscriptionConfig( + 'transcribe': TranscriptionConfig( + mode='whisper', model='large-v3-turbo', language='pl', device='cuda', diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 13fb80326..9bcfacf87 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -251,9 +251,10 @@ def _check_cache_validity( cache_description: str, ) -> bool: if output_path.exists() and not context.force_rerun: - if 
context.is_step_completed(self.name, episode_id): - context.logger.info(f'Skipping {episode_id} ({cache_description})') - return True + if not context.is_step_completed(self.name, episode_id): + context.mark_step_completed(self.name, episode_id) + context.logger.info(f'Skipping {episode_id} ({cache_description})') + return True return False def __validate_all_descriptors( diff --git a/preprocessor/series_configs/kiepscy.json b/preprocessor/series_configs/kiepscy.json index 8e002c480..5b2921ccf 100644 --- a/preprocessor/series_configs/kiepscy.json +++ b/preprocessor/series_configs/kiepscy.json @@ -9,6 +9,9 @@ "processing": { "transcode": { "force_deinterlace": false + }, + "transcription": { + "mode": "11labs" } }, "scraping": { diff --git a/preprocessor/services/text/text_statistics.py b/preprocessor/services/text/text_statistics.py index f8605e768..ba9cdb7eb 100644 --- a/preprocessor/services/text/text_statistics.py +++ b/preprocessor/services/text/text_statistics.py @@ -54,6 +54,12 @@ def from_file(cls, file_path: Path, language: str = 'pl') -> 'TextStatistics': stats.__process_calculations() return stats + @classmethod + def from_text(cls, text: str, language: str = 'pl') -> 'TextStatistics': + stats = cls(text=text, language=language) + stats.__process_calculations() + return stats + def to_dict(self) -> Dict[str, Any]: return { 'basic_statistics': { @@ -85,7 +91,7 @@ def to_dict(self) -> Dict[str, Any]: 'trigrams': self.trigrams, } - def __process_calculations(self) -> None: # pylint: disable=unused-private-member # Called in from_file (line 54) - false positive + def __process_calculations(self) -> None: # pylint: disable=unused-private-member # Called in from_file and from_text via name mangling - false positive self.__calculate_structural_stats() self.__calculate_character_distribution() self.__calculate_lexical_stats() diff --git a/preprocessor/services/transcription/__init__.py b/preprocessor/services/transcription/__init__.py index 6af73ca06..21b0091d6 
100644 --- a/preprocessor/services/transcription/__init__.py +++ b/preprocessor/services/transcription/__init__.py @@ -10,7 +10,6 @@ TranscriptionUtils, WhisperUtils, ) -from preprocessor.services.transcription.whisper import Whisper __all__ = [ 'JsonGenerator', @@ -21,5 +20,4 @@ 'is_sound_event', 'TranscriptionUtils', 'WhisperUtils', - 'Whisper', ] diff --git a/preprocessor/services/transcription/engines/base_engine.py b/preprocessor/services/transcription/engines/base_engine.py index 4078a3368..0ac724ca6 100644 --- a/preprocessor/services/transcription/engines/base_engine.py +++ b/preprocessor/services/transcription/engines/base_engine.py @@ -10,6 +10,9 @@ class TranscriptionEngine(ABC): + def cleanup(self) -> None: + pass + @abstractmethod def get_name(self) -> str: pass diff --git a/preprocessor/services/transcription/engines/whisper_engine.py b/preprocessor/services/transcription/engines/whisper_engine.py index 01647bd7c..385c813c9 100644 --- a/preprocessor/services/transcription/engines/whisper_engine.py +++ b/preprocessor/services/transcription/engines/whisper_engine.py @@ -10,7 +10,7 @@ import torch from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine -from preprocessor.services.transcription.whisper import WhisperUtils +from preprocessor.services.transcription.utils import WhisperUtils from preprocessor.services.ui.console import console @@ -20,10 +20,14 @@ def __init__( model_name: str = 'large-v3-turbo', language: str = 'Polish', device: str = 'cuda', + beam_size: int = 10, + temperature: float = 0.0, ) -> None: self.__model_name = model_name self.__language = language self.__device = device + self.__beam_size = beam_size + self.__temperature = temperature if device != 'cuda': raise ValueError(f'Whisper acceleration requires CUDA, got: {device}') @@ -57,9 +61,10 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: segments, info = self.__model.transcribe( str(audio_path), language=language_code, - beam_size=10, 
+ beam_size=self.__beam_size, word_timestamps=True, condition_on_previous_text=False, + temperature=self.__temperature, ) result = WhisperUtils.build_transcription_result(segments, language=info.language) diff --git a/preprocessor/services/transcription/generators/multi_format_generator.py b/preprocessor/services/transcription/generators/multi_format_generator.py index 657824e64..48d3bbf9f 100644 --- a/preprocessor/services/transcription/generators/multi_format_generator.py +++ b/preprocessor/services/transcription/generators/multi_format_generator.py @@ -59,7 +59,6 @@ def __generate_all_formats(self, transcription: Dict[str, Any], episode_info: An metadata = EpisodeManager.get_metadata(episode_info) full_data = {'episode_info': metadata, **transcription} - # Generowanie formatów self.__save_json(full_data, episode_info, base_dir, 'full') self.__save_json(transcription, episode_info, base_dir, 'segmented') self.__save_json(transcription, episode_info, base_dir, 'simple') @@ -107,7 +106,7 @@ def __get_raw_output_dir(self, ep_info: Any) -> Path: get_base_output_dir(self.__series_name) / settings.output_subdirs.transcriptions / ep_info.season_code() / - ep_info.episode_num() / 'raw' + ep_info.episode_code() / 'raw' ) def __load_json(self, path: Path) -> Optional[Dict[str, Any]]: diff --git a/preprocessor/services/transcription/generators/srt_generator.py b/preprocessor/services/transcription/generators/srt_generator.py index cbd4ab23e..8eaaa1f23 100644 --- a/preprocessor/services/transcription/generators/srt_generator.py +++ b/preprocessor/services/transcription/generators/srt_generator.py @@ -9,7 +9,8 @@ class SrtGenerator(BaseTranscriptionGenerator): - def convert_to_srt_format(self, data: Dict[str, Any]) -> str: + @staticmethod + def convert_to_srt_format(data: Dict[str, Any]) -> str: segments = data.get('segments', []) srt_lines = [] index = 1 @@ -19,8 +20,8 @@ def convert_to_srt_format(self, data: Dict[str, Any]) -> str: if not text: continue - start_time = 
self.__format_timestamp(seg.get('start', 0.0)) - end_time = self.__format_timestamp(seg.get('end', 0.0)) + start_time = SrtGenerator.__format_timestamp(seg.get('start', 0.0)) + end_time = SrtGenerator.__format_timestamp(seg.get('end', 0.0)) srt_lines.extend([str(index), f'{start_time} --> {end_time}', text, '']) index += 1 diff --git a/preprocessor/services/transcription/processors/normalized_audio_processor.py b/preprocessor/services/transcription/processors/normalized_audio_processor.py index 813cc8c50..36b0e6094 100644 --- a/preprocessor/services/transcription/processors/normalized_audio_processor.py +++ b/preprocessor/services/transcription/processors/normalized_audio_processor.py @@ -11,7 +11,7 @@ import torch from preprocessor.services.core.logging import ErrorHandlingLogger -from preprocessor.services.transcription.whisper import WhisperUtils +from preprocessor.services.transcription.utils import WhisperUtils class NormalizedAudioProcessor: diff --git a/preprocessor/services/transcription/sound_classification.py b/preprocessor/services/transcription/sound_classification.py index 525d76f1d..9a85e8cc1 100644 --- a/preprocessor/services/transcription/sound_classification.py +++ b/preprocessor/services/transcription/sound_classification.py @@ -14,7 +14,7 @@ def is_sound_event(word: Dict[str, Any]) -> bool: if word.get(WordKeys.TYPE) == WordTypeValues.AUDIO_EVENT: return True - text = word.get(WordKeys.TEXT, '').strip() + text = word.get(WordKeys.TEXT, word.get(WordKeys.WORD, '')).strip() return bool(re.match(r'^\(.*\)$', text)) diff --git a/preprocessor/services/transcription/whisper.py b/preprocessor/services/transcription/whisper.py deleted file mode 100644 index 3dfabcf1a..000000000 --- a/preprocessor/services/transcription/whisper.py +++ /dev/null @@ -1,81 +0,0 @@ -from pathlib import Path -from typing import ( - Any, - Dict, - Optional, -) - -from faster_whisper import WhisperModel -import torch - -from preprocessor.services.transcription.utils import 
WhisperUtils -from preprocessor.services.ui.console import console - - -class Whisper: - def __init__( - self, - model: str = 'large-v3-turbo', - language: str = 'pl', - device: str = 'cuda', - beam_size: int = 10, - temperature: float = 0.0, - ) -> None: - self.__model_name = model - self.__language = language - self.__device = device - self.__beam_size = beam_size - self.__temperature = temperature - self.__model: Optional[WhisperModel] = None - - def cleanup(self) -> None: - console.print('[cyan]Unloading Whisper model and clearing GPU memory...[/cyan]') - if self.__model is not None: - del self.__model - self.__model = None - - if torch.cuda.is_available(): - torch.cuda.empty_cache() - console.print('[green]Whisper model unloaded, GPU memory cleared[/green]') - - def transcribe(self, audio_path: Path) -> Dict[str, Any]: - console.print(f'[cyan]Transcribing with Whisper: {audio_path.name}[/cyan]') - - if not audio_path.exists(): - raise FileNotFoundError(f'Audio file not found: {audio_path}') - - model = self.__get_or_load_model() - language_code = WhisperUtils.get_language_code(self.__language) - - segments, info = model.transcribe( - str(audio_path), - language=language_code, - beam_size=self.__beam_size, - word_timestamps=True, - condition_on_previous_text=False, - temperature=self.__temperature, - ) - - result = WhisperUtils.build_transcription_result(segments, language=info.language) - console.print(f'[green]Transcription completed: {audio_path.name}[/green]') - return result - - def __get_or_load_model(self) -> WhisperModel: - if self.__model is not None: - return self.__model - - if self.__device != 'cuda': - raise ValueError(f'Only GPU (cuda) is supported, got device={self.__device}') - - compute_type = 'float16' - console.print( - f'[cyan]Loading Whisper: {self.__model_name} on {self.__device} ({compute_type})[/cyan]', - ) - - self.__model = WhisperModel( - self.__model_name, - device=self.__device, - compute_type=compute_type, - ) - 
console.print('[green]Whisper model loaded[/green]') - return self.__model diff --git a/preprocessor/steps/text/__init__.py b/preprocessor/steps/text/__init__.py index 96fee3bb3..ba7c25956 100644 --- a/preprocessor/steps/text/__init__.py +++ b/preprocessor/steps/text/__init__.py @@ -1,6 +1,15 @@ from preprocessor.services.text.import_step import TranscriptionImportStep from preprocessor.steps.text.analysis_step import TextAnalysisStep from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.sound_events_step import SoundEventsStep +from preprocessor.steps.text.text_cleaning_step import TextCleaningStep from preprocessor.steps.text.transcription_step import TranscriptionStep -__all__ = ['TextAnalysisStep', 'TextEmbeddingStep', 'TranscriptionImportStep', 'TranscriptionStep'] +__all__ = [ + 'SoundEventsStep', + 'TextAnalysisStep', + 'TextCleaningStep', + 'TextEmbeddingStep', + 'TranscriptionImportStep', + 'TranscriptionStep', +] diff --git a/preprocessor/steps/text/analysis_step.py b/preprocessor/steps/text/analysis_step.py index 7dfc9c37c..f3c17054f 100644 --- a/preprocessor/steps/text/analysis_step.py +++ b/preprocessor/steps/text/analysis_step.py @@ -38,9 +38,9 @@ def _process( ) -> TextAnalysisResults: output_path = self._get_cache_path(input_data, context) - txt_path = self.__resolve_text_file_path(input_data) - stats = self.__analyze_text_statistics(txt_path) - result_data = self.__build_result_payload(stats, txt_path, input_data) + text = self.__extract_transcription_text(input_data) + stats = self.__analyze_text_statistics(text) + result_data = self.__build_result_payload(stats, input_data) FileOperations.atomic_write_json(output_path, result_data) @@ -66,35 +66,29 @@ def _load_from_cache( stats_data = FileOperations.load_json(cache_path) return self.__construct_analysis_results(input_data, cache_path, stats_data) - def __analyze_text_statistics(self, txt_path: Path) -> TextStatistics: - return 
TextStatistics.from_file(txt_path, language=self.config.language) + def __analyze_text_statistics(self, text: str) -> TextStatistics: + return TextStatistics.from_text(text, language=self.config.language) def __build_result_payload( self, stats: TextStatistics, - txt_path: Path, input_data: TranscriptionData, ) -> Dict[str, Any]: return { 'metadata': { 'episode_id': input_data.episode_id, 'language': self.config.language, - 'source_file': txt_path.name, + 'source_file': input_data.path.name, 'analyzed_at': datetime.now().isoformat(), }, **stats.to_dict(), } @staticmethod - def __resolve_text_file_path(input_data: TranscriptionData) -> Path: - txt_path = input_data.path - if input_data.format != 'txt': - txt_path = input_data.path.with_suffix('.txt') - - if not txt_path.exists(): - raise FileNotFoundError(f'Transcription text file not found: {txt_path}') - - return txt_path + def __extract_transcription_text(input_data: TranscriptionData) -> str: + data = FileOperations.load_json(input_data.path) + segments = data.get('segments', []) + return ' '.join(seg.get('text', '').strip() for seg in segments if seg.get('text')) @staticmethod def __construct_analysis_results( diff --git a/preprocessor/steps/text/segment_filter_step.py b/preprocessor/steps/text/segment_filter_step.py new file mode 100644 index 000000000..fe3a11cf6 --- /dev/null +++ b/preprocessor/steps/text/segment_filter_step.py @@ -0,0 +1,143 @@ +from abc import abstractmethod +from pathlib import Path +import re +from typing import ( + Any, + Dict, + Generic, + List, + Optional, + Tuple, + TypeVar, +) + +from preprocessor.config.step_configs import SegmentFilterConfig +from preprocessor.config.types import ( + WordKeys, + WordTypeValues, +) +from preprocessor.core.artifacts import TranscriptionData +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import JsonFileOutput +from preprocessor.services.io.files 
import FileOperations +from preprocessor.services.transcription.sound_classification import ( + classify_segment, + is_sound_event, +) + +_ConfigT = TypeVar('_ConfigT', bound=SegmentFilterConfig) + +_SOUND_EVENT_PATTERN = re.compile(r'^\s*\(.*\)\s*$') + + +class SegmentFilterStep( + PipelineStep[TranscriptionData, TranscriptionData, _ConfigT], + Generic[_ConfigT], +): + @property + @abstractmethod + def _output_format(self) -> str: + pass + + @abstractmethod + def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]: + pass + + @property + def supports_batch_processing(self) -> bool: + return True + + def execute_batch( + self, input_data: List[TranscriptionData], context: ExecutionContext, + ) -> List[TranscriptionData]: + return self._execute_with_threadpool( + input_data, context, self.config.max_parallel_episodes, self.execute, + ) + + def _process( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> TranscriptionData: + output_path = self._get_cache_path(input_data, context) + data = FileOperations.load_json(input_data.path) + filtered = self.__apply_filter(data) + FileOperations.atomic_write_json(output_path, filtered) + return self.__build_artifact(input_data, output_path) + + def get_output_descriptors(self) -> List[JsonFileOutput]: + return [ + JsonFileOutput( + pattern="{season}/{episode}.json", + subdir="", + min_size_bytes=10, + ), + ] + + def _get_cache_path( + self, input_data: TranscriptionData, context: ExecutionContext, + ) -> Path: + return self._get_standard_cache_path(input_data, context) + + def _load_from_cache( + self, + cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> TranscriptionData: + return self.__build_artifact(input_data, cache_path) + + @staticmethod + def _classify(segment: Dict[str, Any]) -> str: + words = segment.get(WordKeys.WORDS, []) + if not words: + text = segment.get('text', '').strip() + return 'sound_event' if _SOUND_EVENT_PATTERN.match(text) 
else 'dialogue' + return classify_segment(segment) + + @staticmethod + def _split_mixed( + segment: Dict[str, Any], + ) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]: + words = segment.get(WordKeys.WORDS, []) + + dialogue_words = [ + w for w in words + if not is_sound_event(w) and w.get(WordKeys.TYPE) not in (WordTypeValues.SPACING, '') + ] + sound_words = [w for w in words if is_sound_event(w)] + + dialogue_part = SegmentFilterStep.__make_sub_segment(segment, dialogue_words) if dialogue_words else None + sound_part = SegmentFilterStep.__make_sub_segment(segment, sound_words) if sound_words else None + + return dialogue_part, sound_part + + @staticmethod + def __make_sub_segment( + segment: Dict[str, Any], + words: List[Dict[str, Any]], + ) -> Dict[str, Any]: + text = ' '.join(w.get(WordKeys.TEXT, w.get(WordKeys.WORD, '')) for w in words).strip() + return { + **segment, + 'start': words[0].get(WordKeys.START, segment.get('start')), + 'end': words[-1].get(WordKeys.END, segment.get('end')), + 'text': text, + WordKeys.WORDS: words, + } + + def __apply_filter(self, data: Dict[str, Any]) -> Dict[str, Any]: + segments: List[Dict[str, Any]] = data.get('segments', []) + result: List[Dict[str, Any]] = [] + for seg in segments: + result.extend(self._process_segment(seg)) + return {**data, 'segments': result} + + def __build_artifact(self, input_data: TranscriptionData, path: Path) -> TranscriptionData: + return TranscriptionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=path, + language=input_data.language, + model=input_data.model, + format=self._output_format, + ) diff --git a/preprocessor/steps/text/sound_events_step.py b/preprocessor/steps/text/sound_events_step.py new file mode 100644 index 000000000..c47d80fa0 --- /dev/null +++ b/preprocessor/steps/text/sound_events_step.py @@ -0,0 +1,23 @@ +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.config.step_configs import SoundEventsConfig +from 
preprocessor.steps.text.segment_filter_step import SegmentFilterStep + + +class SoundEventsStep(SegmentFilterStep[SoundEventsConfig]): + @property + def _output_format(self) -> str: + return 'sound_events' + + def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]: + kind = self._classify(segment) + if kind == 'sound_event': + return [segment] + if kind == 'dialogue': + return [] + _, sound_part = self._split_mixed(segment) + return [sound_part] if sound_part else [] diff --git a/preprocessor/steps/text/text_cleaning_step.py b/preprocessor/steps/text/text_cleaning_step.py new file mode 100644 index 000000000..50c6f950c --- /dev/null +++ b/preprocessor/steps/text/text_cleaning_step.py @@ -0,0 +1,23 @@ +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.config.step_configs import TextCleaningConfig +from preprocessor.steps.text.segment_filter_step import SegmentFilterStep + + +class TextCleaningStep(SegmentFilterStep[TextCleaningConfig]): + @property + def _output_format(self) -> str: + return 'clean' + + def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]: + kind = self._classify(segment) + if kind == 'dialogue': + return [segment] + if kind == 'sound_event': + return [] + dialogue_part, _ = self._split_mixed(segment) + return [dialogue_part] if dialogue_part else [] diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index c90d62f09..e3f7bde43 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import ( Any, @@ -6,7 +7,7 @@ Optional, ) -from preprocessor.config.step_configs import WhisperTranscriptionConfig +from preprocessor.config.step_configs import TranscriptionConfig from preprocessor.core.artifacts import ( TranscodedVideo, TranscriptionData, @@ -16,30 +17,34 @@ from preprocessor.core.output_descriptors import 
JsonFileOutput from preprocessor.services.episodes.episode_manager import EpisodeManager from preprocessor.services.io.files import FileOperations -from preprocessor.services.transcription.whisper import Whisper +from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine +from preprocessor.services.transcription.engines.elevenlabs_engine import ElevenLabsEngine +from preprocessor.services.transcription.engines.whisper_engine import WhisperEngine +from preprocessor.services.transcription.generators.json_generator import JsonGenerator +from preprocessor.services.transcription.generators.srt_generator import SrtGenerator +from preprocessor.services.transcription.generators.txt_generator import TxtGenerator class TranscriptionStep( - PipelineStep[TranscodedVideo, TranscriptionData, WhisperTranscriptionConfig], + PipelineStep[TranscodedVideo, TranscriptionData, TranscriptionConfig], ): - def __init__(self, config: WhisperTranscriptionConfig) -> None: + def __init__(self, config: TranscriptionConfig) -> None: super().__init__(config) - self.__whisper: Optional[Whisper] = None + self.__engine: Optional[TranscriptionEngine] = None @property def supports_batch_processing(self) -> bool: return True def setup_resources(self, context: ExecutionContext) -> None: - if self.__whisper is None: - self.__load_whisper(context) + if self.__engine is None: + self.__engine = self.__create_engine(context) def teardown_resources(self, context: ExecutionContext) -> None: - if self.__whisper: - self.__unload_whisper(context) - - def cleanup(self) -> None: - self.__unload_whisper() + if self.__engine: + self.__engine.cleanup() + self.__engine = None + context.logger.info('Transcription engine unloaded') def execute_batch( self, input_data: List[TranscodedVideo], context: ExecutionContext, @@ -53,18 +58,19 @@ def _process( ) -> TranscriptionData: output_path = self._get_cache_path(input_data, context) - if self.__whisper is None: - self.__load_whisper(context) + 
if self.__engine is None: + self.__engine = self.__create_engine(context) result = self.__transcribe_and_save(input_data, output_path, context) + self.__save_additional_formats(output_path, result) return self.__construct_result_artifact(output_path, input_data, result) def get_output_descriptors(self) -> List[JsonFileOutput]: return [ JsonFileOutput( - pattern="{season}/{episode}.json", - subdir="transcriptions", + pattern="{season}/{episode}/{episode}.json", + subdir="", min_size_bytes=50, ), ] @@ -80,6 +86,7 @@ def _load_from_cache( input_data: TranscodedVideo, context: ExecutionContext, ) -> TranscriptionData: + self.__ensure_additional_formats(cache_path) return TranscriptionData( episode_id=input_data.episode_id, episode_info=input_data.episode_info, @@ -89,24 +96,20 @@ def _load_from_cache( format='json', ) - def __load_whisper(self, context: Optional[ExecutionContext] = None) -> None: - if context: - context.logger.info(f'Loading Whisper model: {self.config.model}') + def __create_engine(self, context: ExecutionContext) -> TranscriptionEngine: + if self.config.mode == '11labs': + context.logger.info('Creating ElevenLabs transcription engine') + return ElevenLabsEngine(logger=context.logger) - self.__whisper = Whisper( - model=self.config.model, + context.logger.info(f'Loading Whisper model: {self.config.model}') + return WhisperEngine( + model_name=self.config.model, language=self.config.language, device=self.config.device, beam_size=self.config.beam_size, + temperature=self.config.temperature, ) - def __unload_whisper(self, context: Optional[ExecutionContext] = None) -> None: - if self.__whisper: - self.__whisper.cleanup() - self.__whisper = None - if context: - context.logger.info('Whisper model unloaded') - def __transcribe_and_save( self, input_data: TranscodedVideo, @@ -114,10 +117,10 @@ def __transcribe_and_save( context: ExecutionContext, ) -> Dict[str, Any]: try: - if self.__whisper is None: - raise RuntimeError("Whisper model not initialized") + if 
self.__engine is None: + raise RuntimeError('Transcription engine not initialized') - result: Dict[str, Any] = self.__whisper.transcribe(input_data.path) + result: Dict[str, Any] = self.__engine.transcribe(input_data.path) result['episode_info'] = EpisodeManager.get_metadata( input_data.episode_info, ) @@ -125,12 +128,41 @@ def __transcribe_and_save( return result except Exception as e: context.logger.error( - f'Whisper transcription failed for {input_data.episode_id}: {e}', + f'Transcription failed for {input_data.episode_id}: {e}', ) if output_path.exists(): output_path.unlink() raise + @staticmethod + def __save_additional_formats(output_path: Path, data: Dict[str, Any]) -> None: + stem = output_path.stem + parent = output_path.parent + + simple = JsonGenerator.convert_to_simple_format(data) + (parent / f'{stem}_simple.json').write_text( + json.dumps(simple, indent=2, ensure_ascii=False), encoding='utf-8', + ) + (parent / f'{stem}.srt').write_text( + SrtGenerator.convert_to_srt_format(data), encoding='utf-8', + ) + (parent / f'{stem}.txt').write_text( + TxtGenerator.convert_to_txt_format(data), encoding='utf-8', + ) + + @staticmethod + def __ensure_additional_formats(cache_path: Path) -> None: + stem = cache_path.stem + parent = cache_path.parent + missing = any( + not (parent / name).exists() + for name in (f'{stem}_simple.json', f'{stem}.srt', f'{stem}.txt') + ) + if not missing: + return + data = FileOperations.load_json(cache_path) + TranscriptionStep.__save_additional_formats(cache_path, data) + def __construct_result_artifact( self, output_path: Path, From 32bac7359de184c825faeb4075bf60318bd3514b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 19 Feb 2026 11:49:49 +0100 Subject: [PATCH 54/89] Add image hasher device & hex hashes Add device support and improve image hashing output and caching behavior. 
Changes include: add ImageHashConfig.device (default 'cuda'); PerceptualHasher now encodes perceptual hashes as compact hexadecimal strings instead of long binary strings; ImageHashStep passes configured device to the hasher and records the actual device in hash_settings; add PipelineStep.load_all_from_cache helper to load multiple items from cache; include frame_number (ms) in FrameExporterStep requests for downstream use. These changes enable GPU usage for hashing, produce shorter hash representations, and improve cache/batch handling and metadata completeness. --- preprocessor/config/step_configs.py | 1 + preprocessor/core/base_step.py | 9 +++++++++ preprocessor/services/video/image_hasher.py | 5 +++-- preprocessor/steps/video/frame_export_step.py | 1 + preprocessor/steps/vision/image_hashing_step.py | 7 ++++--- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index fc3a635ca..1af70621c 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -132,6 +132,7 @@ class DocumentGenerationConfig(BaseModel): class ImageHashConfig(BaseModel): batch_size: int = Field(default=32, ge=1) + device: str = 'cuda' max_parallel_episodes: int = Field(default=2, ge=1, le=4) diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 9bcfacf87..a9872a2f0 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -70,6 +70,15 @@ def execute_batch( ) -> List[OutputT]: return [self.execute(item, context) for item in input_data] + def load_all_from_cache( + self, input_list: List[InputT], context: ExecutionContext, + ) -> List[OutputT]: + results = [] + for inp in input_list: + result = self._load_from_cache(self._get_cache_path(inp, context), inp, context) + results.append(result if result else inp) + return results + def should_skip_execution( self, episode_id: str, diff --git 
a/preprocessor/services/video/image_hasher.py b/preprocessor/services/video/image_hasher.py index b7dab14f3..b35789b0f 100644 --- a/preprocessor/services/video/image_hasher.py +++ b/preprocessor/services/video/image_hasher.py @@ -61,8 +61,9 @@ def compute_phash_batch(self, images: List[Image.Image]) -> List[str]: for feature_vec in features: hash_bits = (feature_vec > feature_vec.median()).int() - hash_str = ''.join([str(bit.item()) for bit in hash_bits[:self.__hash_size * self.__hash_size]]) - hashes.append(hash_str) + n_bits = self.__hash_size * self.__hash_size + bits_str = ''.join(str(b.item()) for b in hash_bits[:n_bits]) + hashes.append(format(int(bits_str, 2), f'0{n_bits // 4}x')) return hashes diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index b354bdb9b..d5ec2f00f 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -181,6 +181,7 @@ def __extract_frames( req['timestamp'] = snapped_timestamp req['original_timestamp'] = target_timestamp req['snapped_to_keyframe'] = True + req['frame_number'] = int(snapped_timestamp * 1000) seen_ms: set[int] = set() unique_timestamps: List[float] = [] diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index aa1bfffec..d62cfdc6b 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -63,7 +63,7 @@ def _process( hash_results = self.__compute_hashes(frame_requests, input_data) self.__save_hash_results( - hash_results, output_path, input_data, context, frame_metadata, + hash_results, output_path, input_data, context, frame_metadata, self.config.device, ) self.__cleanup_memory() @@ -107,7 +107,7 @@ def _load_from_cache( def __prepare_hasher(self, context: ExecutionContext) -> None: if self.__hasher is None: context.logger.info(f'Loading image hasher on {self.config.device}...') - 
self.__hasher = PerceptualHasher() + self.__hasher = PerceptualHasher(device=self.config.device) def __compute_hashes( self, @@ -174,13 +174,14 @@ def __save_hash_results( input_data: FrameCollection, context: ExecutionContext, frame_metadata: Dict[str, Any], + device: str, ) -> None: output_data: Dict[str, Any] = { 'episode_id': input_data.episode_id, 'series_name': context.series_name, 'generated_at': frame_metadata.get('generated_at'), 'hash_settings': { - 'device': 'cpu', + 'device': device, 'batch_size': len(hash_results) // 10 if hash_results else 1, }, 'hashes': hash_results, From d72f841e3fa003c4df0d80340aa7aa4d03f7128f Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 19 Feb 2026 12:02:01 +0100 Subject: [PATCH 55/89] Skip completed episode steps and load cache Add a check to avoid re-running episode-level steps when all episodes are already marked completed. Introduces a helper __all_episodes_completed that respects force_rerun, step caching, and the presence of a state_manager, and queries state_manager.is_step_completed for each artifact. Use this helper in __run_episode_step_sequential to early-log and return step.load_all_from_cache, avoiding unnecessary per-episode execution and worker setup. 
--- preprocessor/app/pipeline_builder.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index ced436c35..09c0a47f1 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -193,9 +193,27 @@ def __should_use_batch_processing(self, step: PipelineStep) -> bool: return True + def __all_episodes_completed( + self, step: PipelineStep, artifacts: List[Any], + ) -> bool: + if self.__context.force_rerun or not step.uses_caching: + return False + if self.__context.state_manager is None: + return False + return all( + self.__context.state_manager.is_step_completed(step.name, art.episode_id) + for art in artifacts + ) + def __run_episode_step_sequential( self, step: PipelineStep, current_artifacts: List[Any], ) -> List[Any]: + if self.__all_episodes_completed(step, current_artifacts): + self.__context.logger.info( + f'Step {step.name}: all {len(current_artifacts)} episodes already completed', + ) + return step.load_all_from_cache(current_artifacts, self.__context) + next_artifacts = [] for artifact in current_artifacts: @@ -224,6 +242,12 @@ def __run_episode_step_batch( if not current_artifacts: return [] + if self.__all_episodes_completed(step, current_artifacts): + self.__context.logger.info( + f'Step {step.name}: all {len(current_artifacts)} episodes already completed', + ) + return step.load_all_from_cache(current_artifacts, self.__context) + workers = ( step.config.max_parallel_episodes if hasattr(step.config, 'max_parallel_episodes') From 3753a3f9d445ef3faa75a9a684ec559214f4fb6a Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sat, 21 Feb 2026 17:28:16 +0100 Subject: [PATCH 56/89] Add char ref processor and infra updates Introduce a CharacterReferenceProcessor pipeline step and config to process reference images into face vectors and metadata, and register it in the pipeline. 
Refactor caching and execution flow: PipelineExecutor now uses PipelineStep.all_outputs_exist and avoids marking cached steps in-progress/completed; PipelineStep gained all_outputs_exist. Frame exporting now computes FPS, snaps & deduplicates keyframes, uses stable frame_number filenames, and records fps in metadata. Embedding service and wrapper refactored for batching, parameterized model/device, memory cleanup, and proper batched embedding extraction. Object detection step implemented D-FINE model support with GPU loading, batching, postprocessing, result saving and statistics; added related config fields. Misc: tighten defaults for detection/frames, normalize character names in grid visualizer, enrich character reference metadata, minor fixes for hashing and emotion detection field names. --- preprocessor/app/pipeline_builder.py | 27 +-- preprocessor/app/pipeline_factory.py | 28 ++- preprocessor/config/step_configs.py | 15 +- preprocessor/core/base_step.py | 10 ++ preprocessor/services/media/ffmpeg.py | 1 + .../services/scraping/grid_visualizer.py | 8 +- .../services/scraping/reference_processor.py | 43 +++-- .../search/clients/embedding_service.py | 100 ++++++++--- .../services/search/embedding_model.py | 25 +-- preprocessor/steps/video/frame_export_step.py | 109 +++++++----- .../steps/vision/character_detection_step.py | 13 +- .../character_reference_processor_step.py | 92 ++++++++++ .../steps/vision/emotion_detection_step.py | 2 +- .../steps/vision/image_hashing_step.py | 5 + .../steps/vision/object_detection_step.py | 163 ++++++++++++++++-- 15 files changed, 500 insertions(+), 141 deletions(-) create mode 100644 preprocessor/steps/vision/character_reference_processor_step.py diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 09c0a47f1..a02a032e9 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -198,12 +198,13 @@ def __all_episodes_completed( ) -> bool: if 
self.__context.force_rerun or not step.uses_caching: return False - if self.__context.state_manager is None: - return False - return all( - self.__context.state_manager.is_step_completed(step.name, art.episode_id) - for art in artifacts - ) + if self.__context.state_manager is not None: + if all( + self.__context.state_manager.is_step_completed(step.name, art.episode_id) + for art in artifacts + ): + return True + return step.all_outputs_exist(artifacts, self.__context) def __run_episode_step_sequential( self, step: PipelineStep, current_artifacts: List[Any], @@ -220,9 +221,11 @@ def __run_episode_step_sequential( episode_id = artifact.episode_id try: - self.__mark_step_in_progress(step.name, episode_id) + if not step.uses_caching: + self.__mark_step_in_progress(step.name, episode_id) result = step.execute(artifact, self.__context) - self.__mark_step_completed(step.name, episode_id) + if not step.uses_caching: + self.__mark_step_completed(step.name, episode_id) if result: next_artifacts.append(result) @@ -261,14 +264,16 @@ def __run_episode_step_batch( if hasattr(step, 'setup_resources'): step.setup_resources(self.__context) - for artifact in current_artifacts: - self.__mark_step_in_progress(step.name, artifact.episode_id) + if not step.uses_caching: + for artifact in current_artifacts: + self.__mark_step_in_progress(step.name, artifact.episode_id) results = step.execute_batch(current_artifacts, self.__context) next_artifacts = [] for artifact, result in zip(current_artifacts, results): - self.__mark_step_completed(step.name, artifact.episode_id) + if not step.uses_caching: + self.__mark_step_completed(step.name, artifact.episode_id) next_artifacts.append(result or artifact) return next_artifacts diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 4ee4426b8..95d50f1f8 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -10,6 +10,7 @@ ArchiveConfig, CharacterDetectionConfig, 
CharacterReferenceConfig, + CharacterReferenceProcessorConfig, CharacterScraperConfig, DocumentGenerationConfig, ElasticsearchConfig, @@ -56,6 +57,7 @@ from preprocessor.steps.video.scene_detection_step import SceneDetectorStep from preprocessor.steps.video.transcoding_step import VideoTranscoderStep from preprocessor.steps.vision.character_detection_step import CharacterDetectorStep +from preprocessor.steps.vision.character_reference_processor_step import CharacterReferenceProcessorStep from preprocessor.steps.vision.embeddings_step import VideoEmbeddingStep from preprocessor.steps.vision.emotion_detection_step import EmotionDetectionStep from preprocessor.steps.vision.face_clustering_step import FaceClusteringStep @@ -69,7 +71,7 @@ VALIDATION = Phase("VALIDATION", color="magenta") -def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals +def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals,too-many-statements series_config = SeriesConfig.load(series_name) # ========================================================= @@ -119,7 +121,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t character_references = StepBuilder( phase=SCRAPING, step_class=CharacterReferenceStep, - description="Downloads and processes character reference images", + description="Downloads character reference images from the web", produces=[ DirectoryOutput( pattern="character_faces", @@ -136,6 +138,23 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) + character_reference_vectors = StepBuilder( + phase=SCRAPING, + step_class=CharacterReferenceProcessorStep, + description="Processes character reference images into face embedding vectors", + produces=[ + DirectoryOutput( + pattern="character_references_processed", + subdir="", + expected_file_pattern="**/face_vector.npy", + min_files=1, + min_size_per_file_bytes=100, + ), + ], + 
needs=[character_references], + config=CharacterReferenceProcessorConfig(), + ) + # ========================================================= # PROCESSING PHASE: VIDEO # ========================================================= @@ -346,8 +365,8 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t min_size_bytes=10, ), ], - needs=[exported_frames], - config=CharacterDetectionConfig(threshold=0.7), + needs=[exported_frames, character_reference_vectors], + config=CharacterDetectionConfig(threshold=0.45, max_parallel_episodes=4), ) emotion_data = StepBuilder( @@ -471,6 +490,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(episodes_metadata) pipeline.register(characters_metadata) pipeline.register(character_references) + pipeline.register(character_reference_vectors) pipeline.register(resolution_analysis) pipeline.register(transcoded_videos) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 1af70621c..7680e5144 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -72,8 +72,8 @@ class FrameExportConfig(BaseModel): frames_per_scene: int = Field(default=1, ge=1) keyframe_strategy: KeyframeStrategy = KeyframeStrategy.SCENE_CHANGES - max_parallel_episodes: int = Field(default=4, ge=1, le=8) - max_parallel_frames: int = Field(default=4, ge=1, le=16) + max_parallel_episodes: int = Field(default=4, ge=1, le=16) + max_parallel_frames: int = Field(default=12, ge=1, le=32) resolution: Resolution = Field(default=Resolution.R720P) scene_change_offset_seconds: float = Field(default=0.5, ge=0) @@ -157,8 +157,8 @@ class AudioExtractionConfig(BaseModel): class CharacterDetectionConfig(BaseModel): - max_parallel_episodes: int = Field(default=2, ge=1, le=4) - threshold: float = Field(default=0.7, ge=0.0, le=1.0) + max_parallel_episodes: int = Field(default=4, ge=1, le=8) + threshold: float = Field(default=0.55, ge=0.0, 
le=1.0) class EmotionDetectionConfig(BaseModel): @@ -170,7 +170,10 @@ class FaceClusteringConfig(BaseModel): class ObjectDetectionConfig(BaseModel): + batch_size: int = Field(default=8, ge=1) + conf_threshold: float = Field(default=0.3, ge=0.0, le=1.0) max_parallel_episodes: int = Field(default=2, ge=1, le=4) + model_name: str = 'ustc-community/dfine-xlarge-obj2coco' class ArchiveConfig(BaseModel): @@ -202,3 +205,7 @@ class CharacterReferenceConfig(BaseModel): images_per_character: int = Field(default=5, ge=1, le=20) max_parallel_episodes: int = Field(default=4, ge=1, le=8) search_engine: str = "duckduckgo" + + +class CharacterReferenceProcessorConfig(BaseModel): + similarity_threshold: float = Field(default=0.45, ge=0.0, le=1.0) diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index a9872a2f0..6f27f2c86 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -79,6 +79,16 @@ def load_all_from_cache( results.append(result if result else inp) return results + def all_outputs_exist( + self, input_list: List[InputT], context: ExecutionContext, + ) -> bool: + if context.force_rerun: + return False + try: + return all(self._get_cache_path(inp, context).exists() for inp in input_list) + except NotImplementedError: + return False + def should_skip_execution( self, episode_id: str, diff --git a/preprocessor/services/media/ffmpeg.py b/preprocessor/services/media/ffmpeg.py index 5aff5db1c..d4038d661 100644 --- a/preprocessor/services/media/ffmpeg.py +++ b/preprocessor/services/media/ffmpeg.py @@ -258,6 +258,7 @@ def extract_frame_at_timestamp(video_path: Path, timestamp: float) -> Image.Imag raise ValueError(f'No frame data extracted at timestamp {timestamp}s from {video_path}') return Image.open(BytesIO(result.stdout)) + @staticmethod def get_keyframe_timestamps(video_path: Path) -> List[float]: cmd = [ diff --git a/preprocessor/services/scraping/grid_visualizer.py b/preprocessor/services/scraping/grid_visualizer.py 
index 8b53b44e5..a927bd44a 100644 --- a/preprocessor/services/scraping/grid_visualizer.py +++ b/preprocessor/services/scraping/grid_visualizer.py @@ -9,6 +9,7 @@ Optional, Tuple, ) +import unicodedata import cv2 import numpy as np @@ -191,7 +192,7 @@ def __render_single_row( char_name = char_dir.name.replace('_', ' ').title() cv2.putText( - canvas, char_name, + canvas, self.__ascii_safe(char_name), (self.__dims.padding * 2, y_offset + self.__dims.face_size // 2), cv2.FONT_HERSHEY_SIMPLEX, 0.55, (30, 40, 50), 1, cv2.LINE_AA, ) @@ -318,6 +319,11 @@ def __calculate_avg_similarity(metadata_all: List[Dict[str, Any]]) -> float: return 0.0 return float(np.mean([m.get('average_similarity', 0) for m in metadata_all])) + @staticmethod + def __ascii_safe(text: str) -> str: + text = text.translate(str.maketrans('łŁ', 'lL')) + return unicodedata.normalize('NFD', text).encode('ascii', 'ignore').decode('ascii') + @staticmethod def safe_resize(img: np.ndarray, target_size: Tuple[int, int]) -> Optional[np.ndarray]: if img is None or img.size == 0: diff --git a/preprocessor/services/scraping/reference_processor.py b/preprocessor/services/scraping/reference_processor.py index ceb4b6895..137e6d0be 100644 --- a/preprocessor/services/scraping/reference_processor.py +++ b/preprocessor/services/scraping/reference_processor.py @@ -104,12 +104,14 @@ def _process_item(self, item: ProcessingItem, _missing_outputs: List[OutputSpec] console.print(f'[yellow]Skipping {char_name}: no faces detected[/yellow]') return - selected_faces = self.__find_common_face(all_faces) - if not selected_faces: + result = self.__find_common_face(all_faces) + if not result: console.print(f'[yellow]Skipping {char_name}: could not identify common face[/yellow]') return - self.__save_processed_references(char_name, selected_faces, ref_images) + selected_faces, avg_similarity = result + faces_per_image = [len(faces) for faces in all_faces] + self.__save_processed_references(char_name, selected_faces, ref_images, 
avg_similarity, faces_per_image) console.print(f'[green]Processed {char_name}[/green]') def __generate_validation_grid(self) -> None: @@ -154,16 +156,16 @@ def __detect_faces_in_references(self, image_paths: List[Path]) -> List[List[Fac def __find_common_face( self, all_faces: List[List[FaceData]], - ) -> Optional[List[FaceData]]: + ) -> Optional[Tuple[List[FaceData], float]]: first_faces = all_faces[0] candidates = self.__find_face_candidates(first_faces, all_faces[1:], all_faces) if len(candidates) == 1: - return candidates[0].faces + return candidates[0].faces, candidates[0].avg_similarity if len(candidates) > 1 and not self.__interactive: candidates.sort(key=lambda c: c.avg_similarity, reverse=True) - return candidates[0].faces + return candidates[0].faces, candidates[0].avg_similarity return None @@ -199,7 +201,12 @@ def __get_best_match(ref_face: FaceData, candidates: List[FaceData]) -> Tuple[Op return best_match, best_sim def __save_processed_references( - self, char_name: str, selected_faces: List[FaceData], ref_images: List[Path], + self, + char_name: str, + selected_faces: List[FaceData], + ref_images: List[Path], + avg_similarity: float, + faces_per_image: List[int], ) -> None: char_out = self.__output_dir / char_name char_out.mkdir(parents=True, exist_ok=True) @@ -217,21 +224,35 @@ def __save_processed_references( mean_vector = np.mean(face_vectors, axis=0) np.save(char_out / 'face_vector.npy', mean_vector) - metadata = self.__create_metadata(char_name, selected_faces, ref_images, mean_vector) + metadata = self.__create_metadata( + char_name, selected_faces, ref_images, mean_vector, avg_similarity, faces_per_image, + ) with open(char_out / 'metadata.json', 'w', encoding='utf-8') as f: json.dump(metadata, f, indent=2, ensure_ascii=False) - def __create_metadata(self, name: str, faces: List[FaceData], refs: List[Path], mean_vec: np.ndarray) -> Dict[ - str, Any, - ]: + def __create_metadata( + self, + name: str, + faces: List[FaceData], + refs: List[Path], 
+ mean_vec: np.ndarray, + avg_similarity: float, + faces_per_image: List[int], + ) -> Dict[str, Any]: return { 'character_name': name.replace('_', ' ').title(), 'source_images': [str(img) for img in refs], 'processed_at': datetime.now().isoformat(), + 'average_similarity': avg_similarity, 'processing_params': { 'similarity_threshold': self.__similarity_threshold, 'face_model': settings.face_recognition.model_name, }, + 'detection_stats': { + 'total_faces_detected': faces_per_image, + 'candidates_found': 1, + 'selection_method': 'automatic', + }, 'selected_face_indices': [f.source_image_idx for f in faces], 'face_vector_dim': int(mean_vec.shape[0]), } diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index 22313bf75..fd46357f4 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -1,3 +1,4 @@ +import gc from pathlib import Path from typing import ( Any, @@ -20,54 +21,102 @@ class EmbeddingService: - def __init__(self) -> None: + def __init__(self, model_name: Optional[str] = None, device: str = 'cuda') -> None: + self.__model_name: str = model_name or settings.embedding_model.model_name + self.__device = device self.__model: Optional[AutoModelForVision2Seq] = None self.__processor: Optional[AutoProcessor] = None - self.__device: str = 'cuda' + + def ensure_loaded(self) -> None: + if self.__model is None: + self.__load_resources() def cleanup(self) -> None: if self.__model is not None: del self.__model del self.__processor self.__model = self.__processor = None + gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() - def get_image_embedding(self, image_path: Union[str, Path]) -> List[float]: + def get_image_embeddings_batch(self, image_paths: List[Union[str, Path]]) -> List[List[float]]: model, processor, device = self.__get_model() - messages = [{ - 'role': 'user', 'content': [ - {'type': 
'image', 'image': str(image_path)}, - {'type': 'text', 'text': 'Describe this image.'}, - ], - }] - image_inputs, video_inputs = process_vision_info(messages) - prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) + messages_batch = [ + [{ + 'role': 'user', 'content': [ + {'type': 'image', 'image': str(path)}, + {'type': 'text', 'text': 'Describe this image.'}, + ], + }] + for path in image_paths + ] + + all_image_inputs: List[Any] = [] + prompts: List[str] = [] + for messages in messages_batch: + image_inputs, _ = process_vision_info(messages) + all_image_inputs.extend(image_inputs) + prompts.append( + processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True), + ) inputs = processor( - text=[prompt], images=image_inputs, videos=video_inputs, padding=True, + text=prompts, + images=all_image_inputs, + padding=True, return_tensors='pt', ).to(device) - return self.__compute_normalized_embedding(model, inputs) - def get_text_embedding(self, text: str) -> List[float]: + return self.__compute_batch_embeddings(model, inputs, len(image_paths)) + + def get_text_embeddings_batch(self, texts: List[str]) -> List[List[float]]: model, processor, device = self.__get_model() - messages = [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] - inputs = processor.apply_chat_template( - messages, add_generation_prompt=True, tokenize=True, + messages_batch = [ + [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] + for text in texts + ] + prompts: List[str] = [ + processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True) + for msgs in messages_batch + ] + + inputs = processor( + text=prompts, + padding=True, return_tensors='pt', ).to(device) - return self.__compute_normalized_embedding(model, {'input_ids': inputs}) + + return self.__compute_batch_embeddings(model, inputs, len(texts)) @staticmethod - def __compute_normalized_embedding(model: Any, inputs: Dict[str, Any]) -> 
List[float]: + def __compute_batch_embeddings( + model: Any, + inputs: Dict[str, Any], + count: int, + ) -> List[List[float]]: with torch.no_grad(): output = model(**inputs, output_hidden_states=True) - embedding = output.hidden_states[-1][:, -1, :].squeeze(0) - embedding = torch.nn.functional.normalize(embedding, p=2, dim=0) - return embedding.float().cpu().numpy().tolist() + hidden = output.hidden_states[-1] + + attention_mask = inputs.get('attention_mask') + if attention_mask is not None: + last_positions = attention_mask.sum(dim=1) - 1 + embeddings = torch.stack([ + hidden[i, last_positions[i], :] for i in range(count) + ]) + else: + embeddings = hidden[:, -1, :] + + embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1) + + result = [emb.float().cpu().numpy().tolist() for emb in embeddings] + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + return result def __get_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: if self.__model is None: @@ -79,6 +128,7 @@ def __load_resources(self) -> None: if not torch.cuda.is_available(): raise RuntimeError('CUDA required for multimodal embeddings.') - model_name = settings.embedding_model.model_name - self.__model = AutoModelForVision2Seq.from_pretrained(model_name, dtype=torch.bfloat16, device_map='auto') - self.__processor = AutoProcessor.from_pretrained(model_name) + self.__model = AutoModelForVision2Seq.from_pretrained( + self.__model_name, dtype=torch.bfloat16, device_map='auto', + ) + self.__processor = AutoProcessor.from_pretrained(self.__model_name) diff --git a/preprocessor/services/search/embedding_model.py b/preprocessor/services/search/embedding_model.py index 80fa7bb56..7a788c17d 100644 --- a/preprocessor/services/search/embedding_model.py +++ b/preprocessor/services/search/embedding_model.py @@ -11,30 +11,23 @@ class EmbeddingModelWrapper: def __init__( self, - _model_name: str, - _device: str = 'cuda', + model_name: str, + device: str = 'cuda', 
_batch_size: int = 8, ) -> None: - self.__service = EmbeddingService() + self.__service = EmbeddingService(model_name=model_name, device=device) def load_model(self) -> None: - pass + self.__service.ensure_loaded() def cleanup(self) -> None: - pass + self.__service.cleanup() def encode_text(self, text: Union[str, List[str]]) -> Union[List[float], List[List[float]]]: if isinstance(text, str): - return self.__service.get_text_embedding(text) - - return self.__process_batch_encoding(text) + return self.__service.get_text_embeddings_batch([text])[0] + return self.__service.get_text_embeddings_batch(text) def encode_images(self, image_paths: List[str]) -> List[np.ndarray]: - embeddings: List[np.ndarray] = [] - for path in image_paths: - embedding = self.__service.get_image_embedding(path) - embeddings.append(np.array(embedding)) - return embeddings - - def __process_batch_encoding(self, texts: List[str]) -> List[List[float]]: - return [self.__service.get_text_embedding(t) for t in texts] + embeddings_list = self.__service.get_image_embeddings_batch(image_paths) + return [np.array(e) for e in embeddings_list] diff --git a/preprocessor/steps/video/frame_export_step.py b/preprocessor/steps/video/frame_export_step.py index d5ec2f00f..ca0034b16 100644 --- a/preprocessor/steps/video/frame_export_step.py +++ b/preprocessor/steps/video/frame_export_step.py @@ -133,7 +133,7 @@ def __process_frame_extraction( context: ExecutionContext, ) -> None: try: - self.__extract_frames( + fps = self.__extract_frames( video_path, frame_requests, episode_dir, @@ -146,6 +146,7 @@ def __process_frame_extraction( video_path, context, metadata_file, + fps, ) except (Exception, KeyboardInterrupt) as e: error_type = "interrupted" if isinstance(e, KeyboardInterrupt) else "failed" @@ -160,77 +161,87 @@ def __extract_frames( episode_dir: Path, episode_info, context: ExecutionContext, - ) -> None: + ) -> float: video_metadata = self.__fetch_video_metadata(video_file) dar = 
self.__calculate_display_aspect_ratio(video_metadata) + fps = self.__get_fps(video_metadata) - context.logger.info(f'Finding I-frames (keyframes) in {video_file.name}') keyframes = self.__get_all_keyframes(video_file) - context.logger.info(f'Found {len(keyframes)} I-frames') + context.logger.info(f'Found {len(keyframes)} I-frames in {video_file.name}') - for req in frame_requests: - target_timestamp = req['timestamp'] - snapped_timestamp = self.__snap_to_keyframe(keyframes, target_timestamp) - - if abs(snapped_timestamp - target_timestamp) > 0.1: - context.logger.debug( - f'Snapped {target_timestamp:.3f}s -> {snapped_timestamp:.3f}s ' - f'(I-frame, delta: {snapped_timestamp - target_timestamp:.3f}s)', - ) - - req['timestamp'] = snapped_timestamp - req['original_timestamp'] = target_timestamp - req['snapped_to_keyframe'] = True - req['frame_number'] = int(snapped_timestamp * 1000) - - seen_ms: set[int] = set() - unique_timestamps: List[float] = [] - for req in frame_requests: - ts_ms = int(req['timestamp'] * 1000) - if ts_ms not in seen_ms: - seen_ms.add(ts_ms) - unique_timestamps.append(req['timestamp']) + unique_requests = self.__snap_and_deduplicate(frame_requests, keyframes, fps, context) with ThreadPoolExecutor(max_workers=self.config.max_parallel_frames) as executor: futures = [ executor.submit( - self.__extract_and_save_frame, - video_file, - timestamp, - episode_dir, - episode_info, - dar, - context.series_name, + self.__extract_resize_save_frame, + video_file, req['timestamp'], req['frame_number'], + episode_dir, episode_info, dar, context.series_name, ) - for timestamp in unique_timestamps + for req in unique_requests ] for future in futures: future.result() - def __extract_and_save_frame( + return fps + + def __snap_and_deduplicate( + self, + frame_requests: List[FrameRequest], + keyframes: List[float], + fps: float, + context: ExecutionContext, + ) -> List[FrameRequest]: + for req in frame_requests: + target = req['timestamp'] + snapped = 
self.__snap_to_keyframe(keyframes, target) + if abs(snapped - target) > 0.1: + context.logger.debug( + f'Snapped {target:.3f}s -> {snapped:.3f}s (delta: {snapped - target:.3f}s)', + ) + req['timestamp'] = snapped + req['original_timestamp'] = target + req['snapped_to_keyframe'] = True + req['frame_number'] = round(snapped * fps) + + seen: set[int] = set() + unique: List[FrameRequest] = [] + for req in frame_requests: + if req['frame_number'] not in seen: + seen.add(req['frame_number']) + unique.append(req) + return unique + + def __extract_resize_save_frame( self, video_file: Path, timestamp: float, + frame_number: int, episode_dir: Path, episode_info, dar: float, series_name: str, ) -> None: - frame_pil = self.__extract_frame_at_timestamp(video_file, timestamp) - resized = self.__resize_frame(frame_pil, dar) + image = FFmpegWrapper.extract_frame_at_timestamp(video_file, timestamp) + self.__resize_and_save_frame(image, frame_number, episode_dir, episode_info, dar, series_name) + def __resize_and_save_frame( + self, + image: Image.Image, + frame_number: int, + episode_dir: Path, + episode_info, + dar: float, + series_name: str, + ) -> None: + resized = self.__resize_frame(image, dar) base_filename = f'{series_name}_{episode_info.episode_code()}' - timestamp_ms = int(timestamp * 1000) - filename = f'{base_filename}_frame_{timestamp_ms:08d}.jpg' + filename = f'{base_filename}_frame_{frame_number:06d}.jpg' final_path = episode_dir / filename with StepTempFile(final_path) as temp_path: resized.save(temp_path, format='JPEG', quality=90) - @staticmethod - def __extract_frame_at_timestamp(video_file: Path, timestamp: float) -> Image.Image: - return FFmpegWrapper.extract_frame_at_timestamp(video_file, timestamp) - def __resize_frame( self, frame: Image.Image, display_aspect_ratio: float, ) -> Image.Image: @@ -267,6 +278,7 @@ def __write_metadata( source_video: Path, context: ExecutionContext, metadata_file: Path, + fps: float, ) -> None: frame_types_count: Dict[str, int] = 
{} frames_with_paths: List[Dict[str, Any]] = [] @@ -277,8 +289,7 @@ def __write_metadata( frame_types_count[frame_type] = frame_types_count.get(frame_type, 0) + 1 frame_with_path = frame.copy() - timestamp_ms = int(frame['timestamp'] * 1000) - frame_with_path['frame_path'] = f'{base_filename}_frame_{timestamp_ms:08d}.jpg' + frame_with_path['frame_path'] = f'{base_filename}_frame_{frame["frame_number"]:06d}.jpg' frames_with_paths.append(frame_with_path) scene_numbers = { @@ -298,6 +309,7 @@ def __write_metadata( 'processing_parameters': { 'frame_width': self.config.resolution.width, 'frame_height': self.config.resolution.height, + 'fps': fps, 'keyframe_strategy': self.config.keyframe_strategy.value, 'frames_per_scene': self.config.frames_per_scene, }, @@ -345,6 +357,13 @@ def __construct_empty_result( metadata_path=metadata_file, ) + @staticmethod + def __get_fps(stream: Dict[str, Any]) -> float: + r_frame_rate: str = stream.get('r_frame_rate', '25/1') + parts = r_frame_rate.split('/') + num, denom = int(parts[0]), int(parts[1]) if len(parts) > 1 else 1 + return num / denom if denom != 0 else 25.0 + @staticmethod def __fetch_video_metadata(video_path: Path) -> Dict[str, Any]: probe_data = FFmpegWrapper.probe_video(video_path) diff --git a/preprocessor/steps/vision/character_detection_step.py b/preprocessor/steps/vision/character_detection_step.py index 834e32e7d..91ad1d15a 100644 --- a/preprocessor/steps/vision/character_detection_step.py +++ b/preprocessor/steps/vision/character_detection_step.py @@ -1,3 +1,4 @@ +# pylint: disable=duplicate-code from pathlib import Path from typing import ( Any, @@ -7,6 +8,7 @@ import numpy as np +from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.step_configs import CharacterDetectionConfig from preprocessor.core.artifacts import ( DetectionResults, @@ -116,13 +118,10 @@ def __prepare_detection_environment(self, context: ExecutionContext) -> None: self.__load_character_references(context) def 
__load_character_references(self, context: ExecutionContext) -> None: - characters_dir: Path = ( - Path('preprocessor/output_data') / context.series_name / 'characters' - ) + base_dir = get_base_output_dir(context.series_name) + characters_dir: Path = base_dir / 'character_references_processed' if not characters_dir.exists(): - characters_dir = ( - Path('preprocessor/input_data') / context.series_name / 'characters' - ) + characters_dir = base_dir / 'character_faces' if characters_dir.exists(): context.logger.info(f'Loading character references from {characters_dir}') @@ -202,6 +201,6 @@ def __count_characters(results: List[Dict[str, Any]]) -> Dict[str, int]: counts: Dict[str, int] = {} for res in results: for face in res.get('faces', []): - name: str = face.get('character_name', 'unknown') + name: str = face.get('name', 'unknown') counts[name] = counts.get(name, 0) + 1 return counts diff --git a/preprocessor/steps/vision/character_reference_processor_step.py b/preprocessor/steps/vision/character_reference_processor_step.py new file mode 100644 index 000000000..bf39298ca --- /dev/null +++ b/preprocessor/steps/vision/character_reference_processor_step.py @@ -0,0 +1,92 @@ +# pylint: disable=duplicate-code +from pathlib import Path +from typing import ( + List, + Tuple, +) + +from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.config.step_configs import CharacterReferenceProcessorConfig +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + DirectoryOutput, + OutputDescriptor, +) +from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor + + +class CharacterReferenceProcessorStep( + PipelineStep[SourceVideo, SourceVideo, CharacterReferenceProcessorConfig], +): + @property + def is_global(self) -> bool: + return True + + def 
get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + DirectoryOutput( + pattern="character_references_processed", + subdir="", + expected_file_pattern="**/face_vector.npy", + min_files=1, + min_size_per_file_bytes=100, + ), + ] + + def _get_cache_path( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> Path: + _, output_dir = self.__resolve_paths(context) + return output_dir + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: + context.logger.info(f"Character reference vectors already exist in: {cache_path}") + return input_data + + def _process( + self, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: + characters_dir, output_dir = self.__resolve_paths(context) + self.__validate_input_directory(characters_dir) + self.__run_reference_processor(characters_dir, output_dir, context) + return input_data + + @staticmethod + def __resolve_paths(context: ExecutionContext) -> Tuple[Path, Path]: + base_dir = get_base_output_dir(context.series_name) + return base_dir / 'character_faces', base_dir / 'character_references_processed' + + def __run_reference_processor( + self, + characters_dir: Path, + output_dir: Path, + context: ExecutionContext, + ) -> None: + context.logger.info(f"Processing character reference images from {characters_dir}") + + processor = CharacterReferenceProcessor({ + 'characters_dir': characters_dir, + 'output_dir': output_dir, + 'similarity_threshold': self.config.similarity_threshold, + 'interactive': False, + }) + + exit_code = processor.work() + if exit_code != 0: + raise RuntimeError( + f"Character reference processor failed with exit code {exit_code}", + ) + + context.logger.info(f"Character reference vectors saved to: {output_dir}") + + @staticmethod + def __validate_input_directory(characters_dir: Path) -> None: + if not characters_dir.exists(): + raise FileNotFoundError( + f"Character faces directory not found: 
{characters_dir}. " + f"Run character_reference step first.", + ) diff --git a/preprocessor/steps/vision/emotion_detection_step.py b/preprocessor/steps/vision/emotion_detection_step.py index 26d31d96d..bb7ad69f5 100644 --- a/preprocessor/steps/vision/emotion_detection_step.py +++ b/preprocessor/steps/vision/emotion_detection_step.py @@ -170,7 +170,7 @@ def __collect_face_crops( context.logger.info(f'Collecting {total_faces} faces for batch emotion analysis') for detection_idx, detection in enumerate(detections): - frame_file = detection.get('frame_file') + frame_file = detection.get('frame') if not frame_file: continue diff --git a/preprocessor/steps/vision/image_hashing_step.py b/preprocessor/steps/vision/image_hashing_step.py index d62cfdc6b..0d29e4882 100644 --- a/preprocessor/steps/vision/image_hashing_step.py +++ b/preprocessor/steps/vision/image_hashing_step.py @@ -109,6 +109,10 @@ def __prepare_hasher(self, context: ExecutionContext) -> None: context.logger.info(f'Loading image hasher on {self.config.device}...') self.__hasher = PerceptualHasher(device=self.config.device) + @staticmethod + def __parse_frame_number(request: Dict[str, Any]) -> int: + return int(request['frame_number']) + def __compute_hashes( self, frame_requests: List[Dict[str, Any]], @@ -124,6 +128,7 @@ def __compute_hashes( for request, phash in zip(batch, phashes): result: Dict[str, Any] = request.copy() + result['frame_number'] = self.__parse_frame_number(request) result['perceptual_hash'] = phash hash_results.append(result) diff --git a/preprocessor/steps/vision/object_detection_step.py b/preprocessor/steps/vision/object_detection_step.py index 4549a55a0..c7d653b96 100644 --- a/preprocessor/steps/vision/object_detection_step.py +++ b/preprocessor/steps/vision/object_detection_step.py @@ -1,8 +1,18 @@ # pylint: disable=duplicate-code +import gc from pathlib import Path from typing import ( + Any, Dict, List, + Optional, +) + +from PIL import Image +import torch +from transformers import ( 
+ AutoImageProcessor, + DFineForObjectDetection, ) from preprocessor.config.step_configs import ObjectDetectionConfig @@ -16,6 +26,7 @@ JsonFileOutput, OutputDescriptor, ) +from preprocessor.services.io.files import FileOperations class ObjectDetectionStep( @@ -23,7 +34,8 @@ class ObjectDetectionStep( ): def __init__(self, config: ObjectDetectionConfig) -> None: super().__init__(config) - self.__model = None + self.__model: Optional[DFineForObjectDetection] = None + self.__image_processor: Optional[AutoImageProcessor] = None @property def supports_batch_processing(self) -> bool: @@ -39,6 +51,7 @@ def teardown_resources(self, context: ExecutionContext) -> None: def cleanup(self) -> None: self.__model = None + self.__image_processor = None def execute_batch( self, input_data: List[FrameCollection], context: ExecutionContext, @@ -51,8 +64,25 @@ def _process( self, input_data: FrameCollection, context: ExecutionContext, ) -> ObjectDetectionData: output_path = self._get_cache_path(input_data, context) - # Main processing logic would go here - return self.__construct_object_data(input_data, output_path) + self.__ensure_model_loaded(context) + + frame_files = self.__extract_frame_files(input_data) + if not frame_files: + context.logger.warning(f'No frame files found in {input_data.directory}') + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) + + detections = self.__process_batches(frame_files) + self.__save_results(detections, output_path, input_data, context, frame_files) + + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) def get_output_descriptors(self) -> List[OutputDescriptor]: return [ @@ -75,16 +105,111 @@ def _get_cache_path( def _load_from_cache( self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, ) -> ObjectDetectionData: - return self.__construct_object_data(input_data, 
cache_path) + return ObjectDetectionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + ) - @staticmethod - def __load_model(context: ExecutionContext) -> None: - context.logger.info('Loading Object Detection model...') - # Model loading logic implementation + def __ensure_model_loaded(self, context: ExecutionContext) -> None: + if self.__model is None: + self.__load_model(context) + + def __load_model(self, context: ExecutionContext) -> None: + if not torch.cuda.is_available(): + raise RuntimeError('CUDA is not available. Object detection requires GPU.') + + context.logger.info(f'Loading D-FINE model: {self.config.model_name}') + self.__image_processor = AutoImageProcessor.from_pretrained(self.config.model_name) + self.__model = DFineForObjectDetection.from_pretrained(self.config.model_name) + self.__model.to('cuda') + context.logger.info('D-FINE model loaded on GPU') def __unload_model(self, context: ExecutionContext) -> None: context.logger.info('Object Detection model unloaded') self.__model = None + self.__image_processor = None + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + def __process_batches(self, frame_files: List[Path]) -> List[Dict[str, Any]]: + detections: List[Dict[str, Any]] = [] + + for batch_start in range(0, len(frame_files), self.config.batch_size): + batch_paths = frame_files[batch_start:batch_start + self.config.batch_size] + batch_detections = self.__process_single_batch(batch_paths) + detections.extend(batch_detections) + + return detections + + def __process_single_batch(self, batch_paths: List[Path]) -> List[Dict[str, Any]]: + batch_images = [Image.open(fp) for fp in batch_paths] + target_sizes = [(img.height, img.width) for img in batch_images] + + inputs = self.__image_processor(images=batch_images, return_tensors='pt') + inputs = {k: v.to('cuda') for k, v in inputs.items()} + + with torch.no_grad(): + outputs = self.__model(**inputs) + + results = 
self.__image_processor.post_process_object_detection( + outputs, + target_sizes=target_sizes, + threshold=self.config.conf_threshold, + ) + + batch_detections = [] + for frame_path, result in zip(batch_paths, results): + frame_entry = self.__build_frame_entry(frame_path, result) + if frame_entry['objects']: + batch_detections.append(frame_entry) + + for img in batch_images: + img.close() + + return batch_detections + + def __build_frame_entry( + self, frame_path: Path, result: Dict[str, Any], + ) -> Dict[str, Any]: + objects: List[Dict[str, Any]] = [] + for score, label_id, box in zip(result['scores'], result['labels'], result['boxes']): + box_coords = [float(v) for v in box.tolist()] + objects.append({ + 'class_id': label_id.item(), + 'class_name': self.__model.config.id2label[label_id.item()], + 'confidence': score.item(), + 'bbox': { + 'x1': box_coords[0], + 'y1': box_coords[1], + 'x2': box_coords[2], + 'y2': box_coords[3], + }, + }) + return {'frame': frame_path.name, 'objects': objects} + + def __save_results( + self, + detections: List[Dict[str, Any]], + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + frame_files: List[Path], + ) -> None: + output_path.parent.mkdir(parents=True, exist_ok=True) + output_data: Dict[str, Any] = { + 'episode_id': input_data.episode_id, + 'series_name': context.series_name, + 'detection_settings': self.config.model_dump(), + 'statistics': { + 'total_frames_processed': len(frame_files), + 'frames_with_detections': len(detections), + 'object_counts': self.__count_objects(detections), + }, + 'detections': detections, + } + FileOperations.atomic_write_json(output_path, output_data) @staticmethod def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: @@ -94,11 +219,17 @@ def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: } @staticmethod - def __construct_object_data( - input_data: FrameCollection, output_path: Path, - ) -> ObjectDetectionData: - return 
ObjectDetectionData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - ) + def __extract_frame_files(input_data: FrameCollection) -> List[Path]: + return sorted([ + f for f in input_data.directory.glob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) + + @staticmethod + def __count_objects(detections: List[Dict[str, Any]]) -> Dict[str, int]: + counts: Dict[str, int] = {} + for frame in detections: + for obj in frame.get('objects', []): + name: str = obj.get('class_name', 'unknown') + counts[name] = counts.get(name, 0) + 1 + return counts From 48b97da7913d7bdf860e2f1d203f2770ecf3744b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 22 Feb 2026 14:01:09 +0100 Subject: [PATCH 57/89] Add embedding steps, face clusterer, and episode fallbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce new text embedding steps (sound event, full-episode, episode-name) and register them in the pipeline; add corresponding config classes and tweak TextEmbedding defaults for larger chunks. Add a GPU-backed FaceClusterer service (cuML/CuPy + insightface) and wire it into FaceClusteringStep with GPU cleanup and empty-output handling. Update pipeline builder and CLI helper to fall back to episodes.json (input or output) when no video files are present and add EpisodeManager.get_all_episodes to enumerate episodes. Patch HSEmotion to prefer CUDAExecutionProvider when available. Improve sentence splitting logic to merge short fragments. Export FaceClusterer from services.characters. Note: constants renamed (season_number, episode_in_season) — this may require downstream data/metadata migrations. 
--- preprocessor/app/pipeline_builder.py | 35 ++- preprocessor/app/pipeline_factory.py | 65 ++++- preprocessor/cli/helpers.py | 11 +- preprocessor/config/constants.py | 4 +- preprocessor/config/step_configs.py | 27 ++- preprocessor/services/characters/__init__.py | 9 +- .../services/characters/face_clusterer.py | 138 +++++++++++ .../services/episodes/episode_manager.py | 24 ++ preprocessor/services/video/emotion_utils.py | 30 ++- preprocessor/steps/text/embeddings_step.py | 14 +- .../steps/text/episode_name_embedding_step.py | 165 +++++++++++++ .../steps/text/full_episode_embedding_step.py | 224 ++++++++++++++++++ .../steps/text/sound_event_embedding_step.py | 223 +++++++++++++++++ .../steps/vision/face_clustering_step.py | 122 +++++++--- 14 files changed, 1044 insertions(+), 47 deletions(-) create mode 100644 preprocessor/services/characters/face_clusterer.py create mode 100644 preprocessor/steps/text/episode_name_embedding_step.py create mode 100644 preprocessor/steps/text/full_episode_embedding_step.py create mode 100644 preprocessor/steps/text/sound_event_embedding_step.py diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index a02a032e9..8c95088f5 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -40,13 +40,23 @@ def __discover_source_videos( f"Discovered {len(video_files)} video files in {source_path}", ) + if video_files: + return self.__build_source_videos_from_files(video_files, episode_manager) + + self.__context.logger.info( + "No input files found — building episode list from episodes.json", + ) + return self.__build_source_videos_from_episodes(episode_manager) + + def __build_source_videos_from_files( + self, video_files: List[Path], episode_manager: EpisodeManager, + ) -> List[SourceVideo]: source_videos: List[SourceVideo] = [] for video_file in video_files: episode_info = episode_manager.parse_filename(video_file) if not episode_info: 
self.__context.logger.warning(f"Cannot parse: {video_file}") continue - episode_id = episode_manager.get_episode_id_for_state(episode_info) source_videos.append( SourceVideo( @@ -55,9 +65,30 @@ def __discover_source_videos( episode_info=episode_info, ), ) - return source_videos + def __build_source_videos_from_episodes( + self, episode_manager: EpisodeManager, + ) -> List[SourceVideo]: + all_episodes = episode_manager.get_all_episodes() + if not all_episodes: + self.__context.logger.warning( + "No episodes in episodes.json and no input files — nothing to process", + ) + return [] + + self.__context.logger.info( + f"Building source from {len(all_episodes)} episodes in episodes.json", + ) + return [ + SourceVideo( + path=Path(''), + episode_id=episode_manager.get_episode_id_for_state(ep), + episode_info=ep, + ) + for ep in all_episodes + ] + def __execute_step_with_registry( self, pipeline: "PipelineDefinition", diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 95d50f1f8..341bad3dd 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -15,13 +15,16 @@ DocumentGenerationConfig, ElasticsearchConfig, EmotionDetectionConfig, + EpisodeNameEmbeddingConfig, EpisodeScraperConfig, FaceClusteringConfig, FrameExportConfig, + FullEpisodeEmbeddingConfig, ImageHashConfig, ObjectDetectionConfig, ResolutionAnalysisConfig, SceneDetectionConfig, + SoundEventEmbeddingConfig, SoundEventsConfig, SoundSeparationConfig, TextAnalysisConfig, @@ -49,6 +52,9 @@ from preprocessor.steps.search.indexing_step import ElasticsearchIndexerStep from preprocessor.steps.text.analysis_step import TextAnalysisStep from preprocessor.steps.text.embeddings_step import TextEmbeddingStep +from preprocessor.steps.text.episode_name_embedding_step import EpisodeNameEmbeddingStep +from preprocessor.steps.text.full_episode_embedding_step import FullEpisodeEmbeddingStep +from preprocessor.steps.text.sound_event_embedding_step 
import SoundEventEmbeddingStep from preprocessor.steps.text.sound_events_step import SoundEventsStep from preprocessor.steps.text.text_cleaning_step import TextCleaningStep from preprocessor.steps.text.transcription_step import TranscriptionStep @@ -315,8 +321,60 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t model_name="Qwen/Qwen3-VL-Embedding-8B", batch_size=8, device="cuda", - text_sentences_per_chunk=5, - text_chunk_overlap=1, + text_sentences_per_chunk=8, + text_chunk_overlap=3, + ), + ) + + sound_event_embeddings = StepBuilder( + phase=PROCESSING, + step_class=SoundEventEmbeddingStep, + description="Generates sound event embeddings using Qwen3-VL-Embedding", + produces=[ + FileOutput( + pattern="{season}/{episode}.json", + min_size_bytes=1024, + ), + ], + needs=[sound_events], + config=SoundEventEmbeddingConfig( + model_name="Qwen/Qwen3-VL-Embedding-8B", + batch_size=64, + device="cuda", + ), + ) + + full_episode_embeddings = StepBuilder( + phase=PROCESSING, + step_class=FullEpisodeEmbeddingStep, + description="Generates full episode embedding using Qwen3-VL-Embedding", + produces=[ + FileOutput( + pattern="{season}/{episode}.json", + min_size_bytes=1024, + ), + ], + needs=[text_cleaning], + config=FullEpisodeEmbeddingConfig( + model_name="Qwen/Qwen3-VL-Embedding-8B", + device="cuda", + ), + ) + + episode_name_embeddings = StepBuilder( + phase=PROCESSING, + step_class=EpisodeNameEmbeddingStep, + description="Generates episode title embedding using Qwen3-VL-Embedding", + produces=[ + FileOutput( + pattern="{season}/{episode}.json", + min_size_bytes=1024, + ), + ], + needs=[text_cleaning], + config=EpisodeNameEmbeddingConfig( + model_name="Qwen/Qwen3-VL-Embedding-8B", + device="cuda", ), ) @@ -504,6 +562,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(text_stats) pipeline.register(text_embeddings) + pipeline.register(sound_event_embeddings) + 
pipeline.register(full_episode_embeddings) + pipeline.register(episode_name_embeddings) pipeline.register(image_hashes) pipeline.register(video_embeddings) diff --git a/preprocessor/cli/helpers.py b/preprocessor/cli/helpers.py index e3f09d5d9..cf2813c32 100644 --- a/preprocessor/cli/helpers.py +++ b/preprocessor/cli/helpers.py @@ -57,9 +57,16 @@ def build( def __create_episode_manager( series: str, input_base: Path, logger: ErrorHandlingLogger, ) -> Optional[EpisodeManager]: - episodes_json: Optional[Path] = input_base / series / 'episodes.json' - if not episodes_json.exists(): + input_episodes = input_base / series / 'episodes.json' + output_episodes = PathService.get_output_base() / series / f'{series}_episodes.json' + + if input_episodes.exists(): + episodes_json: Optional[Path] = input_episodes + elif output_episodes.exists(): + episodes_json = output_episodes + else: episodes_json = None + return EpisodeManager(episodes_json, series, logger) @staticmethod diff --git a/preprocessor/config/constants.py b/preprocessor/config/constants.py index 7f25767b8..7a3316ce1 100644 --- a/preprocessor/config/constants.py +++ b/preprocessor/config/constants.py @@ -44,11 +44,11 @@ class EpisodesDataKeys: EPISODES = 'episodes' SEASONS = 'seasons' - SEASON_NUMBER = 'season' + SEASON_NUMBER = 'season_number' class EpisodeMetadataKeys: - EPISODE_NUMBER = 'episode_number' + EPISODE_NUMBER = 'episode_in_season' PREMIERE_DATE = 'premiere_date' TITLE = 'title' VIEWERSHIP = 'viewership' diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 7680e5144..fdb71dc17 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -110,8 +110,8 @@ class TextEmbeddingConfig(BaseModel): device: str = 'cuda' max_parallel_episodes: int = Field(default=1, ge=1, le=2) model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' - text_chunk_overlap: int = Field(default=1, ge=0) - text_sentences_per_chunk: int = Field(default=5, ge=1) + 
text_chunk_overlap: int = Field(default=3, ge=0) + text_sentences_per_chunk: int = Field(default=8, ge=1) class VideoEmbeddingConfig(BaseModel): @@ -121,6 +121,29 @@ class VideoEmbeddingConfig(BaseModel): model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' +class SoundEventEmbeddingConfig(BaseModel): + batch_size: int = Field(default=64, ge=1) + device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + segments_per_embedding: int = Field(default=5, ge=1) + + +class FullEpisodeEmbeddingConfig(BaseModel): + device: str = 'cuda' + max_chars_per_chunk: int = Field(default=6000, ge=100) + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + min_chunk_length: int = Field(default=100, ge=1) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + overlap_chars: int = Field(default=4500, ge=0) + + +class EpisodeNameEmbeddingConfig(BaseModel): + device: str = 'cuda' + max_parallel_episodes: int = Field(default=1, ge=1, le=2) + model_name: str = 'Qwen/Qwen3-VL-Embedding-8B' + + class SoundSeparationConfig(BaseModel): max_parallel_episodes: int = Field(default=4, ge=1, le=8) diff --git a/preprocessor/services/characters/__init__.py b/preprocessor/services/characters/__init__.py index 483361209..0aa7baa42 100644 --- a/preprocessor/services/characters/__init__.py +++ b/preprocessor/services/characters/__init__.py @@ -1,3 +1,4 @@ +from preprocessor.services.characters.face_clusterer import FaceClusterer from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( BaseImageSearch, @@ -5,4 +6,10 @@ GoogleImageSearch, ) -__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'FaceDetector', 'GoogleImageSearch'] +__all__ = [ + 'BaseImageSearch', + 'DuckDuckGoImageSearch', + 'FaceClusterer', + 'FaceDetector', + 'GoogleImageSearch', +] diff --git a/preprocessor/services/characters/face_clusterer.py 
b/preprocessor/services/characters/face_clusterer.py new file mode 100644 index 000000000..610bd564d --- /dev/null +++ b/preprocessor/services/characters/face_clusterer.py @@ -0,0 +1,138 @@ +from collections import defaultdict +import gc +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Tuple, +) + +from cuml.cluster import HDBSCAN as cuHDBSCAN +import cupy as cp +import cv2 +from insightface.app import FaceAnalysis +import numpy as np +import torch + + +class FaceClusterer: + @staticmethod + def extract_face_embeddings( + frame_files: List[Path], + face_app: FaceAnalysis, + ) -> List[Dict[str, Any]]: + face_data: List[Dict[str, Any]] = [] + + for frame_path in frame_files: + img = cv2.imread(str(frame_path)) # pylint: disable=no-member + if img is None: + continue + + for face_idx, face in enumerate(face_app.get(img)): + bbox = face.bbox.astype(int) + x1 = max(0, bbox[0]) + y1 = max(0, bbox[1]) + x2 = min(img.shape[1], bbox[2]) + y2 = min(img.shape[0], bbox[3]) + + if x2 <= x1 or y2 <= y1: + continue + + face_data.append({ + 'vector': face.normed_embedding, + 'frame_path': frame_path, + 'face_idx': face_idx, + }) + + return face_data + + @staticmethod + def cluster_embeddings( + face_data: List[Dict[str, Any]], + min_cluster_size: int, + min_samples: int, + ) -> np.ndarray: + vectors = np.array([fd['vector'] for fd in face_data]) + vectors_gpu = cp.asarray(vectors) + + clusterer = cuHDBSCAN( + min_cluster_size=min_cluster_size, + min_samples=min_samples, + metric='euclidean', + cluster_selection_method='eom', + ) + labels = clusterer.fit_predict(vectors_gpu) + return cp.asnumpy(labels) + + @staticmethod + def build_cluster_output( + face_data: List[Dict[str, Any]], + labels: np.ndarray, + save_noise: bool, + episode_id: str, + series_name: str, + min_cluster_size: int, + min_samples: int, + model_name: str, + total_frames: int, + ) -> Dict[str, Any]: + groups: Dict[int, List[Dict[str, Any]]] = defaultdict(list) + for face_info, label in 
zip(face_data, labels): + groups[int(label)].append(face_info) + + clusters, noise_info = FaceClusterer.__build_cluster_entries(groups, save_noise) + n_noise = len(groups.get(-1, [])) + frames_with_faces = len({fd['frame_path'] for fd in face_data}) + + return { + 'episode_id': episode_id, + 'series_name': series_name, + 'processing_params': { + 'min_cluster_size': min_cluster_size, + 'min_samples': min_samples, + 'metric': 'euclidean', + 'algorithm': 'hdbscan', + 'cluster_selection_method': 'eom', + 'model': model_name, + }, + 'statistics': { + 'total_faces_detected': len(face_data), + 'total_clusters': len(clusters), + 'noise_faces': n_noise, + 'frames_processed': total_frames, + 'frames_with_faces': frames_with_faces, + }, + 'clusters': clusters, + 'noise': noise_info if save_noise else {}, + } + + @staticmethod + def cleanup_gpu_memory() -> None: + gc.collect() + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + @staticmethod + def __build_cluster_entries( + groups: Dict[int, List[Dict[str, Any]]], + save_noise: bool, + ) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, Any]]: + clusters: Dict[str, Dict[str, Any]] = {} + noise_info: Dict[str, Any] = {} + + for cluster_id, faces in sorted(groups.items()): + frames_seen = sorted({fd['frame_path'].name for fd in faces}) + entry: Dict[str, Any] = { + 'face_count': len(faces), + 'frame_count': len(frames_seen), + 'frames': frames_seen, + 'character_name': None, + } + if cluster_id == -1: + if save_noise: + noise_info = entry + else: + clusters[f'cluster_{cluster_id}'] = entry + + return clusters, noise_info diff --git a/preprocessor/services/episodes/episode_manager.py b/preprocessor/services/episodes/episode_manager.py index 1a91334c1..44b83a43c 100644 --- a/preprocessor/services/episodes/episode_manager.py +++ b/preprocessor/services/episodes/episode_manager.py @@ -4,6 +4,7 @@ from typing import ( Any, Dict, + List, Optional, ) @@ -46,6 +47,29 @@ def get_episode_by_season_and_relative(self, season: int, 
relative_episode: int) self.__log_missing_season_warning(season, relative_episode) return self.__create_fallback_episode_info(season, relative_episode) + def get_all_episodes(self) -> List[EpisodeInfo]: + if not self.__episodes_data: + return [] + + result: List[EpisodeInfo] = [] + for season_data in self.__episodes_data.get(EpisodesDataKeys.SEASONS, []): + season = season_data.get(EpisodesDataKeys.SEASON_NUMBER, 0) + episodes = sorted( + season_data.get(EpisodesDataKeys.EPISODES, []), + key=lambda ep: ep.get(EpisodeMetadataKeys.EPISODE_NUMBER, 0), + ) + for idx, ep_data in enumerate(episodes, start=1): + result.append( + self.__create_episode_info( + season=season, + relative_episode=idx, + title=ep_data.get(EpisodeMetadataKeys.TITLE), + premiere_date=ep_data.get(EpisodeMetadataKeys.PREMIERE_DATE), + viewership=ep_data.get(EpisodeMetadataKeys.VIEWERSHIP), + ), + ) + return result + def parse_filename(self, file_path: Path) -> Optional[EpisodeInfo]: full_path_str = str(file_path) match_season_episode = re.search(r'S(\d+)[/\\]?E(\d+)', full_path_str, re.IGNORECASE) diff --git a/preprocessor/services/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py index a733aa813..df7c79167 100644 --- a/preprocessor/services/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -5,8 +5,12 @@ Tuple, ) -from hsemotion_onnx.facial_emotions import HSEmotionRecognizer +from hsemotion_onnx.facial_emotions import ( + HSEmotionRecognizer, + get_model_path, +) import numpy as np +import onnxruntime as ort from preprocessor.config.settings_instance import settings from preprocessor.services.core.logging import ErrorHandlingLogger @@ -34,12 +38,36 @@ def init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecogni try: fer = HSEmotionRecognizer(model_name=model_name) + EmotionDetector.__patch_gpu_session(fer, model_name, logger) if logger: logger.info(f'HSEmotion model loaded: {model_name}') return fer except Exception as e: raise 
RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e + @staticmethod + def __patch_gpu_session( + fer: HSEmotionRecognizer, + model_name: str, + logger: Optional[ErrorHandlingLogger], + ) -> None: + available_providers = ort.get_available_providers() + if 'CUDAExecutionProvider' not in available_providers: + if logger: + logger.warning( + 'CUDAExecutionProvider not available — HSEmotion running on CPU. ' + 'Install onnxruntime-gpu to enable GPU acceleration.', + ) + return + + model_path = get_model_path(model_name) + fer.ort_session = ort.InferenceSession( + model_path, + providers=['CUDAExecutionProvider', 'CPUExecutionProvider'], + ) + if logger: + logger.info('HSEmotion session patched to use GPU (CUDAExecutionProvider)') + @staticmethod def detect_batch( face_images: List[np.ndarray], diff --git a/preprocessor/steps/text/embeddings_step.py b/preprocessor/steps/text/embeddings_step.py index 47b15b0d5..3baf62903 100644 --- a/preprocessor/steps/text/embeddings_step.py +++ b/preprocessor/steps/text/embeddings_step.py @@ -258,12 +258,20 @@ def __find_segment_at_position( @staticmethod def __split_into_sentences(text: str) -> List[str]: normalized_text: str = re.sub(r'\.{2,}', '.', text) + normalized_text = re.sub(r'!{2,}', '!', normalized_text) + normalized_text = re.sub(r'\?{2,}', '?', normalized_text) sentences: List[str] = re.split(r'([.!?]+(?:\s+|$))', normalized_text) - result: List[str] = [] + raw: List[str] = [] for i in range(0, len(sentences) - 1, 2): s: str = (sentences[i] + sentences[i + 1]).strip() if s: - result.append(s) + raw.append(s) if len(sentences) % 2 == 1 and sentences[-1].strip(): - result.append(sentences[-1].strip()) + raw.append(sentences[-1].strip()) + result: List[str] = [] + for sentence in raw: + if len(sentence) < 30 and result: + result[-1] = result[-1] + ' ' + sentence + else: + result.append(sentence) return result diff --git a/preprocessor/steps/text/episode_name_embedding_step.py 
b/preprocessor/steps/text/episode_name_embedding_step.py new file mode 100644 index 000000000..4e700fdf0 --- /dev/null +++ b/preprocessor/steps/text/episode_name_embedding_step.py @@ -0,0 +1,165 @@ +# pylint: disable=duplicate-code +from datetime import datetime +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, +) + +from preprocessor.config.step_configs import EpisodeNameEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + + +class EpisodeNameEmbeddingStep( + PipelineStep[TranscriptionData, EmbeddingCollection, EpisodeNameEmbeddingConfig], +): + def __init__(self, config: EpisodeNameEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model = None + context.logger.info('Embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model = None + + def execute_batch( + self, + input_data: List[TranscriptionData], + context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, + input_data: TranscriptionData, + 
context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + title = input_data.episode_info.title + if not title: + context.logger.warning( + f'No title for episode name embedding in {input_data.episode_id}', + ) + return self.__build_collection(input_data, output_path, 0) + + self.__ensure_model() + context.logger.info(f'Generating episode name embedding for {input_data.episode_id}') + + embedding: List[float] = self.__model.encode_text(title) # type: ignore[assignment,union-attr] + self.__save_result(embedding, title, output_path, input_data) + + return self.__build_collection(input_data, output_path, 1) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/episode_names", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_vars(input_data), + ) + + def _load_from_cache( + self, + cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + data: Dict[str, Any] = FileOperations.load_json(cache_path) + count = 1 if data.get('title_embedding') else 0 + return self.__build_collection(input_data, cache_path, count) + + def __ensure_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def __save_result( + self, + embedding: List[float], + title: str, + output_path: Path, + input_data: TranscriptionData, + ) -> None: + episode_info = input_data.episode_info + output_data: Dict[str, Any] = { + 'generated_at': datetime.now().isoformat(), + 'processing_parameters': self.config.model_dump(), + 'episode_id': input_data.episode_id, + 'title': title, + 'title_embedding': embedding, + 'episode_metadata': { + 'season': 
episode_info.season, + 'episode_number': episode_info.relative_episode, + 'title': title, + 'premiere_date': episode_info.premiere_date, + 'series_name': episode_info.series_name, + 'viewership': episode_info.viewership, + }, + } + FileOperations.atomic_write_json(output_path, output_data) + + def __build_collection( + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='episode_name', + ) + + @staticmethod + def __create_path_vars(input_data: TranscriptionData) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } diff --git a/preprocessor/steps/text/full_episode_embedding_step.py b/preprocessor/steps/text/full_episode_embedding_step.py new file mode 100644 index 000000000..8ec29d60a --- /dev/null +++ b/preprocessor/steps/text/full_episode_embedding_step.py @@ -0,0 +1,224 @@ +# pylint: disable=duplicate-code +from datetime import datetime +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +import numpy as np + +from preprocessor.config.step_configs import FullEpisodeEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + + +class FullEpisodeEmbeddingStep( + PipelineStep[TranscriptionData, EmbeddingCollection, 
FullEpisodeEmbeddingConfig], +): + def __init__(self, config: FullEpisodeEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model = None + context.logger.info('Embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model = None + + def execute_batch( + self, + input_data: List[TranscriptionData], + context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + full_text = self.__build_full_text(input_data, context) + if not full_text: + return self.__build_collection(input_data, output_path, 0) + + self.__ensure_model() + context.logger.info(f'Generating full episode embedding for {input_data.episode_id}') + + embedding = self.__embed_full_text(full_text) + self.__save_result(embedding, full_text, output_path, input_data) + + return self.__build_collection(input_data, output_path, 1) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/full_episode", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_vars(input_data), + ) + + def _load_from_cache( + self, + 
cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + data: Dict[str, Any] = FileOperations.load_json(cache_path) + count = 1 if data.get('full_episode_embedding') else 0 + return self.__build_collection(input_data, cache_path, count) + + def __ensure_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + 1, + ) + + def __embed_full_text(self, full_text: str) -> List[float]: + if len(full_text) <= self.config.max_chars_per_chunk: + embedding: List[float] = self.__model.encode_text(full_text) # type: ignore[assignment,union-attr] + return embedding + return self.__sliding_window_embed(full_text) + + def __sliding_window_embed(self, full_text: str) -> List[float]: + chunks, weights = self.__build_chunks_and_weights(full_text) + if not self.__model: + raise RuntimeError("Embedding model not initialized") + + embeddings: List[List[float]] = self.__model.encode_text(chunks) # type: ignore[assignment] + total_weight = sum(weights) + normalized_weights = [w / total_weight for w in weights] + + dim = len(embeddings[0]) + avg: np.ndarray = np.zeros(dim, dtype=np.float64) + for emb, w in zip(embeddings, normalized_weights): + avg += np.array(emb, dtype=np.float64) * w + + norm = float(np.linalg.norm(avg)) + if norm > 0: + avg /= norm + + return avg.tolist() + + def __build_chunks_and_weights( + self, + full_text: str, + ) -> Tuple[List[str], List[float]]: + chunks: List[str] = [] + weights: List[float] = [] + step = self.config.max_chars_per_chunk - self.config.overlap_chars + pos = 0 + + while pos < len(full_text): + chunk = full_text[pos : pos + self.config.max_chars_per_chunk] + if len(chunk) >= self.config.min_chunk_length: + chunks.append(chunk) + weights.append(len(chunk) / self.config.max_chars_per_chunk) + pos += step + + return chunks, weights + + def __save_result( + self, + embedding: List[float], + full_text: str, + 
output_path: Path, + input_data: TranscriptionData, + ) -> None: + output_data: Dict[str, Any] = { + 'generated_at': datetime.now().isoformat(), + 'episode_info': { + 'season': input_data.episode_info.season, + 'episode_number': input_data.episode_info.relative_episode, + }, + 'processing_parameters': self.config.model_dump(), + 'statistics': { + 'transcript_length': len(full_text), + 'embedding_dimension': len(embedding), + }, + 'full_episode_embedding': { + 'text': full_text, + 'embedding': embedding, + 'transcript_length': len(full_text), + }, + } + FileOperations.atomic_write_json(output_path, output_data) + + def __build_collection( + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='full_episode', + ) + + @staticmethod + def __create_path_vars(input_data: TranscriptionData) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } + + @staticmethod + def __build_full_text( + input_data: TranscriptionData, + context: ExecutionContext, + ) -> str: + data: Dict[str, Any] = FileOperations.load_json(input_data.path) + segments: List[Dict[str, Any]] = data.get('segments', []) + if not segments: + context.logger.warning( + f'No text segments for full episode embedding in {input_data.episode_id}', + ) + return '' + return ' '.join(s.get('text', '') for s in segments).strip() diff --git a/preprocessor/steps/text/sound_event_embedding_step.py b/preprocessor/steps/text/sound_event_embedding_step.py new file mode 100644 index 000000000..a785c7414 --- /dev/null +++ b/preprocessor/steps/text/sound_event_embedding_step.py @@ -0,0 +1,223 @@ +# pylint: disable=duplicate-code +from pathlib import 
Path +import re +from typing import ( + Any, + Dict, + List, + Optional, + Set, +) + +from preprocessor.config.step_configs import SoundEventEmbeddingConfig +from preprocessor.core.artifacts import ( + EmbeddingCollection, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.io.files import FileOperations +from preprocessor.services.io.metadata import MetadataBuilder +from preprocessor.services.search.embedding_model import EmbeddingModelWrapper + +_SOUND_TYPE_PATTERN = re.compile(r'\(([^)]+)\)') + + +class SoundEventEmbeddingStep( + PipelineStep[TranscriptionData, EmbeddingCollection, SoundEventEmbeddingConfig], +): + def __init__(self, config: SoundEventEmbeddingConfig) -> None: + super().__init__(config) + self.__model: Optional[EmbeddingModelWrapper] = None + + @property + def supports_batch_processing(self) -> bool: + return True + + def setup_resources(self, context: ExecutionContext) -> None: + if self.__model is None: + context.logger.info(f'Loading embedding model: {self.config.model_name}') + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + self.config.batch_size, + ) + + def teardown_resources(self, context: ExecutionContext) -> None: + if self.__model: + self.__model = None + context.logger.info('Embedding model unloaded') + + def cleanup(self) -> None: + if self.__model: + self.__model = None + + def execute_batch( + self, + input_data: List[TranscriptionData], + context: ExecutionContext, + ) -> List[EmbeddingCollection]: + return self._execute_sequential(input_data, context, self.execute) + + def _process( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + output_path = self._get_cache_path(input_data, context) + + segments = self.__load_segments(input_data, context) + if not segments: + return 
self.__build_collection(input_data, output_path, 0) + + self.__ensure_model() + context.logger.info(f'Generating sound event embeddings for {input_data.episode_id}') + + results = self.__process_chunks(segments) + self.__save_results(results, output_path, input_data) + + return self.__build_collection(input_data, output_path, len(results)) + + def get_output_descriptors(self) -> List[FileOutput]: + return [ + FileOutput( + pattern="{season}/{episode}.json", + subdir="embeddings/sound_events", + min_size_bytes=1024, + ), + ] + + def _get_cache_path( + self, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + self.__create_path_vars(input_data), + ) + + def _load_from_cache( + self, + cache_path: Path, + input_data: TranscriptionData, + context: ExecutionContext, + ) -> EmbeddingCollection: + data: Dict[str, Any] = FileOperations.load_json(cache_path) + return self.__build_collection( + input_data, + cache_path, + len(data.get('sound_event_embeddings', [])), + ) + + def __ensure_model(self) -> None: + if self.__model is None: + self.__model = EmbeddingModelWrapper( + self.config.model_name, + self.config.device, + self.config.batch_size, + ) + + def __process_chunks( + self, + segments: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + chunks = self.__group_segments(segments) + if not self.__model: + raise RuntimeError("Embedding model not initialized") + + results: List[Dict[str, Any]] = [] + for i in range(0, len(chunks), self.config.batch_size): + batch_chunks = chunks[i : i + self.config.batch_size] + batch_texts = [c['text'] for c in batch_chunks] + batch_embeddings: List[List[float]] = self.__model.encode_text(batch_texts) + for chunk, embedding in zip(batch_chunks, batch_embeddings): + results.append({**chunk, 'embedding': embedding}) + + return results + + def __group_segments( + self, + segments: List[Dict[str, Any]], + ) -> List[Dict[str, Any]]: + chunks: List[Dict[str, Any]] = 
[] + step = self.config.segments_per_embedding + + for i in range(0, len(segments), step): + chunk_segs = segments[i : i + step] + if not chunk_segs: + continue + + text = ' '.join(s.get('text', '') for s in chunk_segs).strip() + if not text: + continue + + sound_types: Set[str] = set() + for seg in chunk_segs: + for match in _SOUND_TYPE_PATTERN.finditer(seg.get('text', '')): + sound_types.add(match.group(1).strip().lower()) + + chunks.append({ + 'segment_range': [i, i + len(chunk_segs) - 1], + 'text': text, + 'sound_types': sorted(sound_types), + 'start_time': chunk_segs[0].get('start', 0.0), + 'end_time': chunk_segs[-1].get('end', 0.0), + }) + + return chunks + + def __save_results( + self, + results: List[Dict[str, Any]], + output_path: Path, + input_data: TranscriptionData, + ) -> None: + output_data: Dict[str, Any] = MetadataBuilder.create_processing_metadata( + episode_info=input_data.episode_info, + processing_params=self.config.model_dump(), + statistics={ + 'total_embeddings': len(results), + 'embedding_dimension': len(results[0]['embedding']) if results else 0, + }, + results_key='sound_event_embeddings', + results_data=results, + ) + FileOperations.atomic_write_json(output_path, output_data) + + def __build_collection( + self, + input_data: TranscriptionData, + output_path: Path, + embedding_count: int, + ) -> EmbeddingCollection: + return MetadataBuilder.create_embedding_collection( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + model_name=self.config.model_name, + embedding_count=embedding_count, + embedding_type='sound_events', + ) + + @staticmethod + def __create_path_vars(input_data: TranscriptionData) -> Dict[str, str]: + return { + "season": f"S{input_data.episode_info.season:02d}", + "episode": input_data.episode_info.episode_code(), + } + + @staticmethod + def __load_segments( + input_data: TranscriptionData, + context: ExecutionContext, + ) -> List[Dict[str, Any]]: + data: Dict[str, Any] = 
FileOperations.load_json(input_data.path) + segments: List[Dict[str, Any]] = data.get('segments', []) + if not segments: + context.logger.warning( + f'No sound event segments for embedding in {input_data.episode_id}', + ) + return segments diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py index 71160d70d..5b2712bcb 100644 --- a/preprocessor/steps/vision/face_clustering_step.py +++ b/preprocessor/steps/vision/face_clustering_step.py @@ -1,10 +1,15 @@ # pylint: disable=duplicate-code from pathlib import Path from typing import ( + Any, Dict, List, + Optional, ) +from insightface.app import FaceAnalysis + +from preprocessor.config.settings_instance import settings from preprocessor.config.step_configs import FaceClusteringConfig from preprocessor.core.artifacts import ( ClusterData, @@ -16,72 +21,128 @@ JsonFileOutput, OutputDescriptor, ) +from preprocessor.services.characters import FaceDetector +from preprocessor.services.characters.face_clusterer import FaceClusterer +from preprocessor.services.io.files import FileOperations class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): def __init__(self, config: FaceClusteringConfig) -> None: super().__init__(config) - self.__model = None + self.__face_app: Optional[FaceAnalysis] = None @property def supports_batch_processing(self) -> bool: return True def setup_resources(self, context: ExecutionContext) -> None: - if self.__model is None: - self.__load_model(context) + if self.__face_app is None: + context.logger.info('Loading Face Clustering model...') + self.__face_app = FaceDetector.init() def teardown_resources(self, context: ExecutionContext) -> None: - if self.__model: - self.__unload_model(context) + if self.__face_app: + context.logger.info('Face Clustering model unloaded') + self.__face_app = None + FaceClusterer.cleanup_gpu_memory() def cleanup(self) -> None: - self.__model = None + self.__face_app = None def 
execute_batch( - self, input_data: List[FrameCollection], context: ExecutionContext, + self, input_data: List[FrameCollection], context: ExecutionContext, ) -> List[ClusterData]: return self._execute_with_threadpool( input_data, context, self.config.max_parallel_episodes, self.execute, ) def _process( - self, input_data: FrameCollection, context: ExecutionContext, + self, input_data: FrameCollection, context: ExecutionContext, ) -> ClusterData: output_path = self._get_cache_path(input_data, context) - return self.__construct_cluster_data(input_data, output_path) + face_app = self.__face_app + + frame_files = self.__extract_frame_files(input_data) + if not frame_files: + context.logger.warning(f'No frame files found in {input_data.directory}') + self.__write_empty_output(output_path, input_data, context) + return self.__build_result(input_data, output_path) + + face_data = FaceClusterer.extract_face_embeddings(frame_files, face_app) + if not face_data: + context.logger.warning(f'No faces detected in episode {input_data.episode_id}') + self.__write_empty_output(output_path, input_data, context) + return self.__build_result(input_data, output_path) + + clustering = settings.face_clustering + labels = FaceClusterer.cluster_embeddings( + face_data, clustering.min_cluster_size, clustering.min_samples, + ) + + output_data = FaceClusterer.build_cluster_output( + face_data=face_data, + labels=labels, + save_noise=clustering.save_noise, + episode_id=input_data.episode_id, + series_name=context.series_name, + min_cluster_size=clustering.min_cluster_size, + min_samples=clustering.min_samples, + model_name=settings.face_recognition.model_name, + total_frames=len(frame_files), + ) + FileOperations.atomic_write_json(output_path, output_data) + + return self.__build_result(input_data, output_path) def get_output_descriptors(self) -> List[OutputDescriptor]: return [ JsonFileOutput( - subdir="clusters/faces", - pattern="{season}/{episode}.json", + subdir='clusters/faces', + 
pattern='{season}/{episode}.json', min_size_bytes=10, ), ] def _get_cache_path( - self, input_data: FrameCollection, context: ExecutionContext, + self, input_data: FrameCollection, context: ExecutionContext, ) -> Path: return self._resolve_output_path( - 0, - context, - self.__create_path_variables(input_data), + 0, context, self.__create_path_variables(input_data), ) def _load_from_cache( - self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, + self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, ) -> ClusterData: - return self.__construct_cluster_data(input_data, cache_path) + return self.__build_result(input_data, cache_path) + + def __write_empty_output( + self, + output_path: Path, + input_data: FrameCollection, + context: ExecutionContext, + ) -> None: + empty_data: Dict[str, Any] = { + 'episode_id': input_data.episode_id, + 'series_name': context.series_name, + 'statistics': { + 'total_faces_detected': 0, + 'total_clusters': 0, + 'noise_faces': 0, + 'frames_processed': 0, + 'frames_with_faces': 0, + }, + 'clusters': {}, + 'noise': {}, + } + FileOperations.atomic_write_json(output_path, empty_data) @staticmethod - def __load_model(context: ExecutionContext) -> None: - context.logger.info('Loading Face Clustering model...') - # Model loading logic implementation - - def __unload_model(self, context: ExecutionContext) -> None: - context.logger.info('Face Clustering model unloaded') - self.__model = None + def __build_result(input_data: FrameCollection, output_path: Path) -> ClusterData: + return ClusterData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=output_path, + ) @staticmethod def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: @@ -91,11 +152,8 @@ def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: } @staticmethod - def __construct_cluster_data( - input_data: FrameCollection, output_path: Path, - ) -> ClusterData: - 
return ClusterData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - ) + def __extract_frame_files(input_data: FrameCollection) -> List[Path]: + return sorted([ + f for f in input_data.directory.glob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) From 8fa2611daeea74f73ea2b09a963fd7dc183f0fc0 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 24 Feb 2026 09:39:05 +0100 Subject: [PATCH 58/89] Replace Qwen-VL with vLLM embedding backend Swap out the prior AutoModelForVision2Seq/AutoProcessor based pipeline for a vLLM-based embedding backend. EmbeddingService now uses vllm.LLM to perform both image and text embedding requests (images are opened with PIL and passed as multi_modal_data, texts as prompt entries), removes the old processor/model and manual tensor handling, and moves resource configuration into vLLM init using settings. Also updates EmbeddingModelWrapper to stop forwarding a device argument. Keeps CUDA requirement and updates cleanup/embedding flows accordingly. 
--- .../search/clients/embedding_service.py | 125 +++++------------- .../services/search/embedding_model.py | 4 +- 2 files changed, 33 insertions(+), 96 deletions(-) diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index fd46357f4..7bed946be 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -5,130 +5,67 @@ Dict, List, Optional, - Tuple, Union, ) +from PIL import Image import click -from qwen_vl_utils import process_vision_info import torch -from transformers import ( - AutoModelForVision2Seq, - AutoProcessor, -) +from vllm import LLM from preprocessor.config.settings_instance import settings class EmbeddingService: - def __init__(self, model_name: Optional[str] = None, device: str = 'cuda') -> None: + def __init__(self, model_name: Optional[str] = None) -> None: self.__model_name: str = model_name or settings.embedding_model.model_name - self.__device = device - self.__model: Optional[AutoModelForVision2Seq] = None - self.__processor: Optional[AutoProcessor] = None + self.__llm: Optional[LLM] = None def ensure_loaded(self) -> None: - if self.__model is None: + if self.__llm is None: self.__load_resources() def cleanup(self) -> None: - if self.__model is not None: - del self.__model - del self.__processor - self.__model = self.__processor = None + if self.__llm is not None: + del self.__llm + self.__llm = None gc.collect() if torch.cuda.is_available(): torch.cuda.empty_cache() def get_image_embeddings_batch(self, image_paths: List[Union[str, Path]]) -> List[List[float]]: - model, processor, device = self.__get_model() - - messages_batch = [ - [{ - 'role': 'user', 'content': [ - {'type': 'image', 'image': str(path)}, - {'type': 'text', 'text': 'Describe this image.'}, - ], - }] + placeholder = settings.embedding_model.image_placeholder + inputs: List[Dict[str, Any]] = [ + { + 'prompt': 
f'{placeholder}\nDescribe this image.', + 'multi_modal_data': {'image': Image.open(str(path)).convert('RGB')}, + } for path in image_paths ] - - all_image_inputs: List[Any] = [] - prompts: List[str] = [] - for messages in messages_batch: - image_inputs, _ = process_vision_info(messages) - all_image_inputs.extend(image_inputs) - prompts.append( - processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True), - ) - - inputs = processor( - text=prompts, - images=all_image_inputs, - padding=True, - return_tensors='pt', - ).to(device) - - return self.__compute_batch_embeddings(model, inputs, len(image_paths)) + return self.__embed(inputs) def get_text_embeddings_batch(self, texts: List[str]) -> List[List[float]]: - model, processor, device = self.__get_model() - - messages_batch = [ - [{'role': 'user', 'content': [{'type': 'text', 'text': text}]}] - for text in texts - ] - prompts: List[str] = [ - processor.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True) - for msgs in messages_batch - ] - - inputs = processor( - text=prompts, - padding=True, - return_tensors='pt', - ).to(device) - - return self.__compute_batch_embeddings(model, inputs, len(texts)) - - @staticmethod - def __compute_batch_embeddings( - model: Any, - inputs: Dict[str, Any], - count: int, - ) -> List[List[float]]: - with torch.no_grad(): - output = model(**inputs, output_hidden_states=True) - hidden = output.hidden_states[-1] - - attention_mask = inputs.get('attention_mask') - if attention_mask is not None: - last_positions = attention_mask.sum(dim=1) - 1 - embeddings = torch.stack([ - hidden[i, last_positions[i], :] for i in range(count) - ]) - else: - embeddings = hidden[:, -1, :] - - embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=-1) - - result = [emb.float().cpu().numpy().tolist() for emb in embeddings] - gc.collect() - if torch.cuda.is_available(): - torch.cuda.empty_cache() - return result + inputs: List[Dict[str, Any]] = [{'prompt': text} 
for text in texts] + return self.__embed(inputs) - def __get_model(self) -> Tuple[AutoModelForVision2Seq, AutoProcessor, str]: - if self.__model is None: + def __embed(self, inputs: List[Dict[str, Any]]) -> List[List[float]]: + if self.__llm is None: self.__load_resources() - return self.__model, self.__processor, self.__device + outputs = self.__llm.encode(inputs) # type: ignore[union-attr] + return [output.outputs.embedding for output in outputs] def __load_resources(self) -> None: - click.echo('Loading Qwen-VL embedding model...', err=True) + click.echo('Loading vLLM embedding model...', err=True) if not torch.cuda.is_available(): raise RuntimeError('CUDA required for multimodal embeddings.') - self.__model = AutoModelForVision2Seq.from_pretrained( - self.__model_name, dtype=torch.bfloat16, device_map='auto', + em = settings.embedding_model + self.__llm = LLM( + model=self.__model_name, + max_model_len=em.max_model_len, + gpu_memory_utilization=em.gpu_memory_utilization, + enable_chunked_prefill=em.enable_chunked_prefill, + enforce_eager=em.enforce_eager, + max_num_batched_tokens=em.max_num_batched_tokens, + tensor_parallel_size=em.tensor_parallel_size, ) - self.__processor = AutoProcessor.from_pretrained(self.__model_name) diff --git a/preprocessor/services/search/embedding_model.py b/preprocessor/services/search/embedding_model.py index 7a788c17d..d4dbfdfe3 100644 --- a/preprocessor/services/search/embedding_model.py +++ b/preprocessor/services/search/embedding_model.py @@ -12,10 +12,10 @@ class EmbeddingModelWrapper: def __init__( self, model_name: str, - device: str = 'cuda', + _device: str = 'cuda', _batch_size: int = 8, ) -> None: - self.__service = EmbeddingService(model_name=model_name, device=device) + self.__service = EmbeddingService(model_name=model_name) def load_model(self) -> None: self.__service.ensure_loaded() From ed5abe6294368133e4459929a646977c8e76b0c3 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: 
Tue, 24 Feb 2026 09:54:54 +0100 Subject: [PATCH 59/89] Use pooling runner and allow remote code in LLM Update EmbeddingService LLM construction to set runner='pooling', trust_remote_code=True and disable_log_stats=True. This configures the model to use a pooling runner, permit remote model code execution, and suppress logging of stats to reduce noise during embedding operations (file: preprocessor/services/search/clients/embedding_service.py). --- preprocessor/services/search/clients/embedding_service.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index 7bed946be..06ba3aad1 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -62,10 +62,13 @@ def __load_resources(self) -> None: em = settings.embedding_model self.__llm = LLM( model=self.__model_name, + runner="pooling", + trust_remote_code=True, max_model_len=em.max_model_len, gpu_memory_utilization=em.gpu_memory_utilization, enable_chunked_prefill=em.enable_chunked_prefill, enforce_eager=em.enforce_eager, max_num_batched_tokens=em.max_num_batched_tokens, tensor_parallel_size=em.tensor_parallel_size, + disable_log_stats=True, ) From 3a4261c38693bb2ea1e5164daad005a09a0d97b4 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 24 Feb 2026 12:24:10 +0100 Subject: [PATCH 60/89] Use embed() instead of encode() for embeddings Replace call to __llm.encode with __llm.embed to match the updated LLM client API and correctly retrieve embedding vectors. The method still extracts embedding via output.outputs.embedding and retains the type ignore for union-attr. 
--- preprocessor/services/search/clients/embedding_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/services/search/clients/embedding_service.py b/preprocessor/services/search/clients/embedding_service.py index 06ba3aad1..3791291d4 100644 --- a/preprocessor/services/search/clients/embedding_service.py +++ b/preprocessor/services/search/clients/embedding_service.py @@ -51,7 +51,7 @@ def get_text_embeddings_batch(self, texts: List[str]) -> List[List[float]]: def __embed(self, inputs: List[Dict[str, Any]]) -> List[List[float]]: if self.__llm is None: self.__load_resources() - outputs = self.__llm.encode(inputs) # type: ignore[union-attr] + outputs = self.__llm.embed(inputs) # type: ignore[union-attr] return [output.outputs.embedding for output in outputs] def __load_resources(self) -> None: From 0b00a3e60eb09f29f4c9aba77dc6066db5ac1011 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 27 Feb 2026 14:49:53 +0100 Subject: [PATCH 61/89] Split transcriptions, document outputs & archives Reorganize pipeline outputs and document generation: transcriptions are now separated into transcriptions/raw, transcriptions/clean and transcriptions/sound_events and use episode_num in paths; text embeddings and other outputs moved under embeddings/* and elastic_documents/* (multiple JSONL files defined by ELASTIC_DOC_TYPES). Refactor DocumentGeneratorStep to produce per-type JSONL files, load optional inputs, enrich documents with metadata/scene/character/object info, and write NDJSON via temp files. Add ArchiveGenerationStep logic to collect elastic documents, create ZIP archives, and support allow_partial in ArchiveConfig. 
Update defaults (remove generate_segments default), adjust pipeline_factory outputs, transcription import/cache paths, segment filter/step subdir handling, validation behavior (skip missing seasons instead of failing, and pass explicit season to Validator), and small cleanup (face_clusterer lint fix). These changes centralize output locations and enable modular elastic-document generation and archiving. --- preprocessor/app/pipeline_factory.py | 17 +- preprocessor/config/constants.py | 12 + preprocessor/config/step_configs.py | 2 +- preprocessor/config/step_defaults.py | 2 +- preprocessor/core/state_reconstruction.py | 1 + .../services/characters/face_clusterer.py | 2 +- preprocessor/services/text/import_step.py | 2 +- preprocessor/services/validation/validator.py | 6 +- preprocessor/steps/packaging/archives_step.py | 76 ++- .../steps/search/document_generation_step.py | 584 +++++++++++++----- .../steps/text/segment_filter_step.py | 7 +- preprocessor/steps/text/sound_events_step.py | 4 + preprocessor/steps/text/text_cleaning_step.py | 4 + preprocessor/steps/text/transcription_step.py | 14 +- .../steps/validation/validator_step.py | 15 +- 15 files changed, 569 insertions(+), 179 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 341bad3dd..f083e0121 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -235,7 +235,8 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t description=f"Audio transcription using {series_config.processing.transcription.mode}", produces=[ JsonFileOutput( - pattern="{season}/{episode}/{episode}.json", + pattern="{season}/{episode_num}/{episode}.json", + subdir="transcriptions/raw", min_size_bytes=50, ), ], @@ -255,6 +256,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[ JsonFileOutput( pattern="{season}/{episode}.json", + subdir="transcriptions/clean", 
min_size_bytes=10, ), ], @@ -269,6 +271,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[ JsonFileOutput( pattern="{season}/{episode}.json", + subdir="transcriptions/sound_events", min_size_bytes=10, ), ], @@ -312,11 +315,12 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t description="Generates text embeddings using Qwen3-VL-Embedding", produces=[ FileOutput( - pattern="{season}/{episode}.npy", + pattern="{season}/{episode}.json", + subdir="embeddings/text", min_size_bytes=1024, ), ], - needs=[text_stats], + needs=[text_cleaning], config=TextEmbeddingConfig( model_name="Qwen/Qwen3-VL-Embedding-8B", batch_size=8, @@ -478,8 +482,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t description="Combines all data into Elasticsearch documents", produces=[ FileOutput( - pattern="{season}/{episode}.ndjson", - min_size_bytes=100, + pattern="{season}/{episode}_text_segments.jsonl", + subdir="elastic_documents/text_segments", + min_size_bytes=10, ), ], needs=[ @@ -490,7 +495,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t face_clusters, object_detections, ], - config=DocumentGenerationConfig(generate_segments=True), + config=DocumentGenerationConfig(), ) episode_archives = StepBuilder( diff --git a/preprocessor/config/constants.py b/preprocessor/config/constants.py index 7a3316ce1..4db6eb47c 100644 --- a/preprocessor/config/constants.py +++ b/preprocessor/config/constants.py @@ -57,3 +57,15 @@ class EpisodeMetadataKeys: class FfprobeKeys: FORMAT = 'format' STREAMS = 'streams' + + +ELASTIC_DOC_TYPES = [ + ("text_segments", "text_segments"), + ("sound_events", "sound_events"), + ("text_embeddings", "text_embeddings"), + ("video_frames", "video_frames"), + ("episode_names", "episode_name"), + ("text_statistics", "text_statistics"), + ("full_episode_embeddings", "full_episode_embedding"), + ("sound_event_embeddings", 
"sound_event_embeddings"), +] diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index fdb71dc17..2d1b0df0f 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -149,7 +149,6 @@ class SoundSeparationConfig(BaseModel): class DocumentGenerationConfig(BaseModel): - generate_segments: bool = True max_parallel_episodes: int = Field(default=8, ge=1, le=16) @@ -200,6 +199,7 @@ class ObjectDetectionConfig(BaseModel): class ArchiveConfig(BaseModel): + allow_partial: bool = False max_parallel_episodes: int = Field(default=4, ge=1, le=8) diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index 36000a766..3cb41b2cf 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -61,7 +61,7 @@ def get_configs(series_name: str) -> Dict[str, object]: 'emotion_detection': EmotionDetectionConfig(), 'face_clustering': FaceClusteringConfig(), 'object_detection': ObjectDetectionConfig(), - 'generate_elastic_documents': DocumentGenerationConfig(generate_segments=True), + 'generate_elastic_documents': DocumentGenerationConfig(), 'generate_archives': ArchiveConfig(), 'index': ElasticsearchConfig( index_name=f'{series_name}_clips', diff --git a/preprocessor/core/state_reconstruction.py b/preprocessor/core/state_reconstruction.py index 2f25e8caa..f87b9569d 100644 --- a/preprocessor/core/state_reconstruction.py +++ b/preprocessor/core/state_reconstruction.py @@ -50,6 +50,7 @@ def scan_filesystem( context_vars = { 'season': episode_info.season_code(), 'episode': episode_info.episode_code(), + 'episode_num': episode_info.episode_num(), 'series_name': series_name, } diff --git a/preprocessor/services/characters/face_clusterer.py b/preprocessor/services/characters/face_clusterer.py index 610bd564d..5bcc217e9 100644 --- a/preprocessor/services/characters/face_clusterer.py +++ b/preprocessor/services/characters/face_clusterer.py @@ -25,7 +25,7 @@ def 
extract_face_embeddings( face_data: List[Dict[str, Any]] = [] for frame_path in frame_files: - img = cv2.imread(str(frame_path)) # pylint: disable=no-member + img = cv2.imread(str(frame_path)) if img is None: continue diff --git a/preprocessor/services/text/import_step.py b/preprocessor/services/text/import_step.py index ed267e991..c9b36687d 100644 --- a/preprocessor/services/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -107,7 +107,7 @@ def __should_skip_import(self, output_path: Path, episode_id: str, context: Exec def __get_output_path(self, episode_info: EpisodeInfo, context: ExecutionContext) -> Path: filename = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') - return context.get_output_path(episode_info, 'transcriptions', filename) + return context.get_output_path(episode_info, 'transcriptions/raw', filename) @staticmethod def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index c08419c47..2abd6e003 100644 --- a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -38,10 +38,10 @@ def __init__( self.__validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports def validate(self) -> int: - transcriptions_path = self.__base_output_dir / 'transcriptions' / self.__season + transcriptions_path = self.__base_output_dir / 'transcriptions' / 'raw' / self.__season if not transcriptions_path.exists(): - console.print(f'[red]Season directory not found: {transcriptions_path}[/red]') - return 1 + console.print(f'[yellow]Season directory not found, skipping: {transcriptions_path}[/yellow]') + return 0 console.print(f'[bold cyan]Validating season {self.__season}...[/bold cyan]') diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index 
ed475798b..be66018df 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -1,6 +1,11 @@ from pathlib import Path -from typing import List +from typing import ( + Dict, + List, +) +import zipfile +from preprocessor.config.constants import ELASTIC_DOC_TYPES from preprocessor.config.step_configs import ArchiveConfig from preprocessor.core.artifacts import ( ArchiveArtifact, @@ -9,6 +14,7 @@ from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.core.output_descriptors import FileOutput +from preprocessor.services.episodes.types import EpisodeInfo class ArchiveGenerationStep( @@ -28,9 +34,28 @@ def execute_batch( def _process( self, input_data: ProcessedEpisode, context: ExecutionContext, ) -> ArchiveArtifact: + episode_info = input_data.episode_info output_path = self._get_cache_path(input_data, context) - # Archive generation logic would go here - return self.__construct_archive_artifact(input_data, output_path) + + episode_files = self.__collect_episode_files(context, episode_info) + + expected = len(ELASTIC_DOC_TYPES) + found = len(episode_files) + + if found == 0: + context.logger.warning(f"No elastic documents found for {input_data.episode_id}") + return self.__build_artifact(input_data, output_path) + + if found < expected and not self.config.allow_partial: + context.logger.warning( + f"Skipping {input_data.episode_id}: incomplete documents " + f"({found}/{expected}). 
Set allow_partial=True to archive anyway.", + ) + return self.__build_artifact(input_data, output_path) + + self.__create_archive(output_path, episode_files, context) + + return self.__build_artifact(input_data, output_path) def get_output_descriptors(self) -> List[FileOutput]: return [ @@ -52,10 +77,51 @@ def _load_from_cache( input_data: ProcessedEpisode, context: ExecutionContext, ) -> ArchiveArtifact: - return self.__construct_archive_artifact(input_data, cache_path) + return self.__build_artifact(input_data, cache_path) + + @staticmethod + def __collect_episode_files( + context: ExecutionContext, episode_info: EpisodeInfo, + ) -> Dict[str, Path]: + elastic_dir = context.base_output_dir / "elastic_documents" + season = episode_info.season_code() + episode = episode_info.episode_code() + + collected: Dict[str, Path] = {} + for folder, suffix in ELASTIC_DOC_TYPES: + file_path = elastic_dir / folder / season / f"{episode}_{suffix}.jsonl" + if file_path.exists(): + collected[folder] = file_path + return collected + + @staticmethod + def __create_archive( + archive_path: Path, + files: Dict[str, Path], + context: ExecutionContext, + ) -> None: + archive_path.parent.mkdir(parents=True, exist_ok=True) + temp_path = archive_path.with_suffix(archive_path.suffix + ".tmp") + + try: + with zipfile.ZipFile(temp_path, "w", zipfile.ZIP_DEFLATED) as zipf: + for file_path in files.values(): + zipf.write(file_path, arcname=file_path.name) + + temp_path.replace(archive_path) + + size_mb = archive_path.stat().st_size / (1024 * 1024) + context.logger.info( + f"Created archive: {archive_path.name} ({len(files)} files, {size_mb:.2f} MB)", + ) + + except Exception as e: + if temp_path.exists(): + temp_path.unlink() + raise RuntimeError(f"Failed to create archive {archive_path}: {e}") from e @staticmethod - def __construct_archive_artifact( + def __build_artifact( input_data: ProcessedEpisode, output_path: Path, ) -> ArchiveArtifact: return ArchiveArtifact( diff --git 
a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index f5adf6395..88de919c4 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -4,218 +4,502 @@ Any, Dict, List, - Tuple, + Optional, ) +from preprocessor.config.constants import ELASTIC_DOC_TYPES from preprocessor.config.step_configs import DocumentGenerationConfig from preprocessor.core.artifacts import ( - Artifact, ElasticDocuments, + EmbeddingCollection, ) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext from preprocessor.core.output_descriptors import FileOutput from preprocessor.core.temp_files import StepTempFile +from preprocessor.services.episodes.types import EpisodeInfo from preprocessor.services.io.files import FileOperations -class DocumentGeneratorStep(PipelineStep[Artifact, ElasticDocuments, DocumentGenerationConfig]): +class DocumentGeneratorStep( + PipelineStep[EmbeddingCollection, ElasticDocuments, DocumentGenerationConfig], +): @property def supports_batch_processing(self) -> bool: return True def execute_batch( - self, input_data: List[Artifact], context: ExecutionContext, + self, input_data: List[EmbeddingCollection], context: ExecutionContext, ) -> List[ElasticDocuments]: return self._execute_with_threadpool( input_data, context, self.config.max_parallel_episodes, self.execute, ) def _process( - self, input_data: Artifact, context: ExecutionContext, + self, input_data: EmbeddingCollection, context: ExecutionContext, ) -> ElasticDocuments: - episode_info, episode_id = self.__extract_episode_info(input_data) - output_path = self._get_cache_path(input_data, context) + episode_info = input_data.episode_info + episode_id = input_data.episode_id + episode_metadata = self.__build_episode_metadata(episode_info, context) + video_path = self.__build_video_path(episode_info, context) - data = 
self.__gather_input_data(episode_info, context) - total_docs = self.__generate_documents( - data, output_path, episode_info, context, - ) + scene_data = self.__load_optional(context, "scene_detections", episode_info) + char_data = self.__load_optional(context, "detections/characters", episode_info) + emotion_data = self.__load_optional(context, "detections/emotions", episode_info) + object_data = self.__load_optional(context, "detections/objects", episode_info) + + char_by_frame = self.__index_characters_by_frame(char_data, emotion_data) + objects_by_frame = self.__index_objects_by_frame(object_data) + + total_docs = sum([ + self.__write_text_segments(context, episode_info, episode_id, episode_metadata, video_path, scene_data), + self.__write_sound_events(context, episode_info, episode_id, episode_metadata, video_path, scene_data), + self.__write_text_embeddings(context, episode_info, episode_id, episode_metadata, video_path), + self.__write_video_frames(context, episode_info, episode_id, episode_metadata, video_path, scene_data, char_by_frame, objects_by_frame), + self.__write_episode_name(context, episode_info, episode_id, episode_metadata, video_path), + self.__write_text_statistics(context, episode_info, episode_id, episode_metadata, video_path), + self.__write_full_episode_embedding(context, episode_info, episode_id, episode_metadata, video_path), + self.__write_sound_event_embeddings(context, episode_info, episode_id, episode_metadata, video_path), + ]) - return self.__construct_elastic_documents( - episode_id, episode_info, output_path, total_docs, + context.logger.info(f"Generated {total_docs} documents for {episode_id}") + + return ElasticDocuments( + episode_id=episode_id, + episode_info=episode_info, + path=self._get_cache_path(input_data, context), + document_count=total_docs, ) def get_output_descriptors(self) -> List[FileOutput]: return [ FileOutput( - pattern="{season}/{episode}.ndjson", - subdir="elastic_documents", - min_size_bytes=100, - ), + 
pattern=f"{{season}}/{{episode}}_{suffix}.jsonl", + subdir=f"elastic_documents/{folder}", + min_size_bytes=10, + ) + for folder, suffix in ELASTIC_DOC_TYPES ] def _get_cache_path( - self, input_data: Artifact, context: ExecutionContext, + self, input_data: EmbeddingCollection, context: ExecutionContext, ) -> Path: - episode_info, _ = self.__extract_episode_info(input_data) return self._resolve_output_path( - 0, - context, - { - 'season': episode_info.season_code(), - 'episode': episode_info.episode_code(), - }, + 0, context, self.__path_vars(input_data.episode_info), ) def _load_from_cache( - self, cache_path: Path, input_data: Artifact, context: ExecutionContext, + self, cache_path: Path, input_data: EmbeddingCollection, context: ExecutionContext, ) -> ElasticDocuments: - episode_info, episode_id = self.__extract_episode_info(input_data) - return self.__construct_elastic_documents( - episode_id, episode_info, cache_path, 0, + return ElasticDocuments( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + document_count=0, + ) + + @staticmethod + def __path_vars(episode_info: EpisodeInfo) -> Dict[str, str]: + return { + "season": episode_info.season_code(), + "episode": episode_info.episode_code(), + } + + @staticmethod + def __input_path( + context: ExecutionContext, subdir: str, episode_info: EpisodeInfo, + ) -> Path: + return ( + context.base_output_dir + / subdir + / episode_info.season_code() + / f"{episode_info.episode_code()}.json" + ) + + def __output_path( + self, context: ExecutionContext, episode_info: EpisodeInfo, descriptor_index: int, + ) -> Path: + return self._resolve_output_path( + descriptor_index, context, self.__path_vars(episode_info), ) - def __generate_documents( + def __load_optional( + self, context: ExecutionContext, subdir: str, episode_info: EpisodeInfo, + ) -> Optional[Dict[str, Any]]: + path = self.__input_path(context, subdir, episode_info) + return FileOperations.load_json(path) if 
path.exists() else None + + @staticmethod + def __build_episode_metadata( + episode_info: EpisodeInfo, context: ExecutionContext, + ) -> Dict[str, Any]: + return { + "season": episode_info.season, + "episode_number": episode_info.relative_episode, + "title": episode_info.title, + "premiere_date": episode_info.premiere_date, + "series_name": context.series_name, + "viewership": episode_info.viewership, + } + + @staticmethod + def __build_video_path(episode_info: EpisodeInfo, context: ExecutionContext) -> str: + filename = f"{context.series_name}_{episode_info.episode_code()}.mp4" + return f"bot/{context.series_name.upper()}-WIDEO/{episode_info.season_code()}/{filename}" + + @staticmethod + def __find_scene( + timestamp: float, scene_data: Optional[Dict[str, Any]], + ) -> Optional[Dict[str, Any]]: + if not scene_data: + return None + for scene in scene_data.get("scenes", []): + start = scene["start"]["seconds"] + end = scene["end"]["seconds"] + if start is None or end is None: + continue + if start <= timestamp < end: + return { + "scene_number": scene["scene_number"], + "scene_start_time": start, + "scene_end_time": end, + "scene_start_frame": scene["start"]["frame"], + "scene_end_frame": scene["end"]["frame"], + } + return None + + @staticmethod + def __index_characters_by_frame( + char_data: Optional[Dict[str, Any]], + emotion_data: Optional[Dict[str, Any]], + ) -> Dict[str, List[Dict[str, Any]]]: + if not char_data: + return {} + + emotion_by_frame: Dict[str, Dict[str, Dict[str, Any]]] = {} + if emotion_data: + for det in emotion_data.get("detections", []): + frame = det["frame"] + emotion_by_frame[frame] = { + face["name"]: face.get("emotion") + for face in det.get("faces", []) + if face.get("emotion") + } + + result: Dict[str, List[Dict[str, Any]]] = {} + for det in char_data.get("detections", []): + frame = det["frame"] + faces = [] + for face in det.get("faces", []): + name = face["name"] + entry: Dict[str, Any] = {"name": name, "confidence": 
face.get("confidence")} + emotion = emotion_by_frame.get(frame, {}).get(name) + if emotion: + entry["emotion"] = { + "label": emotion["label"], + "confidence": emotion["confidence"], + } + faces.append(entry) + if faces: + result[frame] = faces + return result + + @staticmethod + def __index_objects_by_frame( + object_data: Optional[Dict[str, Any]], + ) -> Dict[str, List[Dict[str, Any]]]: + if not object_data: + return {} + result: Dict[str, List[Dict[str, Any]]] = {} + for det in object_data.get("detections", []): + frame = det["frame"] + counts: Dict[str, int] = {} + for obj in det.get("objects", []): + cls = obj["class_name"] + counts[cls] = counts.get(cls, 0) + 1 + if counts: + result[frame] = [{"class": k, "count": v} for k, v in counts.items()] + return result + + @staticmethod + def __write_ndjson(output_path: Path, docs: List[Dict[str, Any]]) -> int: + if not docs: + return 0 + with StepTempFile(output_path) as tmp: + with open(tmp, "w", encoding="utf-8") as f: + for doc in docs: + f.write(json.dumps(doc, ensure_ascii=False) + "\n") + return len(docs) + + def __write_text_segments( self, - data: Dict[str, Any], - output_path: Path, - episode_info: Any, context: ExecutionContext, + episode_info: EpisodeInfo, + episode_id: str, + episode_metadata: Dict[str, Any], + video_path: str, + scene_data: Optional[Dict[str, Any]], ) -> int: - total_docs = 0 - if self.config.generate_segments and 'transcription' in data: - total_docs += self.__generate_segments_jsonl( - data, output_path, episode_info, context, - ) - return total_docs + clean_data = self.__load_optional(context, "transcriptions/clean", episode_info) + if not clean_data: + return 0 + + docs = [] + for i, seg in enumerate(clean_data.get("segments", [])): + text = seg.get("text", "").strip() + if not text: + continue + words = seg.get("words", []) + start = (words[0].get("start") or seg.get("start", 0.0)) if words else seg.get("start", 0.0) + end = (words[-1].get("end") or seg.get("end", 0.0)) if words 
else seg.get("end", 0.0) + speaker = (words[0].get("speaker_id") or seg.get("speaker", "unknown")) if words else seg.get("speaker", "unknown") + doc: Dict[str, Any] = { + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "segment_id": i, + "text": text, + "start_time": start, + "end_time": end, + "speaker": speaker, + "video_path": video_path, + } + scene_info = self.__find_scene(start, scene_data) + if scene_info: + doc["scene_info"] = scene_info + docs.append(doc) - def __generate_segments_jsonl( + return self.__write_ndjson(self.__output_path(context, episode_info, 0), docs) + + def __write_sound_events( self, - data: Dict[str, Any], - output_path: Path, - episode_info: Any, context: ExecutionContext, + episode_info: EpisodeInfo, + episode_id: str, + episode_metadata: Dict[str, Any], + video_path: str, + scene_data: Optional[Dict[str, Any]], ) -> int: - segments = data['transcription'].get('segments', []) - episode_metadata = self.__build_episode_metadata(episode_info, context) - video_bot_path = self.__build_video_bot_path(episode_info, context) - - return self.__write_segments_to_jsonl( - segments, - output_path, - episode_info, - episode_metadata, - video_bot_path, - ) + sound_data = self.__load_optional(context, "transcriptions/sound_events", episode_info) + if not sound_data: + return 0 - def __gather_input_data( - self, episode_info: Any, context: ExecutionContext, - ) -> Dict[str, Any]: - data: Dict[str, Any] = {} + docs = [] + for i, seg in enumerate(sound_data.get("segments", [])): + if "text" not in seg: + continue + words = seg.get("words", []) + start = (words[0].get("start") or seg.get("start", 0.0)) if words else seg.get("start", 0.0) + end = (words[-1].get("end") or seg.get("end", 0.0)) if words else seg.get("end", 0.0) + doc: Dict[str, Any] = { + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "segment_id": i, + "text": seg.get("text", ""), + "sound_type": seg.get("sound_type", "sound"), + "start_time": 
start, + "end_time": end, + "video_path": video_path, + } + scene_info = self.__find_scene(start, scene_data) + if scene_info: + doc["scene_info"] = scene_info + docs.append(doc) - clean_path = self.__resolve_input_path( - episode_info, - context, - 'transcriptions/clean', - '_clean_transcription.json', - ) - if clean_path.exists(): - data['transcription'] = FileOperations.load_json(clean_path) - - text_emb_path = self.__resolve_input_path( - episode_info, - context, - 'embeddings', - '_embeddings_text.json', - ) - if text_emb_path.exists(): - data['text_embeddings'] = FileOperations.load_json(text_emb_path) - - scene_path = self.__resolve_input_path( - episode_info, - context, - 'scene_timestamps', - '_scenes.json', - ) - if scene_path.exists(): - data['scenes'] = FileOperations.load_json(scene_path) + return self.__write_ndjson(self.__output_path(context, episode_info, 1), docs) - return data + def __write_text_embeddings( + self, + context: ExecutionContext, + episode_info: EpisodeInfo, + episode_id: str, + episode_metadata: Dict[str, Any], + video_path: str, + ) -> int: + emb_data = self.__load_optional(context, "embeddings/text", episode_info) + if not emb_data: + return 0 - @staticmethod - def __write_segments_to_jsonl( - segments: List[Dict[str, Any]], - output_path: Path, - episode_info: Any, + docs = [] + for i, emb in enumerate(emb_data.get("text_embeddings", [])): + embedding = emb.get("embedding", []) + if not embedding: + continue + segment_range = emb.get("segment_range", []) + docs.append({ + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "embedding_id": i, + "segment_range": segment_range[0] if segment_range else 0, + "text": emb.get("text", ""), + "text_embedding": embedding, + "video_path": video_path, + }) + + return self.__write_ndjson(self.__output_path(context, episode_info, 2), docs) + + def __write_video_frames( + self, + context: ExecutionContext, + episode_info: EpisodeInfo, + episode_id: str, episode_metadata: 
Dict[str, Any], - video_bot_path: str, + video_path: str, + scene_data: Optional[Dict[str, Any]], + char_by_frame: Dict[str, List[Dict[str, Any]]], + objects_by_frame: Dict[str, List[Dict[str, Any]]], ) -> int: - count = 0 - with StepTempFile(output_path) as temp_path: - with open(temp_path, 'w', encoding='utf-8') as f: - for i, segment in enumerate(segments): - doc = { - 'episode_id': episode_info.episode_code(), - 'episode_metadata': episode_metadata, - 'segment_id': i, - 'text': segment.get('text', '').strip(), - 'start_time': segment.get('start', 0.0), - 'end_time': segment.get('end', 0.0), - 'speaker': segment.get('speaker', 'unknown'), - 'video_path': video_bot_path, - } - f.write(json.dumps(doc, ensure_ascii=False) + '\n') - count += 1 - return count + emb_data = self.__load_optional(context, "embeddings/vision", episode_info) + if not emb_data: + return 0 - @staticmethod - def __extract_episode_info(input_data: Artifact) -> Tuple[Any, str]: - if not hasattr(input_data, 'episode_info'): - raise ValueError('Input artifact must have episode_info') + docs = [] + for emb in emb_data.get("video_embeddings", []): + embedding = emb.get("embedding") + timestamp = emb.get("timestamp") + if embedding is None or timestamp is None: + continue - episode_info = getattr(input_data, 'episode_info') - episode_id = getattr(input_data, 'episode_id') - return episode_info, episode_id + frame_path = emb.get("frame_path", "") + frame_name = Path(frame_path).name if frame_path else "" - @staticmethod - def __build_video_bot_path(episode_info: Any, context: ExecutionContext) -> str: - filename = f'{context.series_name}_{episode_info.episode_code()}.mp4' - return ( - f'bot/{context.series_name.upper()}-WIDEO/' - f'{episode_info.season_code()}/{filename}' - ) + doc: Dict[str, Any] = { + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "frame_number": emb.get("frame_number"), + "timestamp": timestamp, + "frame_type": emb.get("type", "unknown"), + "video_path": 
video_path, + "video_embedding": embedding, + } - @staticmethod - def __construct_elastic_documents( + if frame_name and frame_name in char_by_frame: + doc["character_appearances"] = char_by_frame[frame_name] + if frame_name and frame_name in objects_by_frame: + doc["detected_objects"] = objects_by_frame[frame_name] + + perceptual_hash = emb.get("perceptual_hash") + if perceptual_hash: + doc["perceptual_hash"] = perceptual_hash + try: + doc["perceptual_hash_int"] = int(perceptual_hash, 16) + except (ValueError, TypeError): + pass + + if "scene_number" in emb: + doc["scene_number"] = emb["scene_number"] + + scene_info = self.__find_scene(timestamp, scene_data) + if scene_info: + doc["scene_info"] = scene_info + + docs.append(doc) + + return self.__write_ndjson(self.__output_path(context, episode_info, 3), docs) + + def __write_episode_name( + self, + context: ExecutionContext, + episode_info: EpisodeInfo, episode_id: str, - episode_info: Any, - output_path: Path, - document_count: int, - ) -> ElasticDocuments: - return ElasticDocuments( - episode_id=episode_id, - episode_info=episode_info, - path=output_path, - document_count=document_count, - ) + episode_metadata: Dict[str, Any], + video_path: str, + ) -> int: + emb_data = self.__load_optional(context, "embeddings/episode_names", episode_info) + if not emb_data or not emb_data.get("title_embedding"): + return 0 - @staticmethod - def __build_episode_metadata( - episode_info: Any, context: ExecutionContext, - ) -> Dict[str, Any]: - return { - 'season': episode_info.season, - 'episode_number': episode_info.relative_episode, - 'series_name': context.series_name, + doc: Dict[str, Any] = { + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "title": emb_data.get("title", ""), + "title_embedding": emb_data.get("title_embedding", []), + "video_path": video_path, } + return self.__write_ndjson(self.__output_path(context, episode_info, 4), [doc]) - @staticmethod - def __resolve_input_path( - episode_info: 
Any, + def __write_text_statistics( + self, context: ExecutionContext, - folder: str, - suffix: str, - ) -> Path: - filename = f'{context.series_name}_{episode_info.episode_code()}{suffix}' - return context.get_output_path(episode_info, folder, filename) + episode_info: EpisodeInfo, + episode_id: str, + episode_metadata: Dict[str, Any], + video_path: str, + ) -> int: + stats_data = self.__load_optional(context, "text_analysis", episode_info) + if not stats_data or not stats_data.get("basic_statistics"): + return 0 + + doc: Dict[str, Any] = { + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "video_path": video_path, + "language": stats_data.get("metadata", {}).get("language", "pl"), + "analyzed_at": stats_data.get("metadata", {}).get("analyzed_at"), + "basic_statistics": stats_data.get("basic_statistics", {}), + "advanced_statistics": stats_data.get("advanced_statistics", {}), + "word_frequency": stats_data.get("word_frequency", [])[:20], + "bigrams": stats_data.get("bigrams", [])[:10], + "trigrams": stats_data.get("trigrams", [])[:10], + } + return self.__write_ndjson(self.__output_path(context, episode_info, 5), [doc]) + + def __write_full_episode_embedding( + self, + context: ExecutionContext, + episode_info: EpisodeInfo, + episode_id: str, + episode_metadata: Dict[str, Any], + video_path: str, + ) -> int: + emb_data = self.__load_optional(context, "embeddings/full_episode", episode_info) + if not emb_data: + return 0 + + full_emb = emb_data.get("full_episode_embedding", {}) + if not full_emb or "embedding" not in full_emb: + return 0 + + doc: Dict[str, Any] = { + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "full_transcript": full_emb.get("text", ""), + "transcript_length": full_emb.get("transcript_length", 0), + "full_episode_embedding": full_emb.get("embedding", []), + "video_path": video_path, + } + return self.__write_ndjson(self.__output_path(context, episode_info, 6), [doc]) + + def 
__write_sound_event_embeddings( + self, + context: ExecutionContext, + episode_info: EpisodeInfo, + episode_id: str, + episode_metadata: Dict[str, Any], + video_path: str, + ) -> int: + emb_data = self.__load_optional(context, "embeddings/sound_events", episode_info) + if not emb_data: + return 0 + + docs = [] + for i, emb in enumerate(emb_data.get("sound_event_embeddings", [])): + embedding = emb.get("embedding", []) + if not embedding: + continue + segment_range = emb.get("segment_range", []) + if isinstance(segment_range, list) and len(segment_range) == 2: + segment_range = {"gte": segment_range[0], "lte": segment_range[1]} + docs.append({ + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "embedding_id": i, + "segment_range": segment_range, + "text": emb.get("text", ""), + "sound_types": emb.get("sound_types", []), + "start_time": emb.get("start_time", 0.0), + "end_time": emb.get("end_time", 0.0), + "sound_event_embedding": embedding, + "video_path": video_path, + }) + + return self.__write_ndjson(self.__output_path(context, episode_info, 7), docs) diff --git a/preprocessor/steps/text/segment_filter_step.py b/preprocessor/steps/text/segment_filter_step.py index fe3a11cf6..1f3c5c669 100644 --- a/preprocessor/steps/text/segment_filter_step.py +++ b/preprocessor/steps/text/segment_filter_step.py @@ -40,6 +40,11 @@ class SegmentFilterStep( def _output_format(self) -> str: pass + @property + @abstractmethod + def _output_subdir(self) -> str: + pass + @abstractmethod def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]: pass @@ -68,7 +73,7 @@ def get_output_descriptors(self) -> List[JsonFileOutput]: return [ JsonFileOutput( pattern="{season}/{episode}.json", - subdir="", + subdir=self._output_subdir, min_size_bytes=10, ), ] diff --git a/preprocessor/steps/text/sound_events_step.py b/preprocessor/steps/text/sound_events_step.py index c47d80fa0..712894554 100644 --- a/preprocessor/steps/text/sound_events_step.py +++ 
b/preprocessor/steps/text/sound_events_step.py @@ -13,6 +13,10 @@ class SoundEventsStep(SegmentFilterStep[SoundEventsConfig]): def _output_format(self) -> str: return 'sound_events' + @property + def _output_subdir(self) -> str: + return 'transcriptions/sound_events' + def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]: kind = self._classify(segment) if kind == 'sound_event': diff --git a/preprocessor/steps/text/text_cleaning_step.py b/preprocessor/steps/text/text_cleaning_step.py index 50c6f950c..fc14f5a08 100644 --- a/preprocessor/steps/text/text_cleaning_step.py +++ b/preprocessor/steps/text/text_cleaning_step.py @@ -13,6 +13,10 @@ class TextCleaningStep(SegmentFilterStep[TextCleaningConfig]): def _output_format(self) -> str: return 'clean' + @property + def _output_subdir(self) -> str: + return 'transcriptions/clean' + def _process_segment(self, segment: Dict[str, Any]) -> List[Dict[str, Any]]: kind = self._classify(segment) if kind == 'dialogue': diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index e3f7bde43..6340005e9 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -69,8 +69,8 @@ def _process( def get_output_descriptors(self) -> List[JsonFileOutput]: return [ JsonFileOutput( - pattern="{season}/{episode}/{episode}.json", - subdir="", + pattern="{season}/{episode_num}/{episode}.json", + subdir="transcriptions/raw", min_size_bytes=50, ), ] @@ -78,7 +78,15 @@ def get_output_descriptors(self) -> List[JsonFileOutput]: def _get_cache_path( self, input_data: TranscodedVideo, context: ExecutionContext, ) -> Path: - return self._get_standard_cache_path(input_data, context) + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode_num': input_data.episode_info.episode_num(), + 'episode': input_data.episode_info.episode_code(), + }, + ) def _load_from_cache( 
self, diff --git a/preprocessor/steps/validation/validator_step.py b/preprocessor/steps/validation/validator_step.py index 2163407cc..aecd7355d 100644 --- a/preprocessor/steps/validation/validator_step.py +++ b/preprocessor/steps/validation/validator_step.py @@ -31,18 +31,19 @@ def _process( input_data: ElasticDocuments, context: ExecutionContext, ) -> ValidationResult: - context.logger.info(f"Starting validation for season {context.season}") + season = input_data.episode_info.season_code() + context.logger.info(f"Starting validation for season {season}") - validator = self.__create_validator(context) + validator = self.__create_validator(season, context) self.__run_validation(validator) context.logger.info("Validation completed successfully") - return self.__construct_validation_result(context, validator) + return self.__construct_validation_result(season, validator) - def __create_validator(self, context: ExecutionContext) -> Validator: + def __create_validator(self, season: str, context: ExecutionContext) -> Validator: return Validator( - season=context.season, + season=season, series_name=context.series_name, anomaly_threshold=self.config.anomaly_threshold, base_output_dir=context.base_output_dir, @@ -57,10 +58,10 @@ def __run_validation(validator: Validator) -> None: @staticmethod def __construct_validation_result( - context: ExecutionContext, + season: str, validator: Validator, ) -> ValidationResult: return ValidationResult( - season=context.season, + season=season, validation_report_dir=validator.validation_reports_dir, ) From 9877625fd3518c689636784d32ea1de36392ecf4 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 27 Feb 2026 15:02:09 +0100 Subject: [PATCH 62/89] Fix cv2 lint, hashing path, and segment_range Small refactors across preprocessing steps: - face_clusterer: Silence pylint false-positive on cv2.imread by adding a disable comment. 
- validator: Remove use of rich.progress.track and iterate directly over episode dirs. - document_generation_step: Stop normalizing segment_range to a single value/dict; keep the original segment_range in documents. - embeddings_step: Change image-hash lookup to a predictable path under base_output_dir (hashes/S{season}/.json) instead of building the path via get_output_path. These changes simplify linting, avoid the progress dependency in iteration, preserve segment_range structure, and standardize hash file locations. --- preprocessor/services/characters/face_clusterer.py | 2 +- preprocessor/services/validation/validator.py | 3 +-- preprocessor/steps/search/document_generation_step.py | 4 +--- preprocessor/steps/vision/embeddings_step.py | 10 +++------- 4 files changed, 6 insertions(+), 13 deletions(-) diff --git a/preprocessor/services/characters/face_clusterer.py b/preprocessor/services/characters/face_clusterer.py index 5bcc217e9..610bd564d 100644 --- a/preprocessor/services/characters/face_clusterer.py +++ b/preprocessor/services/characters/face_clusterer.py @@ -25,7 +25,7 @@ def extract_face_embeddings( face_data: List[Dict[str, Any]] = [] for frame_path in frame_files: - img = cv2.imread(str(frame_path)) + img = cv2.imread(str(frame_path)) # pylint: disable=no-member if img is None: continue diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index 2abd6e003..ac82f1165 100644 --- a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -8,7 +8,6 @@ ) from rich.console import Console -from rich.progress import track from preprocessor.config.settings_instance import settings from preprocessor.services.episodes import EpisodeManager @@ -70,7 +69,7 @@ def __collect_all_episodes_stats(self, season_path: Path) -> Dict[str, EpisodeSt episode_dirs = sorted([d for d in season_path.iterdir() if d.is_dir() and d.name.startswith('E')]) results: Dict[str, EpisodeStats] = 
{} - for ep_dir in track(episode_dirs, description='Collecting episode stats'): + for ep_dir in episode_dirs: stats = self.__process_single_episode_dir(ep_dir) if stats: results[stats.episode_info.episode_code()] = stats diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 88de919c4..94afe79f0 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -329,7 +329,7 @@ def __write_text_embeddings( "episode_id": episode_id, "episode_metadata": episode_metadata, "embedding_id": i, - "segment_range": segment_range[0] if segment_range else 0, + "segment_range": segment_range, "text": emb.get("text", ""), "text_embedding": embedding, "video_path": video_path, @@ -487,8 +487,6 @@ def __write_sound_event_embeddings( if not embedding: continue segment_range = emb.get("segment_range", []) - if isinstance(segment_range, list) and len(segment_range) == 2: - segment_range = {"gte": segment_range[0], "lte": segment_range[1]} docs.append({ "episode_id": episode_id, "episode_metadata": episode_metadata, diff --git a/preprocessor/steps/vision/embeddings_step.py b/preprocessor/steps/vision/embeddings_step.py index 932c046dc..9c06aa120 100644 --- a/preprocessor/steps/vision/embeddings_step.py +++ b/preprocessor/steps/vision/embeddings_step.py @@ -164,13 +164,9 @@ def __extract_frame_requests( def __fetch_image_hashes( input_data: FrameCollection, context: ExecutionContext, ) -> Dict[int, str]: - filename_base = ( - f'{context.series_name}_{input_data.episode_info.episode_code()}' - ) - hash_filename: str = f'{filename_base}_image_hashes.json' - hash_path: Path = context.get_output_path( - input_data.episode_info, 'image_hashes', hash_filename, - ) + season = f'S{input_data.episode_info.season:02d}' + episode = input_data.episode_info.episode_code() + hash_path: Path = context.base_output_dir / 'hashes' / season / f'{episode}.json' if not 
hash_path.exists(): return {} From 5172ac0775a67459b300a46663c6556160f080b5 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 27 Feb 2026 21:21:18 +0100 Subject: [PATCH 63/89] Add deploy_to_nas script and package init Introduce a new CLI utility to copy processed series output to NAS storage. The deploy_to_nas script collects files from `archives` and `transcoded_videos` subdirs, supports dry-run, overwrite, and concurrent copying via ThreadPoolExecutor, and auto-resolves the local output_data base path unless overridden. Also add an empty package __init__.py to expose preprocessor.scripts as a module. --- .pre-commit-config.yaml | 2 +- preprocessor/scripts/__init__.py | 0 preprocessor/scripts/deploy_to_nas.py | 183 ++++++++++++++++++++++++++ 3 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 preprocessor/scripts/__init__.py create mode 100644 preprocessor/scripts/deploy_to_nas.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5fd7cc6cf..8947f5279 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -37,7 +37,7 @@ repos: - id: chmod args: ["755"] files: (.*scripts\/.*.py$|\.sh$) - exclude: ^preprocessor/entrypoint\.sh$ + exclude: (^preprocessor/entrypoint\.sh$|^preprocessor/scripts/) - id: remove-tabs args: [--whitespaces-count, '4'] - repo: https://github.com/PyCQA/isort diff --git a/preprocessor/scripts/__init__.py b/preprocessor/scripts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/preprocessor/scripts/deploy_to_nas.py b/preprocessor/scripts/deploy_to_nas.py new file mode 100644 index 000000000..97991d125 --- /dev/null +++ b/preprocessor/scripts/deploy_to_nas.py @@ -0,0 +1,183 @@ +import argparse +from concurrent.futures import ( + ThreadPoolExecutor, + as_completed, +) +from pathlib import Path +import shutil +import sys +from typing import ( + List, + Tuple, +) + +_DEPLOY_SUBDIRS = ("archives", "transcoded_videos") +_DEFAULT_WORKERS = 
8 + + +def _resolve_source_base(source_path: str) -> Path: + if source_path: + return Path(source_path) + script_dir = Path(__file__).resolve().parent + return script_dir.parent / "output_data" + + +def _collect_files(source_series_dir: Path, target_series_dir: Path) -> List[Tuple[Path, Path]]: + pairs = [] + for subdir in _DEPLOY_SUBDIRS: + source_subdir = source_series_dir / subdir + if not source_subdir.exists(): + print(f" [SKIP] Source not found: {source_subdir}") + continue + for source_file in source_subdir.rglob("*"): + if source_file.is_file(): + relative = source_file.relative_to(source_subdir) + target_file = target_series_dir / relative + pairs.append((source_file, target_file)) + return pairs + + +def _copy_file(src: Path, dst: Path, dry_run: bool) -> Tuple[Path, Path, bool, str]: + try: + if not dry_run: + dst.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(src, dst) + return src, dst, True, "" + except Exception as e: + return src, dst, False, str(e) + + +def _print_summary(total: int, copied: int, skipped: int, failed: int, dry_run: bool) -> None: + prefix = "[DRY RUN] " if dry_run else "" + print(f"\n{prefix}Summary:") + print(f" Total files : {total}") + print(f" Copied : {copied}") + print(f" Skipped : {skipped}") + print(f" Failed : {failed}") + + +def _filter_files_to_copy( + pairs: List[Tuple[Path, Path]], overwrite: bool, +) -> Tuple[List[Tuple[Path, Path]], int]: + to_copy = [] + skipped = 0 + for src, dst in pairs: + if not overwrite and dst.exists(): + skipped += 1 + else: + to_copy.append((src, dst)) + return to_copy, skipped + + +def _execute_copy_batch( + to_copy: List[Tuple[Path, Path]], + target_series_dir: Path, + dry_run: bool, + workers: int, +) -> Tuple[int, int]: + copied = 0 + failed = 0 + done = 0 + with ThreadPoolExecutor(max_workers=workers) as executor: + futures = {executor.submit(_copy_file, src, dst, dry_run): src for src, dst in to_copy} + for future in as_completed(futures): + src, result_dst, success, error = 
future.result() + done += 1 + if success: + copied += 1 + rel = result_dst.relative_to(target_series_dir) + print(f" [{'DRY' if dry_run else 'OK'}] {rel} ({done}/{len(to_copy)})") + else: + failed += 1 + print(f" [FAIL] {src.name} — {error}") + return copied, failed + + +def deploy( + source_base: Path, + target_base: Path, + series: str, + dry_run: bool, + workers: int, + overwrite: bool, +) -> int: + source_series_dir = source_base / series + target_series_dir = target_base / series + + if not source_series_dir.exists(): + print(f"ERROR: Source directory not found: {source_series_dir}") + return 1 + + print(f"Source : {source_series_dir}") + print(f"Target : {target_series_dir}") + print(f"Mode : {'DRY RUN' if dry_run else 'COPY'} | workers={workers} | overwrite={overwrite}") + print() + + pairs = _collect_files(source_series_dir, target_series_dir) + if not pairs: + print("No files found to copy.") + return 0 + + to_copy, skipped = _filter_files_to_copy(pairs, overwrite) + print(f"Files to copy : {len(to_copy)}") + print(f"Files skipped : {skipped} (already exist, use --overwrite to replace)") + print() + + if not to_copy: + _print_summary(len(pairs), 0, skipped, 0, dry_run) + return 0 + + copied, failed = _execute_copy_batch(to_copy, target_series_dir, dry_run, workers) + _print_summary(len(pairs), copied, skipped, failed, dry_run) + return 1 if failed else 0 + + +def main() -> None: + parser = argparse.ArgumentParser( + description="Copy processed series archives and videos to NAS storage.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog="Example:\n python -m preprocessor.scripts.deploy_to_nas" + " --target-path //TRUENAS/RanchBot --series kiepscy", + ) + parser.add_argument( + "--target-path", + required=True, + help="Base NAS path (e.g. //TRUENAS/RanchBot or /mnt/truenas/RanchBot)", + ) + parser.add_argument( + "--series", + required=True, + help="Series name (e.g. 
kiepscy, ranczo)", + ) + parser.add_argument( + "--source-path", + default="", + help="Override local output_data base path (default: auto-detected relative to this script)", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Overwrite files that already exist on target", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Show what would be copied without actually copying", + ) + parser.add_argument( + "--workers", + type=int, + default=_DEFAULT_WORKERS, + help=f"Number of parallel copy workers (default: {_DEFAULT_WORKERS})", + ) + + args = parser.parse_args() + + source_base = _resolve_source_base(args.source_path) + target_base = Path(args.target_path) + + sys.exit(deploy(source_base, target_base, args.series, args.dry_run, args.workers, args.overwrite)) + + +if __name__ == "__main__": + main() From fb9752dfe3ace5e177313c24539581883caccbc8 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sat, 28 Feb 2026 08:33:30 +0100 Subject: [PATCH 64/89] Update deploy_to_nas.py --- preprocessor/scripts/deploy_to_nas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/scripts/deploy_to_nas.py b/preprocessor/scripts/deploy_to_nas.py index 97991d125..df06747a4 100644 --- a/preprocessor/scripts/deploy_to_nas.py +++ b/preprocessor/scripts/deploy_to_nas.py @@ -12,7 +12,7 @@ ) _DEPLOY_SUBDIRS = ("archives", "transcoded_videos") -_DEFAULT_WORKERS = 8 +_DEFAULT_WORKERS = 1 def _resolve_source_base(source_path: str) -> Path: From e06b9186766f991a29e6d8adca6f2ea0ac2bd4ec Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Mar 2026 12:11:10 +0100 Subject: [PATCH 65/89] vLLM: switch model, adjust sampling, 256K context Update VLLM client defaults and runtime settings: change default model from Qwen/Qwen2.5-Coder-7B-Instruct to Qwen/Qwen3.5-9B; tune sampling parameters (add min_p, presence_penalty, set repetition_penalty to 1.0); 
pass chat_template_kwargs to disable "thinking" in outputs; and increase max_model_len from 131072 to 262144 (update console message to 256K context). These changes enable longer-context runs and adjusted generation behavior. --- preprocessor/services/ai/clients.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index ba5d172d1..bfa8e314e 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -25,7 +25,7 @@ def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> s class VLLMClient(BaseLLMClient): - __DEFAULT_MODEL_NAME = 'Qwen/Qwen2.5-Coder-7B-Instruct' + __DEFAULT_MODEL_NAME = 'Qwen/Qwen3.5-9B' def __init__(self, model_name: Optional[str] = None) -> None: self.__model_name = model_name or self.__DEFAULT_MODEL_NAME @@ -40,19 +40,25 @@ def generate(self, messages: List[Dict[str, str]], max_tokens: int = 32768) -> s temperature=0.7, top_p=0.8, top_k=20, + min_p=0.0, + presence_penalty=1.5, + repetition_penalty=1.0, max_tokens=max_tokens, - repetition_penalty=1.05, ) - outputs = self.__model.chat(messages=[messages], sampling_params=sampling_params) + outputs = self.__model.chat( + messages=[messages], + sampling_params=sampling_params, + chat_template_kwargs={'enable_thinking': False}, + ) return outputs[0].outputs[0].text.strip() def __load_model(self) -> None: - console.print(f'[cyan]Loading LLM: {self.__model_name} (vLLM, 128K context)[/cyan]') + console.print(f'[cyan]Loading LLM: {self.__model_name} (vLLM, 256K context)[/cyan]') try: self.__model = LLM( model=self.__model_name, trust_remote_code=True, - max_model_len=131072, + max_model_len=262144, gpu_memory_utilization=0.95, tensor_parallel_size=1, dtype='bfloat16', From 759c5e3e22448795eba36e56209c552651eddfbd Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Mar 2026 13:31:49 +0100 Subject: 
[PATCH 66/89] Add transcription import step and config Introduce a TranscriptionImportStep and wiring to allow importing pre-existing transcriptions instead of running transcription. - Add TranscriptionImportStep implementation that supports batch processing, finds and converts 11labs_segmented JSON (and generic JSON), writes normalized transcription artifacts and provides output descriptor. Supports season_remap and resolves per-episode files. - Make pipeline_factory choose between TranscriptionImportStep and runtime TranscriptionStep based on series_config.processing.transcription_import. - Extend SeriesConfig to parse an optional transcription_import section (TranscriptionImportProcessingConfig) and refactor processing config construction. - Add new example series config kapitan_bomba.json with transcription_import settings. - Update step_configs: TranscriptionImportConfig now uses Path and includes season_remap; add apply_boost_on_resize_only flag to TranscodeConfig. - Video transcoder: add resolution equality check and honor apply_boost_on_resize_only to avoid bitrate boosting when output resolution equals source. These changes enable importing externally-produced transcriptions and prevent unnecessary bitrate boosts when no resize occurs. 
--- preprocessor/app/pipeline_factory.py | 55 +++-- preprocessor/config/series_config.py | 75 ++++--- preprocessor/config/step_configs.py | 5 +- .../series_configs/kapitan_bomba.json | 35 +++ preprocessor/services/text/import_step.py | 207 ++++++++---------- preprocessor/steps/video/transcoding_step.py | 9 + 6 files changed, 227 insertions(+), 159 deletions(-) create mode 100644 preprocessor/series_configs/kapitan_bomba.json diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index f083e0121..331ff376a 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -1,3 +1,4 @@ +from pathlib import Path from typing import Dict from preprocessor.app.pipeline import PipelineDefinition @@ -32,6 +33,7 @@ TextEmbeddingConfig, TranscodeConfig, TranscriptionConfig, + TranscriptionImportConfig, ValidationConfig, VideoEmbeddingConfig, ) @@ -42,6 +44,7 @@ create_frames_output, ) from preprocessor.services.media.resolution import Resolution +from preprocessor.services.text.import_step import TranscriptionImportStep from preprocessor.steps.analysis.resolution_analysis_step import ResolutionAnalysisStep from preprocessor.steps.audio.separation_step import SoundSeparationStep from preprocessor.steps.packaging.archives_step import ArchiveGenerationStep @@ -229,25 +232,41 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # ========================================================= # PROCESSING PHASE: TEXT & AUDIO # ========================================================= - transcription_data = StepBuilder( - phase=PROCESSING, - step_class=TranscriptionStep, - description=f"Audio transcription using {series_config.processing.transcription.mode}", - produces=[ - JsonFileOutput( - pattern="{season}/{episode_num}/{episode}.json", - subdir="transcriptions/raw", - min_size_bytes=50, - ), - ], - needs=[transcoded_videos], - config=TranscriptionConfig( - 
mode=series_config.processing.transcription.mode, - model=series_config.processing.transcription.model, - language=series_config.processing.transcription.language, - device=series_config.processing.transcription.device, + _transcription_output = [ + JsonFileOutput( + pattern="{season}/{episode_num}/{episode}.json", + subdir="transcriptions/raw", + min_size_bytes=50, ), - ) + ] + _import_cfg = series_config.processing.transcription_import + if _import_cfg: + transcription_data = StepBuilder( + phase=PROCESSING, + step_class=TranscriptionImportStep, + description=f"Import pre-existing {_import_cfg.format_type} transcriptions", + produces=_transcription_output, + needs=[], + config=TranscriptionImportConfig( + source_dir=Path(_import_cfg.source_dir), + format_type=_import_cfg.format_type, + season_remap=_import_cfg.season_remap, + ), + ) + else: + transcription_data = StepBuilder( + phase=PROCESSING, + step_class=TranscriptionStep, + description=f"Audio transcription using {series_config.processing.transcription.mode}", + produces=_transcription_output, + needs=[transcoded_videos], + config=TranscriptionConfig( + mode=series_config.processing.transcription.mode, + model=series_config.processing.transcription.model, + language=series_config.processing.transcription.language, + device=series_config.processing.transcription.device, + ), + ) text_cleaning = StepBuilder( phase=PROCESSING, diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 6969f826d..3b3eb8fc7 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -1,10 +1,14 @@ -from dataclasses import dataclass +from dataclasses import ( + dataclass, + field, +) import json from pathlib import Path from typing import ( Any, Dict, List, + Optional, ) @@ -53,6 +57,13 @@ class TranscriptionProcessingConfig: model: str +@dataclass +class TranscriptionImportProcessingConfig: + format_type: str + season_remap: Dict[str, int] + source_dir: str + 
+ @dataclass class TranscodeProcessingConfig: bitrate_boost_ratio: float @@ -81,6 +92,7 @@ class ProcessingConfig: scene_detection: SceneDetectionProcessingConfig transcode: TranscodeProcessingConfig transcription: TranscriptionProcessingConfig + transcription_import: Optional[TranscriptionImportProcessingConfig] = field(default=None) @dataclass @@ -143,30 +155,7 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': images_per_character=data['scraping']['character_references']['images_per_character'], ), ), - processing=ProcessingConfig( - transcription=TranscriptionProcessingConfig( - mode=data['processing']['transcription']['mode'], - model=data['processing']['transcription']['model'], - language=data['processing']['transcription']['language'], - device=data['processing']['transcription']['device'], - ), - transcode=TranscodeProcessingConfig( - max_bitrate_file_size_mb=data['processing']['transcode']['max_bitrate_file_size_mb'], - max_bitrate_duration_seconds=data['processing']['transcode']['max_bitrate_duration_seconds'], - min_bitrate_mbps=data['processing']['transcode']['min_bitrate_mbps'], - bitrate_boost_ratio=data['processing']['transcode']['bitrate_boost_ratio'], - force_deinterlace=data['processing']['transcode']['force_deinterlace'], - keyframe_interval_seconds=data['processing']['transcode']['keyframe_interval_seconds'], - resolution=data['processing']['transcode']['resolution'], - ), - scene_detection=SceneDetectionProcessingConfig( - threshold=data['processing']['scene_detection']['threshold'], - min_scene_len=data['processing']['scene_detection']['min_scene_len'], - ), - frame_export=FrameExportProcessingConfig( - frames_per_scene=data['processing']['frame_export']['frames_per_scene'], - ), - ), + processing=SeriesConfig.__build_processing_config(data), indexing=IndexingConfig( elasticsearch=ElasticsearchIndexingConfig( index_name=data['indexing']['elasticsearch']['index_name'], @@ -177,6 +166,42 @@ def __load_from_dict(data: Dict[str, 
Any]) -> 'SeriesConfig': ), ) + @staticmethod + def __build_processing_config(data: Dict[str, Any]) -> 'ProcessingConfig': + import_cfg = data.get('processing', {}).get('transcription_import') + transcription_import = None + if import_cfg and import_cfg.get('source_dir'): + transcription_import = TranscriptionImportProcessingConfig( + source_dir=import_cfg['source_dir'], + format_type=import_cfg.get('format_type', '11labs_segmented'), + season_remap=import_cfg.get('season_remap', {}), + ) + return ProcessingConfig( + transcription=TranscriptionProcessingConfig( + mode=data['processing']['transcription']['mode'], + model=data['processing']['transcription']['model'], + language=data['processing']['transcription']['language'], + device=data['processing']['transcription']['device'], + ), + transcode=TranscodeProcessingConfig( + max_bitrate_file_size_mb=data['processing']['transcode']['max_bitrate_file_size_mb'], + max_bitrate_duration_seconds=data['processing']['transcode']['max_bitrate_duration_seconds'], + min_bitrate_mbps=data['processing']['transcode']['min_bitrate_mbps'], + bitrate_boost_ratio=data['processing']['transcode']['bitrate_boost_ratio'], + force_deinterlace=data['processing']['transcode']['force_deinterlace'], + keyframe_interval_seconds=data['processing']['transcode']['keyframe_interval_seconds'], + resolution=data['processing']['transcode']['resolution'], + ), + scene_detection=SceneDetectionProcessingConfig( + threshold=data['processing']['scene_detection']['threshold'], + min_scene_len=data['processing']['scene_detection']['min_scene_len'], + ), + frame_export=FrameExportProcessingConfig( + frames_per_scene=data['processing']['frame_export']['frames_per_scene'], + ), + transcription_import=transcription_import, + ) + @staticmethod def __load_from_file(config_path: Path) -> 'SeriesConfig': if not config_path.exists(): diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 2d1b0df0f..3b08ce28c 100644 --- 
a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -1,5 +1,6 @@ from pathlib import Path from typing import ( + Dict, List, Optional, ) @@ -17,6 +18,7 @@ class TranscodeConfig(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) + apply_boost_on_resize_only: bool = True bitrate_boost_ratio: float = Field(default=1.1, ge=1.0, le=2.0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) @@ -160,7 +162,8 @@ class ImageHashConfig(BaseModel): class TranscriptionImportConfig(BaseModel): format_type: str = '11labs_segmented' - source_dir: str + season_remap: Dict[str, int] = {} + source_dir: Path class ElasticsearchConfig(BaseModel): diff --git a/preprocessor/series_configs/kapitan_bomba.json b/preprocessor/series_configs/kapitan_bomba.json new file mode 100644 index 000000000..dc9ca3abe --- /dev/null +++ b/preprocessor/series_configs/kapitan_bomba.json @@ -0,0 +1,35 @@ +{ + "display_name": "Kapitan Bomba", + "indexing": { + "elasticsearch": { + "host": "http://192.168.1.210:19200", + "index_name": "kapitan_bomba" + } + }, + "processing": { + "transcription_import": { + "format_type": "11labs_segmented", + "season_remap": { + "10": 0 + }, + "source_dir": "/mnt/c/Users/dam2452/Downloads/bomba trans/kapitan_bomba-wideo/segmented_json" + } + }, + "scraping": { + "characters": { + "urls": [ + "https://dubbingpedia.pl/wiki/Kapitan_Bomba" + ] + }, + "episodes": { + "urls": [ + "https://pl.wikipedia.org/wiki/Kapitan_Bomba_(serial_animowany)" + ] + } + }, + "series_name": "kapitan_bomba", + "skip_steps": [ + "episode_scraper", + "character_scraper" + ] +} diff --git a/preprocessor/services/text/import_step.py b/preprocessor/services/text/import_step.py index c9b36687d..2d894aa0e 100644 --- a/preprocessor/services/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -2,93 +2,112 @@ import json from pathlib import Path -import re from typing import ( Any, Dict, List, Optional, - Tuple, ) from 
preprocessor.config.step_configs import TranscriptionImportConfig -from preprocessor.core.artifacts import TranscriptionData +from preprocessor.core.artifacts import ( + SourceVideo, + TranscriptionData, +) from preprocessor.core.base_step import PipelineStep from preprocessor.core.context import ExecutionContext -from preprocessor.services.episodes.episode_manager import ( - EpisodeInfo, - EpisodeManager, -) - - -class TranscriptionImportStep(PipelineStep[None, List[TranscriptionData], TranscriptionImportConfig]): - def __init__(self, config: TranscriptionImportConfig) -> None: - super().__init__(config) - self.__episode_manager: Optional[EpisodeManager] = None - - def _process(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: - raise NotImplementedError("TranscriptionImportStep uses execute() instead of _process()") - - def execute(self, input_data: None, context: ExecutionContext) -> List[TranscriptionData]: - self.__ensure_episode_manager(context) - - json_files = self.__find_transcription_files() - if not json_files: - context.logger.warning(f'No transcription files found in {self.config.source_dir}') - return [] +from preprocessor.core.output_descriptors import JsonFileOutput +from preprocessor.services.episodes.episode_manager import EpisodeManager +from preprocessor.services.episodes.types import EpisodeInfo - context.logger.info(f'Found {len(json_files)} transcription files to import') - results: List[TranscriptionData] = [] - for json_file in json_files: - try: - artifact = self.__import_single_file(json_file, context) - if artifact: - results.append(artifact) - except Exception as e: - context.logger.error(f'Failed to import {json_file.name}: {e}') +class TranscriptionImportStep(PipelineStep[SourceVideo, TranscriptionData, TranscriptionImportConfig]): + @property + def supports_batch_processing(self) -> bool: + return True - return results - - def __ensure_episode_manager(self, context: ExecutionContext) -> None: - if 
self.__episode_manager is None: - self.__episode_manager = EpisodeManager(None, context.series_name, context.logger) - - def __find_transcription_files(self) -> List[Path]: - pattern = '*_segmented.json' if self.config.format_type == '11labs_segmented' else '*.json' - files = sorted(self.config.source_dir.rglob(pattern)) - return [f for f in files if not f.name.startswith('.')] - - def __import_single_file(self, json_file: Path, context: ExecutionContext) -> Optional[TranscriptionData]: - episode_info = self.__resolve_episode_info(json_file) - if not episode_info: - context.logger.warning(f'Could not determine episode for {json_file}') - return None - - episode_id = self.__episode_manager.get_episode_id_for_state(episode_info) - output_path = self.__get_output_path(episode_info, context) + def execute_batch( + self, input_data: List[SourceVideo], context: ExecutionContext, + ) -> List[TranscriptionData]: + return self._execute_with_threadpool( + input_data, context, 4, self.execute, + ) - if self.__should_skip_import(output_path, episode_id, context): - return self.__construct_cached_artifact(episode_id, episode_info, output_path) + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> TranscriptionData: + episode_info = input_data.episode_info - context.logger.info(f'Importing {episode_id} from {json_file.name}') - context.mark_step_started(self.name, episode_id) + json_file = self.__find_transcription_file(episode_info) + if not json_file: + raise FileNotFoundError( + f'No transcription file found for {input_data.episode_id} in {self.config.source_dir}', + ) + output_path = self._get_cache_path(input_data, context) source_data = self.__load_json(json_file) converted_data = self.__convert_data(source_data, json_file) converted_data['episode_info'] = EpisodeManager.get_metadata(episode_info) - self.__save_converted_data(output_path, converted_data) - context.mark_step_completed(self.name, episode_id) - return 
self.__construct_new_artifact(episode_id, episode_info, output_path, converted_data) + context.logger.info(f'Imported {input_data.episode_id} from {json_file.name}') - def __resolve_episode_info(self, json_file: Path) -> Optional[EpisodeInfo]: - info = self.__episode_manager.parse_filename(json_file) - if not info: - season, episode = self.__extract_season_episode_fallback(json_file) - info = self.__episode_manager.get_episode_by_season_and_relative(season, episode) - return info + trans_meta = converted_data.get('transcription', {}) + return TranscriptionData( + episode_id=input_data.episode_id, + episode_info=episode_info, + path=output_path, + language=trans_meta.get('language_code', 'pl'), + model=trans_meta.get('format', '11labs'), + format='json', + ) + + def get_output_descriptors(self) -> List[JsonFileOutput]: + return [ + JsonFileOutput( + pattern='{season}/{episode_num}/{episode}.json', + subdir='transcriptions/raw', + min_size_bytes=50, + ), + ] + + def _get_cache_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode_num': input_data.episode_info.episode_num(), + 'episode': input_data.episode_info.episode_code(), + }, + ) + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> TranscriptionData: + return TranscriptionData( + episode_id=input_data.episode_id, + episode_info=input_data.episode_info, + path=cache_path, + language='pl', + model='11labs', + format='json', + ) + + def __find_transcription_file(self, episode_info: EpisodeInfo) -> Optional[Path]: + file_season = self.__resolve_file_season(episode_info.season) + ep = episode_info.relative_episode + pattern = ( + f'*S{file_season:02d}E{ep:02d}*_segmented.json' + if self.config.format_type == '11labs_segmented' + else f'*S{file_season:02d}E{ep:02d}*.json' + ) + files = 
sorted(self.config.source_dir.rglob(pattern)) + return files[0] if files else None + + def __resolve_file_season(self, target_season: int) -> int: + for file_season_str, mapped_season in self.config.season_remap.items(): + if mapped_season == target_season: + return int(file_season_str) + return target_season def __convert_data(self, data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: if self.config.format_type == '11labs_segmented': @@ -97,23 +116,13 @@ def __convert_data(self, data: Dict[str, Any], source_file: Path) -> Dict[str, A return self.__convert_11labs_full(data, source_file) raise ValueError(f'Unknown format type: {self.config.format_type}') - def __should_skip_import(self, output_path: Path, episode_id: str, context: ExecutionContext) -> bool: - if output_path.exists() and not context.force_rerun: - context.logger.info(f'Skipping {episode_id} (output exists)') - if not context.is_step_completed(self.name, episode_id): - context.mark_step_completed(self.name, episode_id) - return True - return False - - def __get_output_path(self, episode_info: EpisodeInfo, context: ExecutionContext) -> Path: - filename = self.__episode_manager.path_manager.build_filename(episode_info, extension='json') - return context.get_output_path(episode_info, 'transcriptions/raw', filename) - @staticmethod def __convert_11labs_full(data: Dict[str, Any], source_file: Path) -> Dict[str, Any]: segments: List[Dict[str, Any]] = [] words = data.get('words', []) - current_seg: Dict[str, Any] = {'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown'} + current_seg: Dict[str, Any] = { + 'words': [], 'start': None, 'end': None, 'text': '', 'speaker': 'unknown', + } for word in words: if current_seg['start'] is None: @@ -160,24 +169,14 @@ def __convert_11labs_segmented(data: Dict[str, Any], source_file: Path) -> Dict[ 'words': segment.get('words', []), }) return { - 'transcription': {'format': '11labs_segmented', 'source_file': source_file.name, 'segments': 
segments}, + 'transcription': { + 'format': '11labs_segmented', + 'source_file': source_file.name, + 'language_code': 'pol', + }, 'segments': segments, } - @staticmethod - def __extract_season_episode_fallback(file_path: Path) -> Tuple[int, int]: - match = re.search('S(\\d+)E(\\d+)', file_path.name, re.IGNORECASE) - if match: - return int(match.group(1)), int(match.group(2)) - - parent_match = re.search('S(\\d+)', file_path.parent.name, re.IGNORECASE) - if parent_match: - season = int(parent_match.group(1)) - episode_match = re.search('E(\\d+)', file_path.name, re.IGNORECASE) - if episode_match: - return season, int(episode_match.group(1)) - return 1, 1 - @staticmethod def __load_json(file_path: Path) -> Dict[str, Any]: with open(file_path, 'r', encoding='utf-8') as f: @@ -188,25 +187,3 @@ def __save_converted_data(output_path: Path, data: Dict[str, Any]) -> None: output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, 'w', encoding='utf-8') as f: json.dump(data, f, indent=2, ensure_ascii=False) - - @staticmethod - def __construct_cached_artifact(episode_id: str, info: EpisodeInfo, path: Path) -> TranscriptionData: - return TranscriptionData( - episode_id=episode_id, episode_info=info, path=path, - language='pl', model='11labs', format='json', - ) - - @staticmethod - def __construct_new_artifact( - episode_id: str, info: EpisodeInfo, path: Path, - data: Dict[str, Any], - ) -> TranscriptionData: - trans_meta = data.get('transcription', {}) - return TranscriptionData( - episode_id=episode_id, - episode_info=info, - path=path, - language=trans_meta.get('language_code', 'pl'), - model=trans_meta.get('format', '11labs'), - format='json', - ) diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 259f1ff7a..a10ee72cb 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -117,6 +117,12 @@ def __is_upscaling(self, probe_data: Dict[str, 
Any]) -> bool: target_px = self.config.resolution.width * self.config.resolution.height return src_px < target_px + def __is_same_resolution(self, probe_data: Dict[str, Any]) -> bool: + w, h = FFmpegWrapper.get_resolution(probe_data) + sar_num, sar_denom = FFmpegWrapper.get_sample_aspect_ratio(probe_data) + eff_w = int(w * sar_num / sar_denom) + return eff_w == self.config.resolution.width and h == self.config.resolution.height + def __compute_all_bitrate_settings( self, probe_data: Dict[str, Any], context: ExecutionContext, ) -> Dict[str, float]: @@ -135,6 +141,9 @@ def __compute_all_bitrate_settings( elif normalized_bitrate > max_bitrate: final_bitrate = max_bitrate adjustment = f"capped to maximum ({max_bitrate} Mbps)" + elif self.config.apply_boost_on_resize_only and self.__is_same_resolution(probe_data): + final_bitrate = normalized_bitrate + adjustment = "preserved (same resolution, no resize boost)" else: final_bitrate = normalized_bitrate * self.config.bitrate_boost_ratio boost_percent = (self.config.bitrate_boost_ratio - 1.0) * 100 From 1ac92d45bfa7963dc05bbb400069ec151b5083d9 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 9 Mar 2026 14:01:33 +0100 Subject: [PATCH 67/89] Preserve bitrate for same-res, improve logs Update series config path and adjust transcoding bitrate logic and logging. - preprocessor/series_configs/kapitan_bomba.json: switch source_dir from a local Windows Downloads path to /transcriptions/kapitan_bomba. - preprocessor/steps/video/transcoding_step.py: when config.apply_boost_on_resize_only is set and the video resolution is unchanged, preserve the normalized bitrate (no boost), even if it falls below min_bitrate. Reordered branch logic to ensure same-resolution preservation takes precedence. Adjusted the preserved message text. 
- Made __log_transcode_details an instance method so it can call the resolution check, and changed the scale label to emit SAME/UP/DOWN in logs for clearer reporting. These changes ensure bitrate boosts are applied only when resizing (if configured) and improve logging clarity. --- .../series_configs/kapitan_bomba.json | 2 +- preprocessor/steps/video/transcoding_step.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/preprocessor/series_configs/kapitan_bomba.json b/preprocessor/series_configs/kapitan_bomba.json index dc9ca3abe..512136c15 100644 --- a/preprocessor/series_configs/kapitan_bomba.json +++ b/preprocessor/series_configs/kapitan_bomba.json @@ -12,7 +12,7 @@ "season_remap": { "10": 0 }, - "source_dir": "/mnt/c/Users/dam2452/Downloads/bomba trans/kapitan_bomba-wideo/segmented_json" + "source_dir": "/transcriptions/kapitan_bomba" } }, "scraping": { diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index a10ee72cb..798b80ce1 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -135,15 +135,15 @@ def __compute_all_bitrate_settings( normalized_bitrate = self.__get_normalized_bitrate(src_bitrate, probe_data, context) - if normalized_bitrate < min_bitrate: + if self.config.apply_boost_on_resize_only and self.__is_same_resolution(probe_data): + final_bitrate = normalized_bitrate + adjustment = "preserved (same resolution, no boost)" + elif normalized_bitrate < min_bitrate: final_bitrate = min_bitrate adjustment = f"boosted to minimum ({min_bitrate} Mbps)" elif normalized_bitrate > max_bitrate: final_bitrate = max_bitrate adjustment = f"capped to maximum ({max_bitrate} Mbps)" - elif self.config.apply_boost_on_resize_only and self.__is_same_resolution(probe_data): - final_bitrate = normalized_bitrate - adjustment = "preserved (same resolution, no resize boost)" else: final_bitrate = normalized_bitrate * 
self.config.bitrate_boost_ratio boost_percent = (self.config.bitrate_boost_ratio - 1.0) * 100 @@ -267,15 +267,20 @@ def __get_codec_efficiency_multiplier(src: str, tgt: str) -> float: eff = VideoTranscoderStep.__CODEC_EFFICIENCY return eff.get(src, 1.0) / eff.get(tgt, 1.0) - @staticmethod def __log_transcode_details( + self, ctx: ExecutionContext, input_data: SourceVideo, params: TranscodeParams, probe: Dict[str, Any], ) -> None: w, h = FFmpegWrapper.get_resolution(probe) - up_label = "UP" if params.is_upscaling else "DOWN" + if self.__is_same_resolution(probe): + scale_label = "SAME" + elif params.is_upscaling: + scale_label = "UP" + else: + scale_label = "DOWN" ctx.logger.info( - f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{up_label}]', + f'{input_data.episode_id}: {w}x{h} -> {params.resolution} [{scale_label}]', ) From 92f86464e251110c16e49dde5e53244243686d3b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 10 Mar 2026 15:18:33 +0100 Subject: [PATCH 68/89] Update vLLM install, client and configs Dockerfile: switch to pre-release/nightly vllm wheels, install transformers from main, remove flashinfer and onnxruntime, pin onnxruntime-gpu==1.21.0, and replace CUDA/CuDNN/NCCL packages with cu12 variants (add tolerant uninstall for cu11). core/state_reconstruction.py: rename step_instance parameters to step_def for output checks and use step_def.get_output_descriptors(). series_configs: reduce images_per_character from 5 to 3 in defaults and stop skipping character_scraper for kapitan_bomba (enable character scraping). services/ai/clients.py: adjust vLLM sampling and runtime defaults (temperature 0.7->1.0, top_p 0.8->0.95, gpu_memory_utilization 0.95->0.90) and add language_model_only=True. These changes align dependencies with nightly vLLM, update runtime tuning, and tweak preprocessing defaults. 
--- preprocessor/Dockerfile | 8 +++++++- preprocessor/core/state_reconstruction.py | 12 ++++++------ preprocessor/series_configs/defaults.json | 2 +- preprocessor/series_configs/kapitan_bomba.json | 3 +-- preprocessor/services/ai/clients.py | 7 ++++--- 5 files changed, 19 insertions(+), 13 deletions(-) diff --git a/preprocessor/Dockerfile b/preprocessor/Dockerfile index 07731cad3..358109848 100644 --- a/preprocessor/Dockerfile +++ b/preprocessor/Dockerfile @@ -38,12 +38,18 @@ RUN --mount=type=cache,target=/root/.cache/pip \ pip install --no-cache-dir --upgrade pip setuptools wheel \ && pip install --no-cache-dir \ -r /app/requirements.txt \ - vllm==0.13.0 \ --extra-index-url https://pypi.nvidia.com \ + && pip install --no-cache-dir --pre vllm \ + --extra-index-url https://wheels.vllm.ai/nightly \ + && pip install --no-cache-dir \ + git+https://github.com/huggingface/transformers.git@main \ + && pip uninstall -y flashinfer \ && pip uninstall -y onnxruntime \ && pip install --no-cache-dir \ onnxruntime-gpu==1.21.0 \ --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ \ + && pip uninstall -y nvidia-cudnn-cu11 || true \ + && pip install --no-cache-dir --force-reinstall --no-deps nvidia-cudnn-cu12 \ && pip uninstall -y nvidia-nccl-cu11 || true \ && pip install --no-cache-dir --force-reinstall --no-deps nvidia-nccl-cu12 diff --git a/preprocessor/core/state_reconstruction.py b/preprocessor/core/state_reconstruction.py index f87b9569d..0d2e0cb47 100644 --- a/preprocessor/core/state_reconstruction.py +++ b/preprocessor/core/state_reconstruction.py @@ -32,7 +32,7 @@ def scan_filesystem( step_name = step_instance.name if step_instance.is_global: - if StateReconstructor.__check_global_step_outputs(step_instance, base_output_dir): + if StateReconstructor.__check_global_step_outputs(step_def, base_output_dir): checkpoint = StepCheckpoint( step=step_name, episode='all', @@ -55,7 +55,7 @@ def scan_filesystem( } if 
StateReconstructor.__check_episode_step_outputs( - step_instance, base_output_dir, context_vars, + step_def, base_output_dir, context_vars, ): checkpoint = StepCheckpoint( step=step_name, @@ -74,8 +74,8 @@ def scan_filesystem( return completed_steps @staticmethod - def __check_global_step_outputs(step_instance, base_output_dir: Path) -> bool: - descriptors = step_instance.get_output_descriptors() + def __check_global_step_outputs(step_def, base_output_dir: Path) -> bool: + descriptors = step_def.get_output_descriptors() if not descriptors: return True @@ -86,11 +86,11 @@ def __check_global_step_outputs(step_instance, base_output_dir: Path) -> bool: @staticmethod def __check_episode_step_outputs( - step_instance, + step_def, base_output_dir: Path, context_vars: Dict[str, str], ) -> bool: - descriptors = step_instance.get_output_descriptors() + descriptors = step_def.get_output_descriptors() if not descriptors: return True diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index af45df889..805c052b9 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -35,7 +35,7 @@ }, "scraping": { "character_references": { - "images_per_character": 5, + "images_per_character": 3, "search_engine": "duckduckgo" }, "characters": { diff --git a/preprocessor/series_configs/kapitan_bomba.json b/preprocessor/series_configs/kapitan_bomba.json index 512136c15..f7113594f 100644 --- a/preprocessor/series_configs/kapitan_bomba.json +++ b/preprocessor/series_configs/kapitan_bomba.json @@ -29,7 +29,6 @@ }, "series_name": "kapitan_bomba", "skip_steps": [ - "episode_scraper", - "character_scraper" + "episode_scraper" ] } diff --git a/preprocessor/services/ai/clients.py b/preprocessor/services/ai/clients.py index bfa8e314e..042a23ffe 100644 --- a/preprocessor/services/ai/clients.py +++ b/preprocessor/services/ai/clients.py @@ -37,8 +37,8 @@ def generate(self, messages: List[Dict[str, str]], 
max_tokens: int = 32768) -> s raise RuntimeError('Model not initialized') sampling_params = SamplingParams( - temperature=0.7, - top_p=0.8, + temperature=1.0, + top_p=0.95, top_k=20, min_p=0.0, presence_penalty=1.5, @@ -59,13 +59,14 @@ def __load_model(self) -> None: model=self.__model_name, trust_remote_code=True, max_model_len=262144, - gpu_memory_utilization=0.95, + gpu_memory_utilization=0.90, tensor_parallel_size=1, dtype='bfloat16', enable_chunked_prefill=True, max_num_batched_tokens=16384, enforce_eager=True, disable_log_stats=True, + language_model_only=True, ) console.print('[green]LLM loaded successfully (vLLM)[/green]') except Exception as e: From 98ac7da804630b0da1a37ee8748909d42515c666 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 11 Mar 2026 22:06:11 +0100 Subject: [PATCH 69/89] Skip character images; improve models & clustering Allow disabling character reference downloads and harden model loading and face clustering. - Config: set face clustering and validation default parallel episodes to 1; allow images_per_character >= 0. - Series config: set kapitan_bomba scraping.character_references.images_per_character to 0 to skip downloads. - Scraping step: short-circuit reference download when images_per_character == 0 and log the skip. - Face clustering: handle tiny sample counts (return zeros for <2 samples) and cap min_samples/min_cluster_size to number of samples to avoid errors. - Emotion utils: add support for EMOTION_MODEL_HOME to load/persist ONNX models from a mounted volume, add retry logic for HTTP 429 rate limits when downloading models, persist packaged model into volume if present, and patch model path resolution accordingly. These changes make scraping optional for character images, improve robustness in low-sample clustering scenarios, and make emotion model loading resilient and friendly to containerized environments with mounted model volumes. 
--- preprocessor/config/step_configs.py | 6 +- .../series_configs/kapitan_bomba.json | 3 + .../services/characters/face_clusterer.py | 11 ++- preprocessor/services/video/emotion_utils.py | 74 +++++++++++++++++-- .../scraping/reference_processor_step.py | 3 + 5 files changed, 86 insertions(+), 11 deletions(-) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 3b08ce28c..ed9a161cc 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -191,7 +191,7 @@ class EmotionDetectionConfig(BaseModel): class FaceClusteringConfig(BaseModel): - max_parallel_episodes: int = Field(default=4, ge=1, le=8) + max_parallel_episodes: int = Field(default=1, ge=1, le=8) class ObjectDetectionConfig(BaseModel): @@ -209,7 +209,7 @@ class ArchiveConfig(BaseModel): class ValidationConfig(BaseModel): anomaly_threshold: float = 20.0 episodes_info_json: Optional[Path] = None - max_parallel_episodes: int = Field(default=8, ge=1, le=16) + max_parallel_episodes: int = Field(default=1, ge=1, le=16) class EpisodeScraperConfig(BaseModel): @@ -228,7 +228,7 @@ class CharacterScraperConfig(BaseModel): class CharacterReferenceConfig(BaseModel): - images_per_character: int = Field(default=5, ge=1, le=20) + images_per_character: int = Field(default=5, ge=0, le=20) max_parallel_episodes: int = Field(default=4, ge=1, le=8) search_engine: str = "duckduckgo" diff --git a/preprocessor/series_configs/kapitan_bomba.json b/preprocessor/series_configs/kapitan_bomba.json index f7113594f..5b81694cb 100644 --- a/preprocessor/series_configs/kapitan_bomba.json +++ b/preprocessor/series_configs/kapitan_bomba.json @@ -16,6 +16,9 @@ } }, "scraping": { + "character_references": { + "images_per_character": 0 + }, "characters": { "urls": [ "https://dubbingpedia.pl/wiki/Kapitan_Bomba" diff --git a/preprocessor/services/characters/face_clusterer.py b/preprocessor/services/characters/face_clusterer.py index 610bd564d..566f1a030 100644 --- 
a/preprocessor/services/characters/face_clusterer.py +++ b/preprocessor/services/characters/face_clusterer.py @@ -53,12 +53,19 @@ def cluster_embeddings( min_cluster_size: int, min_samples: int, ) -> np.ndarray: + n_samples = len(face_data) + if n_samples < 2: + return np.zeros(n_samples, dtype=np.intp) + vectors = np.array([fd['vector'] for fd in face_data]) vectors_gpu = cp.asarray(vectors) + effective_min_samples = min(min_samples, n_samples) + effective_min_cluster_size = min(min_cluster_size, n_samples) + clusterer = cuHDBSCAN( - min_cluster_size=min_cluster_size, - min_samples=min_samples, + min_cluster_size=effective_min_cluster_size, + min_samples=effective_min_samples, metric='euclidean', cluster_selection_method='eom', ) diff --git a/preprocessor/services/video/emotion_utils.py b/preprocessor/services/video/emotion_utils.py index df7c79167..03a970e0e 100644 --- a/preprocessor/services/video/emotion_utils.py +++ b/preprocessor/services/video/emotion_utils.py @@ -1,14 +1,17 @@ +import os +from pathlib import Path +import shutil +import time from typing import ( Dict, List, Optional, Tuple, ) +import urllib.error -from hsemotion_onnx.facial_emotions import ( - HSEmotionRecognizer, - get_model_path, -) +import hsemotion_onnx.facial_emotions as _hsemotion_facial_emotions +from hsemotion_onnx.facial_emotions import HSEmotionRecognizer import numpy as np import onnxruntime as ort @@ -17,6 +20,20 @@ EMOTION_LABELS: List[str] = ['anger', 'contempt', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprise'] +_ORIGINAL_GET_MODEL_PATH = _hsemotion_facial_emotions.get_model_path + + +def _volume_aware_get_model_path(model_name: str) -> str: + model_home = os.environ.get('EMOTION_MODEL_HOME', '') + if model_home: + volume_path = Path(model_home) / f'{model_name}.onnx' + if volume_path.exists(): + return str(volume_path) + return _ORIGINAL_GET_MODEL_PATH(model_name) + + +_hsemotion_facial_emotions.get_model_path = _volume_aware_get_model_path + class 
EmotionDetector: @staticmethod @@ -37,7 +54,8 @@ def init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecogni logger.info(f'Loading HSEmotion model: {model_name}...') try: - fer = HSEmotionRecognizer(model_name=model_name) + fer = EmotionDetector.__load_with_retry(model_name, logger) + EmotionDetector.__persist_model_to_volume(model_name, logger) EmotionDetector.__patch_gpu_session(fer, model_name, logger) if logger: logger.info(f'HSEmotion model loaded: {model_name}') @@ -45,6 +63,50 @@ def init_model(logger: Optional[ErrorHandlingLogger] = None) -> HSEmotionRecogni except Exception as e: raise RuntimeError(f'Failed to load HSEmotion model {model_name}: {e}') from e + @staticmethod + def __load_with_retry( + model_name: str, + logger: Optional[ErrorHandlingLogger], + max_retries: int = 5, + initial_delay: float = 15.0, + ) -> HSEmotionRecognizer: + delay = initial_delay + for attempt in range(max_retries): + try: + return HSEmotionRecognizer(model_name=model_name) + except urllib.error.HTTPError as e: + if e.code != 429 or attempt >= max_retries - 1: + raise + if logger: + logger.warning( + f'Rate limited downloading HSEmotion model ' + f'(attempt {attempt + 1}/{max_retries}), retrying in {delay:.0f}s...', + ) + time.sleep(delay) + delay *= 2 + raise RuntimeError(f'Failed to download HSEmotion model after {max_retries} attempts') + + @staticmethod + def __get_volume_model_path(model_name: str) -> Optional[Path]: + model_home = os.environ.get('EMOTION_MODEL_HOME', '') + if not model_home: + return None + return Path(model_home) / f'{model_name}.onnx' + + @staticmethod + def __persist_model_to_volume( + model_name: str, logger: Optional[ErrorHandlingLogger], + ) -> None: + volume_path = EmotionDetector.__get_volume_model_path(model_name) + if not volume_path or volume_path.exists(): + return + package_path = Path(_hsemotion_facial_emotions.__file__).parent / 'models' / f'{model_name}.onnx' + if package_path.exists(): + 
volume_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(package_path, volume_path) + if logger: + logger.info(f'Persisted HSEmotion model to volume: {volume_path}') + @staticmethod def __patch_gpu_session( fer: HSEmotionRecognizer, @@ -60,7 +122,7 @@ def __patch_gpu_session( ) return - model_path = get_model_path(model_name) + model_path = _hsemotion_facial_emotions.get_model_path(model_name) fer.ort_session = ort.InferenceSession( model_path, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'], diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index 018ed62c2..a29546e13 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -31,6 +31,9 @@ def _load_from_cache( def _process( self, input_data: SourceVideo, context: ExecutionContext, ) -> SourceVideo: + if self.config.images_per_character == 0: + context.logger.info("images_per_character=0, skipping character reference download") + return input_data characters_path, output_dir = self.__resolve_paths(context) self.__validate_characters_file(characters_path) self.__download_character_references(characters_path, output_dir, context) From 7f320173b8ee8099e9d77c32a6b91f2b5b15ff4b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Thu, 12 Mar 2026 15:58:23 +0100 Subject: [PATCH 70/89] Use per-episode file layout & refactor validators Introduce per-episode file paths and path helper methods, update validators to validate single episode JSON/JSONL files and use FileValidator via PathService. Key changes: - Added PathService.get_episode_dir_by_code and get_episode_file_path; expose Validator.validation_reports_dir. - Switched several output subdirectory names in OutputSubdirs (characters, clusters, frames, hashes, object detection, scene paths, etc.). 
- ElasticValidator: search season folder for {ep_code}_*.jsonl, validate individual files and text statistics via PathService/get_base_output_dir; improved error/warning messages. - FaceCluster, ImageHash, Object, Scene, Transcription and Frame validators: now operate on per-episode files (using PathService.get_episode_file_path or get_episode_dir_by_code) and use FileValidator for JSON integrity; removed old directory-based helpers. - Document generation and archiving: set min_size_bytes to 0 for many outputs; document generator now handles missing/empty source data by writing empty NDJSON safely instead of early-returning; archives reporting now lists missing document types when skipping. - SoundEventEmbeddingStep: save empty results when no segments. - TranscodeConfig: removed apply_boost_on_resize_only and updated VideoTranscoderStep logic to always apply bitrate rules (simplified boost behavior). These changes migrate validation and IO to a simpler per-episode file layout, improve validation messages, and make document/archiving behavior more explicit for missing or empty inputs. 
--- preprocessor/app/pipeline_factory.py | 3 +- preprocessor/config/config.py | 16 +-- preprocessor/config/step_configs.py | 1 - preprocessor/services/io/path_service.py | 8 ++ preprocessor/services/validation/validator.py | 4 + .../validators/elastic_validator.py | 93 +++++++-------- .../validators/face_cluster_validator.py | 20 +--- .../validation/validators/frame_validator.py | 2 +- .../validators/image_hash_validator.py | 18 ++- .../validation/validators/object_validator.py | 23 ++-- .../validation/validators/scene_validator.py | 5 +- .../validators/transcription_validator.py | 24 ++-- preprocessor/steps/packaging/archives_step.py | 3 +- .../steps/search/document_generation_step.py | 107 ++++++++---------- .../steps/text/sound_event_embedding_step.py | 1 + preprocessor/steps/video/transcoding_step.py | 5 +- 16 files changed, 163 insertions(+), 170 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 331ff376a..7be781437 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -503,7 +503,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t FileOutput( pattern="{season}/{episode}_text_segments.jsonl", subdir="elastic_documents/text_segments", - min_size_bytes=10, + min_size_bytes=0, ), ], needs=[ @@ -524,6 +524,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t produces=[ FileOutput( pattern="{season}/{episode}.zip", + subdir="archives", min_size_bytes=1024 * 100, ), ], diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index ecaad51c7..5ea79e638 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -37,17 +37,17 @@ class TranscriptionSubdirs: @dataclass(frozen=True) class OutputSubdirs: # pylint: disable=too-many-instance-attributes # Configuration dataclass - all subdirs needed archives: str = 'archives' - character_detections: str = 
'character_detections' - character_visualizations: str = 'character_detections/visualizations' + character_detections: str = 'detections/characters' + character_visualizations: str = 'detections/characters/visualizations' elastic_document_subdirs: ElasticDocumentSubdirs = field(default_factory=ElasticDocumentSubdirs) elastic_documents: str = 'elastic_documents' embeddings: str = 'embeddings' - face_clusters: str = 'face_clusters' - frames: str = 'exported_frames' - image_hashes: str = 'image_hashes' - object_detections: str = 'object_detections' - object_visualizations: str = 'object_detections/visualizations' - scenes: str = 'scene_timestamps' + face_clusters: str = 'clusters/faces' + frames: str = 'frames' + image_hashes: str = 'hashes' + object_detections: str = 'detections/objects' + object_visualizations: str = 'detections/objects/visualizations' + scenes: str = 'scene_detections' transcription_subdirs: TranscriptionSubdirs = field(default_factory=TranscriptionSubdirs) transcriptions: str = 'transcriptions' validation_reports: str = 'validation_reports' diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index ed9a161cc..320a7de8a 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -18,7 +18,6 @@ class TranscodeConfig(BaseModel): model_config = ConfigDict(arbitrary_types_allowed=True) - apply_boost_on_resize_only: bool = True bitrate_boost_ratio: float = Field(default=1.1, ge=1.0, le=2.0) force_deinterlace: bool = False keyframe_interval_seconds: float = Field(gt=0) diff --git a/preprocessor/services/io/path_service.py b/preprocessor/services/io/path_service.py index 51edf5b72..bbb93847a 100644 --- a/preprocessor/services/io/path_service.py +++ b/preprocessor/services/io/path_service.py @@ -26,6 +26,14 @@ def get_episode_dir(self, episode_info: EpisodeInfo, subdir: str) -> Path: base_output_dir = get_base_output_dir(self.__series_name) return base_output_dir / subdir / 
episode_info.season_code() / episode_info.episode_num() + def get_episode_dir_by_code(self, episode_info: EpisodeInfo, subdir: str) -> Path: + base_output_dir = get_base_output_dir(self.__series_name) + return base_output_dir / subdir / episode_info.season_code() / episode_info.episode_code() + + def get_episode_file_path(self, episode_info: EpisodeInfo, subdir: str, extension: str = 'json') -> Path: + base_output_dir = get_base_output_dir(self.__series_name) + return base_output_dir / subdir / episode_info.season_code() / f'{episode_info.episode_code()}.{extension}' + @staticmethod def get_input_base() -> Path: if Environment.is_docker(): diff --git a/preprocessor/services/validation/validator.py b/preprocessor/services/validation/validator.py index ac82f1165..307bb6ff8 100644 --- a/preprocessor/services/validation/validator.py +++ b/preprocessor/services/validation/validator.py @@ -36,6 +36,10 @@ def __init__( self.__episode_manager = EpisodeManager(episodes_info_json, series_name) self.__validation_reports_dir = base_output_dir / settings.output_subdirs.validation_reports + @property + def validation_reports_dir(self) -> Path: + return self.__validation_reports_dir + def validate(self) -> int: transcriptions_path = self.__base_output_dir / 'transcriptions' / 'raw' / self.__season if not transcriptions_path.exists(): diff --git a/preprocessor/services/validation/validators/elastic_validator.py b/preprocessor/services/validation/validators/elastic_validator.py index de2196e4c..8a2c8ecdd 100644 --- a/preprocessor/services/validation/validators/elastic_validator.py +++ b/preprocessor/services/validation/validators/elastic_validator.py @@ -7,7 +7,7 @@ Dict, ) -from preprocessor.config.constants import OUTPUT_FILE_NAMES +from preprocessor.config.output_paths import get_base_output_dir from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.episode_stats import 
EpisodeStats @@ -24,25 +24,25 @@ def validate(self, stats: EpisodeStats) -> None: self.__validate_elastic_documents(stats) self.__validate_text_statistics(stats) - def __validate_character_detections(self, stats: EpisodeStats) -> None: - char_detections_dir = self.__get_dir(stats, settings.output_subdirs.character_detections) - detections_file = char_detections_dir / OUTPUT_FILE_NAMES['detections'] - - self._validate_json_if_exists( - stats, - detections_file, - error_msg_prefix=f"Invalid {OUTPUT_FILE_NAMES['detections']}", + @staticmethod + def __validate_character_detections(stats: EpisodeStats) -> None: + detections_file = PathService(stats.series_name).get_episode_file_path( + stats.episode_info, settings.output_subdirs.character_detections, ) + if detections_file.exists(): + result = FileValidator.validate_json_file(detections_file) + if not result.is_valid: + stats.errors.append(f'Invalid character detections JSON: {result.error_message}') - def __validate_embeddings(self, stats: EpisodeStats) -> None: - embeddings_dir = self.__get_dir(stats, settings.output_subdirs.embeddings) - if embeddings_dir.exists(): - embeddings_file = embeddings_dir / OUTPUT_FILE_NAMES['embeddings_text'] - self._validate_json_if_exists( - stats, - embeddings_file, - error_msg_prefix=f"Invalid {OUTPUT_FILE_NAMES['embeddings_text']}", - ) + @staticmethod + def __validate_embeddings(stats: EpisodeStats) -> None: + embeddings_file = PathService(stats.series_name).get_episode_file_path( + stats.episode_info, f'{settings.output_subdirs.embeddings}/episode_names', + ) + if embeddings_file.exists(): + result = FileValidator.validate_json_file(embeddings_file) + if not result.is_valid: + stats.errors.append(f'Invalid episode embeddings JSON: {result.error_message}') def __validate_elastic_documents(self, stats: EpisodeStats) -> None: subdirs_to_check = [ @@ -54,36 +54,43 @@ def __validate_elastic_documents(self, stats: EpisodeStats) -> None: found_any = False elastic_base = 
settings.output_subdirs.elastic_documents + ep_code = stats.episode_info.episode_code() + season_code = stats.episode_info.season_code() for subdir in subdirs_to_check: - docs_dir = self.__get_dir(stats, f'{elastic_base}/{subdir}') - if docs_dir.exists(): - found_any = True - self.__process_jsonl_files(stats, docs_dir, subdir) + season_dir = ( + get_base_output_dir(stats.series_name) / elastic_base / subdir / season_code + ) + if not season_dir.exists(): + continue + ep_files = list(season_dir.glob(f'{ep_code}_*.jsonl')) + if not ep_files: + continue + found_any = True + for jsonl_file in ep_files: + self.__validate_jsonl_file(stats, jsonl_file, subdir) if not found_any: self._add_warning(stats, f'Missing {settings.output_subdirs.elastic_documents} directory') - def __process_jsonl_files(self, stats: EpisodeStats, docs_dir: Path, subdir: str) -> None: - for jsonl_file in docs_dir.glob('*.jsonl'): - result = FileValidator.validate_jsonl_file(jsonl_file) + def __validate_jsonl_file(self, stats: EpisodeStats, jsonl_file: Path, subdir: str) -> None: + result = FileValidator.validate_jsonl_file(jsonl_file) + if not result.is_valid: + self._add_error(stats, f'Invalid JSONL {jsonl_file.name}: {result.error_message}') + else: + self.__validate_embedding_dimensions(stats, jsonl_file, subdir) + + @staticmethod + def __validate_text_statistics(stats: EpisodeStats) -> None: + text_stats_file = PathService(stats.series_name).get_episode_file_path( + stats.episode_info, 'text_analysis', + ) + if text_stats_file.exists(): + result = FileValidator.validate_json_file(text_stats_file) if not result.is_valid: - self._add_error(stats, f'Invalid JSONL {jsonl_file.name}: {result.error_message}') - else: - self.__validate_embedding_dimensions(stats, jsonl_file, subdir) - - def __validate_text_statistics(self, stats: EpisodeStats) -> None: - trans_dir = self.__get_dir(stats, settings.output_subdirs.transcriptions) - if trans_dir.exists(): - clean_subdir = 
settings.output_subdirs.transcription_subdirs.clean - text_stats_file = trans_dir / clean_subdir / f'{stats.series_name}_{stats.episode_info.episode_code()}_text_stats.json' - - if text_stats_file.exists(): - result = FileValidator.validate_json_file(text_stats_file) - if not result.is_valid: - self._add_error(stats, f'Invalid text_stats JSON: {result.error_message}') - else: - self._add_warning(stats, f'Missing text statistics file: {text_stats_file.name}') + stats.errors.append(f'Invalid text_stats JSON: {result.error_message}') + else: + stats.warnings.append(f'Missing text statistics file: {text_stats_file.name}') def __validate_embedding_dimensions(self, stats: EpisodeStats, jsonl_file: Path, subdir: str) -> None: embedding_fields = { @@ -118,7 +125,3 @@ def __check_doc_dimension( actual = len(doc[field]) if actual != expected: self._add_error(stats, f'{fname} line {lnum}: {field} has {actual} dim, expected {expected}') - - @staticmethod - def __get_dir(stats: EpisodeStats, subdir: str) -> Path: - return PathService(stats.series_name).get_episode_dir(stats.episode_info, subdir) diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py index ceaa5b949..bbed519fa 100644 --- a/preprocessor/services/validation/validators/face_cluster_validator.py +++ b/preprocessor/services/validation/validators/face_cluster_validator.py @@ -1,10 +1,8 @@ from __future__ import annotations -from pathlib import Path from typing import ( Any, Dict, - Optional, ) from preprocessor.config.settings_instance import settings @@ -15,30 +13,20 @@ class FaceClusterValidator(BaseValidator): def validate(self, stats: EpisodeStats) -> None: - clusters_dir = PathService(stats.series_name).get_episode_dir( + clusters_file = PathService(stats.series_name).get_episode_file_path( stats.episode_info, settings.output_subdirs.face_clusters, ) - if not clusters_dir.exists(): + if not clusters_file.exists(): 
return - metadata_file = self.__get_metadata_file(clusters_dir) - if not metadata_file: - self._add_warning(stats, 'Missing face clustering metadata file') + if not self._validate_json_with_error(stats, clusters_file, 'Missing face clusters file', 'Invalid face clusters JSON'): return - if not self._validate_json_with_error(stats, metadata_file, 'Missing metadata', 'Invalid face metadata'): - return - - data = self._load_json_safely(metadata_file) + data = self._load_json_safely(clusters_file) if data: self.__parse_cluster_stats(stats, data) - @staticmethod - def __get_metadata_file(clusters_dir: Path) -> Optional[Path]: - files = list(clusters_dir.glob('*_face_clusters.json')) - return files[0] if files else None - def __parse_cluster_stats(self, stats: EpisodeStats, data: Dict[str, Any]) -> None: clusters = data.get('clusters', {}) diff --git a/preprocessor/services/validation/validators/frame_validator.py b/preprocessor/services/validation/validators/frame_validator.py index 14ed09791..eced19853 100644 --- a/preprocessor/services/validation/validators/frame_validator.py +++ b/preprocessor/services/validation/validators/frame_validator.py @@ -16,7 +16,7 @@ class FrameValidator(BaseValidator): def validate(self, stats: EpisodeStats) -> None: - frames_dir = PathService(stats.series_name).get_episode_dir( + frames_dir = PathService(stats.series_name).get_episode_dir_by_code( stats.episode_info, settings.output_subdirs.frames, ) diff --git a/preprocessor/services/validation/validators/image_hash_validator.py b/preprocessor/services/validation/validators/image_hash_validator.py index 1d8534ad8..f93c661ab 100644 --- a/preprocessor/services/validation/validators/image_hash_validator.py +++ b/preprocessor/services/validation/validators/image_hash_validator.py @@ -1,16 +1,22 @@ from __future__ import annotations from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from 
preprocessor.services.validation.episode_stats import EpisodeStats +from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator -from preprocessor.services.validation.validators.validation_helpers import JsonDirectoryValidationHelper class ImageHashValidator(BaseValidator): def validate(self, stats: EpisodeStats) -> None: - JsonDirectoryValidationHelper.validate_json_directory( - stats, - settings.output_subdirs.image_hashes, - 'image_hashes_count', - 'image_hashes', + hash_file = PathService(stats.series_name).get_episode_file_path( + stats.episode_info, settings.output_subdirs.image_hashes, ) + + if not hash_file.exists(): + self._add_warning(stats, f'Missing image hashes file: {hash_file.name}') + return + + result = FileValidator.validate_json_file(hash_file) + if not result.is_valid: + self._add_error(stats, f'Invalid image hashes JSON: {result.error_message}') diff --git a/preprocessor/services/validation/validators/object_validator.py b/preprocessor/services/validation/validators/object_validator.py index 7e7c88e3b..73fb285b8 100644 --- a/preprocessor/services/validation/validators/object_validator.py +++ b/preprocessor/services/validation/validators/object_validator.py @@ -1,12 +1,11 @@ from __future__ import annotations from preprocessor.config.settings_instance import settings +from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.episode_stats import EpisodeStats +from preprocessor.services.validation.file_validators import FileValidator from preprocessor.services.validation.validators.base_validator import BaseValidator -from preprocessor.services.validation.validators.validation_helpers import ( - JsonDirectoryValidationHelper, - VisualizationValidationHelper, -) +from preprocessor.services.validation.validators.validation_helpers import VisualizationValidationHelper class ObjectValidator(BaseValidator): @@ 
-16,14 +15,18 @@ def validate(self, stats: EpisodeStats) -> None: @staticmethod def __validate_object_detections(stats: EpisodeStats) -> None: - JsonDirectoryValidationHelper.validate_json_directory( - stats, - settings.output_subdirs.object_detections, - 'object_detections_count', - 'object_detections', - exclude_pattern='visualizations', + detections_file = PathService(stats.series_name).get_episode_file_path( + stats.episode_info, settings.output_subdirs.object_detections, ) + if not detections_file.exists(): + stats.warnings.append(f'Missing object detections file: {detections_file.name}') + return + + result = FileValidator.validate_json_file(detections_file) + if not result.is_valid: + stats.errors.append(f'Invalid object detections JSON: {result.error_message}') + @staticmethod def __validate_object_visualizations(stats: EpisodeStats) -> None: VisualizationValidationHelper.validate_visualizations( diff --git a/preprocessor/services/validation/validators/scene_validator.py b/preprocessor/services/validation/validators/scene_validator.py index f8abe13c9..71a8ebd18 100644 --- a/preprocessor/services/validation/validators/scene_validator.py +++ b/preprocessor/services/validation/validators/scene_validator.py @@ -7,7 +7,6 @@ List, ) -from preprocessor.config.constants import OUTPUT_FILE_PATTERNS from preprocessor.config.settings_instance import settings from preprocessor.services.io.path_service import PathService from preprocessor.services.validation.episode_stats import EpisodeStats @@ -31,11 +30,9 @@ def validate(self, stats: EpisodeStats) -> None: @staticmethod def __resolve_scenes_file(stats: EpisodeStats) -> Path: - scenes_dir = PathService(stats.series_name).get_episode_dir( + return PathService(stats.series_name).get_episode_file_path( stats.episode_info, settings.output_subdirs.scenes, ) - suffix = OUTPUT_FILE_PATTERNS['scenes_suffix'] - return scenes_dir / f"{stats.series_name}_{stats.episode_info.episode_code()}{suffix}" def 
__validate_json_integrity(self, stats: EpisodeStats, file_path: Path) -> bool: result = FileValidator.validate_json_file(file_path) diff --git a/preprocessor/services/validation/validators/transcription_validator.py b/preprocessor/services/validation/validators/transcription_validator.py index 8c2825e34..b734a5539 100644 --- a/preprocessor/services/validation/validators/transcription_validator.py +++ b/preprocessor/services/validation/validators/transcription_validator.py @@ -90,18 +90,18 @@ def __validate_sound_events(self, stats: EpisodeStats, file_path: Path) -> None: @staticmethod def __resolve_file_map(stats: EpisodeStats) -> Dict[str, Path]: path_svc = PathService(stats.series_name) - trans_dir = path_svc.get_episode_dir(stats.episode_info, settings.output_subdirs.transcriptions) - base = f'{stats.series_name}_{stats.episode_info.episode_code()}' - - raw_base = trans_dir / settings.output_subdirs.transcription_subdirs.raw - clean_base = trans_dir / settings.output_subdirs.transcription_subdirs.clean - sound_base = trans_dir / settings.output_subdirs.transcription_subdirs.sound_events + raw_ep_dir = path_svc.get_episode_dir( + stats.episode_info, + f'{settings.output_subdirs.transcriptions}/{settings.output_subdirs.transcription_subdirs.raw}', + ) + season_raw_dir = raw_ep_dir.parent + ep_code = stats.episode_info.episode_code() return { - 'main': raw_base / f'{base}.json', - 'segmented': raw_base / f'{base}_segmented.json', - 'simple': raw_base / f'{base}_simple.json', - 'clean': clean_base / f'{base}_clean_transcription.json', - 'clean_txt': clean_base / f'{base}_clean_transcription.txt', - 'sound_events': sound_base / f'{base}_sound_events.json', + 'main': raw_ep_dir / f'{ep_code}.json', + 'segmented': raw_ep_dir / f'{ep_code}_segmented.json', + 'simple': raw_ep_dir / f'{ep_code}_simple.json', + 'clean': season_raw_dir / settings.output_subdirs.transcription_subdirs.clean / f'{ep_code}_clean_transcription.json', + 'clean_txt': season_raw_dir / 
settings.output_subdirs.transcription_subdirs.clean / f'{ep_code}_clean_transcription.txt', + 'sound_events': season_raw_dir / settings.output_subdirs.transcription_subdirs.sound_events / f'{ep_code}_sound_events.json', } diff --git a/preprocessor/steps/packaging/archives_step.py b/preprocessor/steps/packaging/archives_step.py index be66018df..45c0cddaf 100644 --- a/preprocessor/steps/packaging/archives_step.py +++ b/preprocessor/steps/packaging/archives_step.py @@ -47,9 +47,10 @@ def _process( return self.__build_artifact(input_data, output_path) if found < expected and not self.config.allow_partial: + missing = [folder for folder, _ in ELASTIC_DOC_TYPES if folder not in episode_files] context.logger.warning( f"Skipping {input_data.episode_id}: incomplete documents " - f"({found}/{expected}). Set allow_partial=True to archive anyway.", + f"({found}/{expected}), missing: {missing}. Set allow_partial=True to archive anyway.", ) return self.__build_artifact(input_data, output_path) diff --git a/preprocessor/steps/search/document_generation_step.py b/preprocessor/steps/search/document_generation_step.py index 94afe79f0..489b7e5c0 100644 --- a/preprocessor/steps/search/document_generation_step.py +++ b/preprocessor/steps/search/document_generation_step.py @@ -76,7 +76,7 @@ def get_output_descriptors(self) -> List[FileOutput]: FileOutput( pattern=f"{{season}}/{{episode}}_{suffix}.jsonl", subdir=f"elastic_documents/{folder}", - min_size_bytes=10, + min_size_bytes=0, ) for folder, suffix in ELASTIC_DOC_TYPES ] @@ -223,8 +223,6 @@ def __index_objects_by_frame( @staticmethod def __write_ndjson(output_path: Path, docs: List[Dict[str, Any]]) -> int: - if not docs: - return 0 with StepTempFile(output_path) as tmp: with open(tmp, "w", encoding="utf-8") as f: for doc in docs: @@ -241,11 +239,9 @@ def __write_text_segments( scene_data: Optional[Dict[str, Any]], ) -> int: clean_data = self.__load_optional(context, "transcriptions/clean", episode_info) - if not clean_data: - return 
0 docs = [] - for i, seg in enumerate(clean_data.get("segments", [])): + for i, seg in enumerate((clean_data or {}).get("segments", [])): text = seg.get("text", "").strip() if not text: continue @@ -280,11 +276,9 @@ def __write_sound_events( scene_data: Optional[Dict[str, Any]], ) -> int: sound_data = self.__load_optional(context, "transcriptions/sound_events", episode_info) - if not sound_data: - return 0 docs = [] - for i, seg in enumerate(sound_data.get("segments", [])): + for i, seg in enumerate((sound_data or {}).get("segments", [])): if "text" not in seg: continue words = seg.get("words", []) @@ -316,11 +310,9 @@ def __write_text_embeddings( video_path: str, ) -> int: emb_data = self.__load_optional(context, "embeddings/text", episode_info) - if not emb_data: - return 0 docs = [] - for i, emb in enumerate(emb_data.get("text_embeddings", [])): + for i, emb in enumerate((emb_data or {}).get("text_embeddings", [])): embedding = emb.get("embedding", []) if not embedding: continue @@ -349,11 +341,9 @@ def __write_video_frames( objects_by_frame: Dict[str, List[Dict[str, Any]]], ) -> int: emb_data = self.__load_optional(context, "embeddings/vision", episode_info) - if not emb_data: - return 0 docs = [] - for emb in emb_data.get("video_embeddings", []): + for emb in (emb_data or {}).get("video_embeddings", []): embedding = emb.get("embedding") timestamp = emb.get("timestamp") if embedding is None or timestamp is None: @@ -405,17 +395,17 @@ def __write_episode_name( video_path: str, ) -> int: emb_data = self.__load_optional(context, "embeddings/episode_names", episode_info) - if not emb_data or not emb_data.get("title_embedding"): - return 0 - - doc: Dict[str, Any] = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "title": emb_data.get("title", ""), - "title_embedding": emb_data.get("title_embedding", []), - "video_path": video_path, - } - return self.__write_ndjson(self.__output_path(context, episode_info, 4), [doc]) + + docs = [] + if 
emb_data and emb_data.get("title_embedding"): + docs = [{ + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "title": emb_data.get("title", ""), + "title_embedding": emb_data.get("title_embedding", []), + "video_path": video_path, + }] + return self.__write_ndjson(self.__output_path(context, episode_info, 4), docs) def __write_text_statistics( self, @@ -426,22 +416,22 @@ def __write_text_statistics( video_path: str, ) -> int: stats_data = self.__load_optional(context, "text_analysis", episode_info) - if not stats_data or not stats_data.get("basic_statistics"): - return 0 - - doc: Dict[str, Any] = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "video_path": video_path, - "language": stats_data.get("metadata", {}).get("language", "pl"), - "analyzed_at": stats_data.get("metadata", {}).get("analyzed_at"), - "basic_statistics": stats_data.get("basic_statistics", {}), - "advanced_statistics": stats_data.get("advanced_statistics", {}), - "word_frequency": stats_data.get("word_frequency", [])[:20], - "bigrams": stats_data.get("bigrams", [])[:10], - "trigrams": stats_data.get("trigrams", [])[:10], - } - return self.__write_ndjson(self.__output_path(context, episode_info, 5), [doc]) + + docs = [] + if stats_data and stats_data.get("basic_statistics"): + docs = [{ + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "video_path": video_path, + "language": stats_data.get("metadata", {}).get("language", "pl"), + "analyzed_at": stats_data.get("metadata", {}).get("analyzed_at"), + "basic_statistics": stats_data.get("basic_statistics", {}), + "advanced_statistics": stats_data.get("advanced_statistics", {}), + "word_frequency": stats_data.get("word_frequency", [])[:20], + "bigrams": stats_data.get("bigrams", [])[:10], + "trigrams": stats_data.get("trigrams", [])[:10], + }] + return self.__write_ndjson(self.__output_path(context, episode_info, 5), docs) def __write_full_episode_embedding( self, @@ -452,22 +442,19 @@ def 
__write_full_episode_embedding( video_path: str, ) -> int: emb_data = self.__load_optional(context, "embeddings/full_episode", episode_info) - if not emb_data: - return 0 - - full_emb = emb_data.get("full_episode_embedding", {}) - if not full_emb or "embedding" not in full_emb: - return 0 - - doc: Dict[str, Any] = { - "episode_id": episode_id, - "episode_metadata": episode_metadata, - "full_transcript": full_emb.get("text", ""), - "transcript_length": full_emb.get("transcript_length", 0), - "full_episode_embedding": full_emb.get("embedding", []), - "video_path": video_path, - } - return self.__write_ndjson(self.__output_path(context, episode_info, 6), [doc]) + + docs = [] + full_emb = (emb_data or {}).get("full_episode_embedding", {}) + if full_emb and "embedding" in full_emb: + docs = [{ + "episode_id": episode_id, + "episode_metadata": episode_metadata, + "full_transcript": full_emb.get("text", ""), + "transcript_length": full_emb.get("transcript_length", 0), + "full_episode_embedding": full_emb.get("embedding", []), + "video_path": video_path, + }] + return self.__write_ndjson(self.__output_path(context, episode_info, 6), docs) def __write_sound_event_embeddings( self, @@ -478,11 +465,9 @@ def __write_sound_event_embeddings( video_path: str, ) -> int: emb_data = self.__load_optional(context, "embeddings/sound_events", episode_info) - if not emb_data: - return 0 docs = [] - for i, emb in enumerate(emb_data.get("sound_event_embeddings", [])): + for i, emb in enumerate((emb_data or {}).get("sound_event_embeddings", [])): embedding = emb.get("embedding", []) if not embedding: continue diff --git a/preprocessor/steps/text/sound_event_embedding_step.py b/preprocessor/steps/text/sound_event_embedding_step.py index a785c7414..72db7746d 100644 --- a/preprocessor/steps/text/sound_event_embedding_step.py +++ b/preprocessor/steps/text/sound_event_embedding_step.py @@ -69,6 +69,7 @@ def _process( segments = self.__load_segments(input_data, context) if not segments: + 
self.__save_results([], output_path, input_data) return self.__build_collection(input_data, output_path, 0) self.__ensure_model() diff --git a/preprocessor/steps/video/transcoding_step.py b/preprocessor/steps/video/transcoding_step.py index 798b80ce1..f1b70a3a0 100644 --- a/preprocessor/steps/video/transcoding_step.py +++ b/preprocessor/steps/video/transcoding_step.py @@ -135,10 +135,7 @@ def __compute_all_bitrate_settings( normalized_bitrate = self.__get_normalized_bitrate(src_bitrate, probe_data, context) - if self.config.apply_boost_on_resize_only and self.__is_same_resolution(probe_data): - final_bitrate = normalized_bitrate - adjustment = "preserved (same resolution, no boost)" - elif normalized_bitrate < min_bitrate: + if normalized_bitrate < min_bitrate: final_bitrate = min_bitrate adjustment = f"boosted to minimum ({min_bitrate} Mbps)" elif normalized_bitrate > max_bitrate: From b1ddf23c7ae93a39fe672307ceae74ef4a3ecc13 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Mar 2026 09:39:41 +0100 Subject: [PATCH 71/89] Add BaseTranscriptionStep and refactor steps Introduce BaseTranscriptionStep to centralize transcription output descriptors and cache path resolution. Update TranscriptionImportStep and TranscriptionStep to inherit from the new base class and remove duplicated get_output_descriptors/_get_cache_path implementations. Bump VERSION to 4.0.1 and remove an unnecessary pylint disable comment in face_clusterer.py. 
--- VERSION | 1 + preprocessor/core/base_transcription_step.py | 44 +++++++++++++++++++ .../services/characters/face_clusterer.py | 2 +- preprocessor/services/text/import_step.py | 25 +---------- preprocessor/steps/text/transcription_step.py | 27 +----------- 5 files changed, 50 insertions(+), 49 deletions(-) create mode 100644 VERSION create mode 100644 preprocessor/core/base_transcription_step.py diff --git a/VERSION b/VERSION new file mode 100644 index 000000000..1454f6ed4 --- /dev/null +++ b/VERSION @@ -0,0 +1 @@ +4.0.1 diff --git a/preprocessor/core/base_transcription_step.py b/preprocessor/core/base_transcription_step.py new file mode 100644 index 000000000..6f3efdb15 --- /dev/null +++ b/preprocessor/core/base_transcription_step.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from pathlib import Path +from typing import ( + List, + TypeVar, +) + +from pydantic import BaseModel + +from preprocessor.core.artifacts import ( + EpisodeArtifact, + TranscriptionData, +) +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import JsonFileOutput + +EpisodeInputT = TypeVar('EpisodeInputT', bound=EpisodeArtifact) +ConfigT = TypeVar('ConfigT', bound=BaseModel) + + +class BaseTranscriptionStep(PipelineStep[EpisodeInputT, TranscriptionData, ConfigT]): + def get_output_descriptors(self) -> List[JsonFileOutput]: + return [ + JsonFileOutput( + pattern='{season}/{episode_num}/{episode}.json', + subdir='transcriptions/raw', + min_size_bytes=50, + ), + ] + + def _get_cache_path( + self, input_data: EpisodeInputT, context: ExecutionContext, + ) -> Path: + return self._resolve_output_path( + 0, + context, + { + 'season': input_data.episode_info.season_code(), + 'episode_num': input_data.episode_info.episode_num(), + 'episode': input_data.episode_info.episode_code(), + }, + ) diff --git a/preprocessor/services/characters/face_clusterer.py 
b/preprocessor/services/characters/face_clusterer.py index 566f1a030..f859fc059 100644 --- a/preprocessor/services/characters/face_clusterer.py +++ b/preprocessor/services/characters/face_clusterer.py @@ -25,7 +25,7 @@ def extract_face_embeddings( face_data: List[Dict[str, Any]] = [] for frame_path in frame_files: - img = cv2.imread(str(frame_path)) # pylint: disable=no-member + img = cv2.imread(str(frame_path)) if img is None: continue diff --git a/preprocessor/services/text/import_step.py b/preprocessor/services/text/import_step.py index 2d894aa0e..a886f062e 100644 --- a/preprocessor/services/text/import_step.py +++ b/preprocessor/services/text/import_step.py @@ -14,14 +14,13 @@ SourceVideo, TranscriptionData, ) -from preprocessor.core.base_step import PipelineStep +from preprocessor.core.base_transcription_step import BaseTranscriptionStep from preprocessor.core.context import ExecutionContext -from preprocessor.core.output_descriptors import JsonFileOutput from preprocessor.services.episodes.episode_manager import EpisodeManager from preprocessor.services.episodes.types import EpisodeInfo -class TranscriptionImportStep(PipelineStep[SourceVideo, TranscriptionData, TranscriptionImportConfig]): +class TranscriptionImportStep(BaseTranscriptionStep[SourceVideo, TranscriptionImportConfig]): @property def supports_batch_processing(self) -> bool: return True @@ -60,26 +59,6 @@ def _process(self, input_data: SourceVideo, context: ExecutionContext) -> Transc format='json', ) - def get_output_descriptors(self) -> List[JsonFileOutput]: - return [ - JsonFileOutput( - pattern='{season}/{episode_num}/{episode}.json', - subdir='transcriptions/raw', - min_size_bytes=50, - ), - ] - - def _get_cache_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode_num': input_data.episode_info.episode_num(), - 'episode': 
input_data.episode_info.episode_code(), - }, - ) - def _load_from_cache( self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, ) -> TranscriptionData: diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 6340005e9..2daa06ecf 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -12,9 +12,8 @@ TranscodedVideo, TranscriptionData, ) -from preprocessor.core.base_step import PipelineStep +from preprocessor.core.base_transcription_step import BaseTranscriptionStep from preprocessor.core.context import ExecutionContext -from preprocessor.core.output_descriptors import JsonFileOutput from preprocessor.services.episodes.episode_manager import EpisodeManager from preprocessor.services.io.files import FileOperations from preprocessor.services.transcription.engines.base_engine import TranscriptionEngine @@ -26,7 +25,7 @@ class TranscriptionStep( - PipelineStep[TranscodedVideo, TranscriptionData, TranscriptionConfig], + BaseTranscriptionStep[TranscodedVideo, TranscriptionConfig], ): def __init__(self, config: TranscriptionConfig) -> None: super().__init__(config) @@ -66,28 +65,6 @@ def _process( return self.__construct_result_artifact(output_path, input_data, result) - def get_output_descriptors(self) -> List[JsonFileOutput]: - return [ - JsonFileOutput( - pattern="{season}/{episode_num}/{episode}.json", - subdir="transcriptions/raw", - min_size_bytes=50, - ), - ] - - def _get_cache_path( - self, input_data: TranscodedVideo, context: ExecutionContext, - ) -> Path: - return self._resolve_output_path( - 0, - context, - { - 'season': input_data.episode_info.season_code(), - 'episode_num': input_data.episode_info.episode_num(), - 'episode': input_data.episode_info.episode_code(), - }, - ) - def _load_from_cache( self, cache_path: Path, From 3bf23be9ffcd7c0077929470892d55e8e22e50c9 Mon Sep 17 00:00:00 2001 From: dam2452 
<81230036+dam2452@users.noreply.github.com> Date: Fri, 13 Mar 2026 11:30:26 +0100 Subject: [PATCH 72/89] Add sejm_demo series config Add a new series configuration for a Sejm RP demo. Defines display_name and series_name 'sejm_demo', Elasticsearch index_name 'sejm_demo', scraping settings (character image refs, character and episode URLs), and skips the episode_scraper and character_scraper steps for demo runs. --- preprocessor/series_configs/sejm_demo.json | 28 ++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 preprocessor/series_configs/sejm_demo.json diff --git a/preprocessor/series_configs/sejm_demo.json b/preprocessor/series_configs/sejm_demo.json new file mode 100644 index 000000000..2d1cf789d --- /dev/null +++ b/preprocessor/series_configs/sejm_demo.json @@ -0,0 +1,28 @@ +{ + "display_name": "Sejm RP - Demo", + "indexing": { + "elasticsearch": { + "index_name": "sejm_demo" + } + }, + "scraping": { + "character_references": { + "images_per_character": 2 + }, + "characters": { + "urls": [ + "https://www.sejm.gov.pl/Sejm10.nsf/poslowie.xsp" + ] + }, + "episodes": { + "urls": [ + "https://www.sejm.gov.pl/Sejm10.nsf/transmisje.xsp" + ] + } + }, + "series_name": "sejm_demo", + "skip_steps": [ + "episode_scraper", + "character_scraper" + ] +} From b9f0ed9610c11861a61c9550935e55ff55431eaf Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sat, 14 Mar 2026 21:59:48 +0100 Subject: [PATCH 73/89] Add pipeline_mode and lower missing-image error Set pipeline_mode to "selective" in sejm_demo.json. Change CharacterReferenceDownloader to log "No suitable images found" as a warning instead of an error to reduce noisy error reports when characters have no available reference images. 
--- preprocessor/series_configs/sejm_demo.json | 1 + preprocessor/services/characters/reference_downloader.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/preprocessor/series_configs/sejm_demo.json b/preprocessor/series_configs/sejm_demo.json index 2d1cf789d..fad8932b7 100644 --- a/preprocessor/series_configs/sejm_demo.json +++ b/preprocessor/series_configs/sejm_demo.json @@ -5,6 +5,7 @@ "index_name": "sejm_demo" } }, + "pipeline_mode": "selective", "scraping": { "character_references": { "images_per_character": 2 diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index e2a23126d..43d5270a9 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -266,7 +266,7 @@ def __log_final_results(self, char_name: str, saved_count: int) -> None: elif saved_count > 0: self.logger.warning(f'{char_name}: {saved_count}/{self.__images_per_character} images (incomplete)') else: - self.logger.error(f'{char_name}: No suitable images found') + self.logger.warning(f'{char_name}: No suitable images found') @staticmethod def __apply_random_delay() -> None: From f8fdd0cebbfb93229090add5025a0ade86878222 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Mar 2026 10:50:55 +0100 Subject: [PATCH 74/89] Add global-completion flag and exhausted marker handling Introduce a uses_global_completion property (default True) on PipelineStep and update PipelineExecutor to only mark/skip global steps when the flag is set. Disable caching and global completion for the character reference step and forward force_rerun to the downloader. In CharacterReferenceDownloader add a force_rerun option, short-circuit runs when a .exhausted marker exists, and create that marker when no images are saved to avoid repeated futile scrapes. 
These changes prevent unnecessary global completion bookkeeping for this step and skip re-running searches for characters previously found to be exhausted unless explicitly forced. --- preprocessor/app/pipeline_builder.py | 8 +++--- preprocessor/core/base_step.py | 4 +++ .../characters/reference_downloader.py | 12 +++++++++ .../scraping/reference_processor_step.py | 25 ++++++------------- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/preprocessor/app/pipeline_builder.py b/preprocessor/app/pipeline_builder.py index 8c95088f5..862b8f709 100644 --- a/preprocessor/app/pipeline_builder.py +++ b/preprocessor/app/pipeline_builder.py @@ -174,14 +174,16 @@ def run(self, source_path: Path, episode_manager: EpisodeManager) -> None: def __run_global_step(self, step: PipelineStep) -> None: self.__context.logger.info(f"=== Running Global Step: {step.name} ===") - if self.__should_skip_global_step(step.name): + if step.uses_global_completion and self.__should_skip_global_step(step.name): self.__context.logger.info(f"Skipping {step.name} (already completed)") return try: - self.__mark_step_in_progress(step.name, 'all') + if step.uses_global_completion: + self.__mark_step_in_progress(step.name, 'all') step.execute(None, self.__context) - self.__mark_step_completed(step.name, 'all') + if step.uses_global_completion: + self.__mark_step_completed(step.name, 'all') except Exception as e: self.__context.logger.error(f"Global step {step.name} failed: {e}") raise diff --git a/preprocessor/core/base_step.py b/preprocessor/core/base_step.py index 6f27f2c86..3d5235431 100644 --- a/preprocessor/core/base_step.py +++ b/preprocessor/core/base_step.py @@ -55,6 +55,10 @@ def is_global(self) -> bool: def uses_caching(self) -> bool: return True + @property + def uses_global_completion(self) -> bool: + return True + @property def supports_batch_processing(self) -> bool: return False diff --git a/preprocessor/services/characters/reference_downloader.py 
b/preprocessor/services/characters/reference_downloader.py index 43d5270a9..b2a468cb1 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -58,6 +58,7 @@ def __init__(self, args: Dict[str, Any]) -> None: self.__min_width: int = settings.image_scraper.min_image_width self.__min_height: int = settings.image_scraper.min_image_height self.__search_mode: str = self._args.get('search_mode', 'normal') + self.__force_rerun: bool = self._args.get('force_rerun', False) self.__search_engine: BaseImageSearch = self.__create_search_engine() self.__face_app: Optional[FaceAnalysis] = None @@ -81,6 +82,10 @@ def _get_expected_outputs(self, item: ProcessingItem) -> List[OutputSpec]: char_name = item.metadata['char_name'] output_folder = self.__output_dir / char_name.replace(' ', '_').lower() + exhausted_marker = output_folder / '.exhausted' + if not self.__force_rerun and exhausted_marker.exists(): + return [OutputSpec(path=exhausted_marker, required=True)] + return [ OutputSpec(path=output_folder / f'{i:02d}.jpg', required=True) for i in range(self.__images_per_character) @@ -129,6 +134,8 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) self.__log_final_results(char_name, saved_count) self.__apply_random_delay() + if saved_count == 0: + self.__mark_exhausted(output_folder, char_name) def __create_search_engine(self) -> BaseImageSearch: if self.__search_mode == 'premium': @@ -260,6 +267,11 @@ def __sort_results_by_extension(results: List[Dict[str, Any]]) -> List[Dict[str, ), ) + def __mark_exhausted(self, output_folder: Path, char_name: str) -> None: + exhausted_marker = output_folder / '.exhausted' + exhausted_marker.touch() + self.logger.info(f'{char_name}: marked as exhausted (no images found after search)') + def __log_final_results(self, char_name: str, saved_count: int) -> None: if saved_count >= self.__images_per_character: self.logger.info(f'{char_name}: 
{saved_count}/{self.__images_per_character} images') diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index a29546e13..b3ebff9af 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -16,17 +16,13 @@ class CharacterReferenceStep( def is_global(self) -> bool: return True - def _get_cache_path( - self, input_data: SourceVideo, context: ExecutionContext, - ) -> Path: - _, output_dir = self.__resolve_paths(context) - return output_dir + @property + def uses_caching(self) -> bool: + return False - def _load_from_cache( - self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, - ) -> SourceVideo: - context.logger.info(f"Character references already exist in: {cache_path}") - return input_data + @property + def uses_global_completion(self) -> bool: + return False def _process( self, input_data: SourceVideo, context: ExecutionContext, @@ -39,14 +35,6 @@ def _process( self.__download_character_references(characters_path, output_dir, context) return input_data - @staticmethod - def _should_use_cache( - cache_path: Path, _input_data: SourceVideo, context: ExecutionContext, - ) -> bool: - if context.force_rerun: - return False - return cache_path.exists() and any(cache_path.iterdir()) - @staticmethod def __resolve_paths(context: ExecutionContext) -> Tuple[Path, Path]: base_dir = get_base_output_dir(context.series_name) @@ -69,6 +57,7 @@ def __download_character_references( "search_engine": self.config.search_engine, "images_per_character": self.config.images_per_character, "series_name": context.series_name, + "force_rerun": context.force_rerun, }, ) From 03fa082263bcad19face1f95e904446f8f40f9bc Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Mar 2026 17:26:22 +0100 Subject: [PATCH 75/89] Update reference_downloader.py --- 
preprocessor/services/characters/reference_downloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index b2a468cb1..32ca4bfc3 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -169,7 +169,7 @@ def __handle_retry_logic(self, error: Exception, attempt: int, char_name: str) - self.logger.warning(f'Attempt {attempt + 1} failed for {char_name}, retrying in {delay}s: {error}') time.sleep(delay) else: - self.logger.error(f'All retry attempts failed for {char_name}: {error}') + self.logger.warning(f'All retry attempts failed for {char_name}: {error}') def __download_and_process_images( self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int, From fd1611b16cda350f2a93593ba2febfce9700751f Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Mar 2026 17:36:29 +0100 Subject: [PATCH 76/89] Add search_query_template to scraping config MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Introduce a configurable search_query_template for character reference scraping. Added the field to SeriesConfig and CharacterReferencesConfig (dataclass) and to the Pydantic CharacterReferenceConfig with a default of "Serial {series_name} {char_name} postać". Updated series defaults and sejm_demo config to include templates (sejm_demo uses "{char_name} poseł"). PipelineFactory and the reference scraping step now pass the template through to the downloader. CharacterReferenceDownloader reads the template from args (with a fallback default) and builds search queries via .format(series_name=..., char_name=...), enabling per-series/custom search query formats. 
--- preprocessor/app/pipeline_factory.py | 1 + preprocessor/config/series_config.py | 2 ++ preprocessor/config/step_configs.py | 1 + preprocessor/series_configs/defaults.json | 3 ++- preprocessor/series_configs/sejm_demo.json | 3 ++- preprocessor/services/characters/reference_downloader.py | 7 ++++++- preprocessor/steps/scraping/reference_processor_step.py | 1 + 7 files changed, 15 insertions(+), 3 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 7be781437..cc277eeb4 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -144,6 +144,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t config=CharacterReferenceConfig( search_engine=series_config.scraping.character_references.search_engine, images_per_character=series_config.scraping.character_references.images_per_character, + search_query_template=series_config.scraping.character_references.search_query_template, ), ) diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 3b3eb8fc7..31024b9d5 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -40,6 +40,7 @@ class CharacterScrapingConfig: class CharacterReferencesConfig: images_per_character: int search_engine: str + search_query_template: str @dataclass @@ -153,6 +154,7 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': character_references=CharacterReferencesConfig( search_engine=data['scraping']['character_references']['search_engine'], images_per_character=data['scraping']['character_references']['images_per_character'], + search_query_template=data['scraping']['character_references']['search_query_template'], ), ), processing=SeriesConfig.__build_processing_config(data), diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 320a7de8a..8e766330e 100644 --- a/preprocessor/config/step_configs.py +++ 
b/preprocessor/config/step_configs.py @@ -230,6 +230,7 @@ class CharacterReferenceConfig(BaseModel): images_per_character: int = Field(default=5, ge=0, le=20) max_parallel_episodes: int = Field(default=4, ge=1, le=8) search_engine: str = "duckduckgo" + search_query_template: str = "Serial {series_name} {char_name} postać" class CharacterReferenceProcessorConfig(BaseModel): diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index 805c052b9..8f694f511 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -36,7 +36,8 @@ "scraping": { "character_references": { "images_per_character": 3, - "search_engine": "duckduckgo" + "search_engine": "duckduckgo", + "search_query_template": "Serial {series_name} {char_name} posta\u0107" }, "characters": { "parser_mode": "normal" diff --git a/preprocessor/series_configs/sejm_demo.json b/preprocessor/series_configs/sejm_demo.json index fad8932b7..670d5091b 100644 --- a/preprocessor/series_configs/sejm_demo.json +++ b/preprocessor/series_configs/sejm_demo.json @@ -8,7 +8,8 @@ "pipeline_mode": "selective", "scraping": { "character_references": { - "images_per_character": 2 + "images_per_character": 2, + "search_query_template": "{char_name} pose\u0142" }, "characters": { "urls": [ diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index 32ca4bfc3..fe011d666 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -59,6 +59,9 @@ def __init__(self, args: Dict[str, Any]) -> None: self.__min_height: int = settings.image_scraper.min_image_height self.__search_mode: str = self._args.get('search_mode', 'normal') self.__force_rerun: bool = self._args.get('force_rerun', False) + self.__search_query_template: str = self._args.get( + 'search_query_template', 'Serial {series_name} {char_name} 
postać', + ) self.__search_engine: BaseImageSearch = self.__create_search_engine() self.__face_app: Optional[FaceAnalysis] = None @@ -127,7 +130,9 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) if saved_count >= self.__images_per_character: return - search_query = f'Serial {self.__series_name} {char_name} postać' + search_query = self.__search_query_template.format( + series_name=self.__series_name, char_name=char_name, + ) self.logger.info(f'Searching [{self.__search_engine.name}]: {search_query}') saved_count = self.__execute_search_with_retries(search_query, char_name, output_folder, saved_count) diff --git a/preprocessor/steps/scraping/reference_processor_step.py b/preprocessor/steps/scraping/reference_processor_step.py index b3ebff9af..16a783b01 100644 --- a/preprocessor/steps/scraping/reference_processor_step.py +++ b/preprocessor/steps/scraping/reference_processor_step.py @@ -57,6 +57,7 @@ def __download_character_references( "search_engine": self.config.search_engine, "images_per_character": self.config.images_per_character, "series_name": context.series_name, + "search_query_template": self.config.search_query_template, "force_rerun": context.force_rerun, }, ) From eee5f1113f0db49e677f979ece5a0c07e06919c8 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Mar 2026 18:01:35 +0100 Subject: [PATCH 77/89] Use RapidAPI for Google Search; add SerpAPI Switch Google image search to RapidAPI: rename config key to google_search_key and read RAPIDAPI_GOOGLE_SEARCH_KEY from env. Replace serpapi client usage in GoogleImageSearch with requests to the RapidAPI Google Search endpoint (update names/error text and result parsing). Add a new SerpApiImageSearch class that retains the original serpapi-based implementation and export it from the package. 
Update reference_downloader to accept no direct DuckDuckGo fallback (search engine is optional), use settings.image_scraper.google_search_key for premium mode, and implement a browser-based DuckDuckGo i.js response fallback to gather image URLs. Improve download handling by checking content-type, add extraction of og:image when HTML is returned, and move/surface result-sorting utility; also adjust imports and logging accordingly. --- preprocessor/config/config.py | 6 +- .../characters/image_search/__init__.py | 3 +- .../image_search/google_image_search.py | 46 ++++---- .../image_search/serpapi_image_search.py | 51 +++++++++ .../characters/reference_downloader.py | 104 ++++++++++++++---- 5 files changed, 164 insertions(+), 46 deletions(-) create mode 100644 preprocessor/services/characters/image_search/serpapi_image_search.py diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 5ea79e638..3c161656d 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -227,14 +227,14 @@ class ImageScraperSettings(BaseAPISettings): retry_delay: float = 3.0 @property - def serpapi_key(self) -> Optional[str]: + def google_search_key(self) -> Optional[str]: return self.api_key @classmethod def from_env(cls) -> 'ImageScraperSettings': api_key = None - if os.getenv('SERPAPI_API_KEY'): - api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) + if os.getenv('RAPIDAPI_GOOGLE_SEARCH_KEY'): + api_key = SecretStr(os.getenv('RAPIDAPI_GOOGLE_SEARCH_KEY', '')) return cls(_api_key=api_key) diff --git a/preprocessor/services/characters/image_search/__init__.py b/preprocessor/services/characters/image_search/__init__.py index 9ca06eb9f..af3687fe7 100644 --- a/preprocessor/services/characters/image_search/__init__.py +++ b/preprocessor/services/characters/image_search/__init__.py @@ -1,5 +1,6 @@ from preprocessor.services.characters.image_search.duckduckgo_image_search import DuckDuckGoImageSearch from 
preprocessor.services.characters.image_search.google_image_search import GoogleImageSearch from preprocessor.services.characters.image_search.image_search import BaseImageSearch +from preprocessor.services.characters.image_search.serpapi_image_search import SerpApiImageSearch -__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch'] +__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch', 'SerpApiImageSearch'] diff --git a/preprocessor/services/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py index 5a9cfa013..02396edb3 100644 --- a/preprocessor/services/characters/image_search/google_image_search.py +++ b/preprocessor/services/characters/image_search/google_image_search.py @@ -4,48 +4,50 @@ List, ) -from serpapi import GoogleSearch +import requests from preprocessor.services.characters.image_search.image_search import BaseImageSearch +_RAPIDAPI_HOST = 'google-search116.p.rapidapi.com' +_RAPIDAPI_URL = f'https://{_RAPIDAPI_HOST}/' + class GoogleImageSearch(BaseImageSearch): def __init__(self, api_key: str, max_results: int = 50) -> None: super().__init__(max_results) if not api_key: - raise ValueError('SerpAPI key is required for Google Image Search') + raise ValueError('RapidAPI key is required for Google Image Search') self.__api_key = api_key @property def name(self) -> str: - return 'Google Images API' + return 'Google Search API (RapidAPI)' def search(self, query: str) -> List[Dict[str, str]]: - params = self.__build_search_params(query) - search_client = GoogleSearch(params) - raw_results = search_client.get_dict() - + raw_results = self.__call_api(query) return self.__extract_image_data(raw_results) - def __build_search_params(self, query: str) -> Dict[str, str]: - return { - 'engine': 'google_images', - 'q': query, + def __call_api(self, query: str) -> Dict[str, Any]: + headers = { + 'x-rapidapi-key': self.__api_key, + 'x-rapidapi-host': 
_RAPIDAPI_HOST, + } + params = { + 'query': query, + 'limit': str(self._max_results), 'hl': 'pl', 'gl': 'pl', - 'api_key': self.__api_key, } + response = requests.get(_RAPIDAPI_URL, headers=headers, params=params, timeout=15) + response.raise_for_status() + return response.json() def __extract_image_data(self, raw_results: Dict[str, Any]) -> List[Dict[str, str]]: - images: List[Dict[str, str]] = [] - image_results = raw_results.get('images_results', [])[:self._max_results] - - for img_result in image_results: - images.append({ - 'image': img_result.get('original', ''), - 'thumbnail': img_result.get('thumbnail', ''), - }) - - return images + results = raw_results.get('results', [])[:self._max_results] + return [ + {'image': r['url'], 'thumbnail': ''} + for r in results + if r.get('url') + ] diff --git a/preprocessor/services/characters/image_search/serpapi_image_search.py b/preprocessor/services/characters/image_search/serpapi_image_search.py new file mode 100644 index 000000000..244569e41 --- /dev/null +++ b/preprocessor/services/characters/image_search/serpapi_image_search.py @@ -0,0 +1,51 @@ +from typing import ( + Any, + Dict, + List, +) + +from serpapi import GoogleSearch + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + + +class SerpApiImageSearch(BaseImageSearch): + def __init__(self, api_key: str, max_results: int = 50) -> None: + super().__init__(max_results) + + if not api_key: + raise ValueError('SerpAPI key is required for Google Image Search') + + self.__api_key = api_key + + @property + def name(self) -> str: + return 'Google Images API (SerpAPI)' + + def search(self, query: str) -> List[Dict[str, str]]: + params = self.__build_search_params(query) + search_client = GoogleSearch(params) + raw_results = search_client.get_dict() + + return self.__extract_image_data(raw_results) + + def __build_search_params(self, query: str) -> Dict[str, str]: + return { + 'engine': 'google_images', + 'q': query, + 'hl': 'pl', + 
'gl': 'pl', + 'api_key': self.__api_key, + } + + def __extract_image_data(self, raw_results: Dict[str, Any]) -> List[Dict[str, str]]: + images: List[Dict[str, str]] = [] + image_results = raw_results.get('images_results', [])[:self._max_results] + + for img_result in image_results: + images.append({ + 'image': img_result.get('original', ''), + 'thumbnail': img_result.get('thumbnail', ''), + }) + + return images diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index fe011d666..9d1f61b2a 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -11,6 +11,7 @@ List, Optional, ) +from urllib.parse import quote_plus import cv2 from insightface.app import FaceAnalysis @@ -19,6 +20,7 @@ BrowserContext, Page, Playwright, + Response, sync_playwright, ) @@ -26,7 +28,6 @@ from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( BaseImageSearch, - DuckDuckGoImageSearch, GoogleImageSearch, ) from preprocessor.services.core.base_processor import ( @@ -63,7 +64,7 @@ def __init__(self, args: Dict[str, Any]) -> None: 'search_query_template', 'Serial {series_name} {char_name} postać', ) - self.__search_engine: BaseImageSearch = self.__create_search_engine() + self.__search_engine: Optional[BaseImageSearch] = self.__create_search_engine() self.__face_app: Optional[FaceAnalysis] = None self.__playwright: Optional[Playwright] = None self.__browser_context: Optional[BrowserContext] = None @@ -133,7 +134,7 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) search_query = self.__search_query_template.format( series_name=self.__series_name, char_name=char_name, ) - self.logger.info(f'Searching [{self.__search_engine.name}]: {search_query}') + self.logger.info(f'Searching: {search_query}') saved_count = 
self.__execute_search_with_retries(search_query, char_name, output_folder, saved_count) self.__log_final_results(char_name, saved_count) @@ -142,13 +143,13 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) if saved_count == 0: self.__mark_exhausted(output_folder, char_name) - def __create_search_engine(self) -> BaseImageSearch: + def __create_search_engine(self) -> Optional[BaseImageSearch]: if self.__search_mode == 'premium': return GoogleImageSearch( - api_key=settings.image_scraper.serpapi_key, + api_key=settings.image_scraper.google_search_key, max_results=self.__max_results, ) - return DuckDuckGoImageSearch(max_results=self.__max_results) + return None def __prepare_output_folder(self, char_name: str) -> Path: output_folder = self.__output_dir / char_name.replace(' ', '_').lower() @@ -160,7 +161,10 @@ def __execute_search_with_retries( ) -> int: for attempt in range(settings.image_scraper.retry_attempts): try: - results = self.__search_engine.search(query) + if self.__search_engine is not None: + results = self.__search_engine.search(query) + else: + results = self.__search_via_browser(query) return self.__download_and_process_images(results, output_folder, saved_count) except Exception as e: if isinstance(e, KeyboardInterrupt): @@ -168,6 +172,40 @@ def __execute_search_with_retries( self.__handle_retry_logic(e, attempt, char_name) return saved_count + def __search_via_browser(self, query: str) -> List[Dict[str, Any]]: + i_js_responses: List[Response] = [] + page = self.__browser_context.new_page() + + def _on_response(response: Response) -> None: + if '/i.js' in response.url and response.status == 200: + i_js_responses.append(response) + + page.on('response', _on_response) + + try: + url = f'https://duckduckgo.com/?q={quote_plus(query)}&iax=images&ia=images' + page.goto(url, wait_until='networkidle', timeout=20000) + + results: List[Dict[str, Any]] = [] + for response in i_js_responses: + try: + data = response.json() + for 
item in data.get('results', []): + img_url = item.get('image', '') + if img_url: + results.append({'image': img_url}) + if len(results) >= self.__max_results: + break + except Exception as e: + self.logger.debug(f'Failed to parse DDG i.js response: {e}') + + if not results: + raise ValueError('No results found') + + return results + finally: + page.close() + def __handle_retry_logic(self, error: Exception, attempt: int, char_name: str) -> None: if attempt < settings.image_scraper.retry_attempts - 1: delay = settings.image_scraper.retry_delay * (2 ** attempt) @@ -210,10 +248,14 @@ def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np. if not response or response.status != 200: return None - if 'image' not in response.headers.get('content-type', ''): - return None + content_type = response.headers.get('content-type', '') + if 'image' in content_type: + return self.__decode_image_bytes(response.body(), img_url) - return self.__decode_image_bytes(response.body(), img_url) + if 'text/html' in content_type: + return self.__extract_og_image(page, img_url) + + return None except TimeoutError: self.logger.debug(f'Timeout downloading image {img_url}') @@ -225,6 +267,28 @@ def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np. self.logger.debug(f'Failed to download image {img_url}: {msg}') return None + def __extract_og_image(self, page: Page, source_url: str) -> Optional[np.ndarray]: + try: + og_image_url = page.evaluate( + '() => document.querySelector("meta[property=\'og:image\']")?.content ?? 
""', + ) + if not og_image_url: + return None + + response = page.goto( + og_image_url, + timeout=settings.image_scraper.page_navigation_timeout, + wait_until='domcontentloaded', + ) + if not response or response.status != 200: + return None + if 'image' not in response.headers.get('content-type', ''): + return None + return self.__decode_image_bytes(response.body(), og_image_url) + except Exception as e: + self.logger.debug(f'Failed to extract og:image from {source_url}: {e}') + return None + def __decode_image_bytes(self, img_bytes: bytes, img_url: str) -> Optional[np.ndarray]: if not img_bytes: return None @@ -262,16 +326,6 @@ def __validate_and_save_image( cv2.imwrite(str(output_folder / filename), img) return True - @staticmethod - def __sort_results_by_extension(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - return sorted( - results, - key=lambda x: ( - 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, - 1 if x.get('image', '').lower().endswith('.png') else 2, - ), - ) - def __mark_exhausted(self, output_folder: Path, char_name: str) -> None: exhausted_marker = output_folder / '.exhausted' exhausted_marker.touch() @@ -285,6 +339,16 @@ def __log_final_results(self, char_name: str, saved_count: int) -> None: else: self.logger.warning(f'{char_name}: No suitable images found') + @staticmethod + def __sort_results_by_extension(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: + return sorted( + results, + key=lambda x: ( + 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, + 1 if x.get('image', '').lower().endswith('.png') else 2, + ), + ) + @staticmethod def __apply_random_delay() -> None: delay = random.uniform( From 69e21abdec1eead72e2926d12c6b852fdeeab422 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Mar 2026 18:13:15 +0100 Subject: [PATCH 78/89] Add search engines and lower min image size Reduce min image dimensions from 600x800 to 60x60 and introduce 
explicit search engine handling. Default search_engine in CharacterReferenceConfig changed to "normal"; sejm_demo.json now requests "premium" engine and fixes the search query string. DuckDuckGoImageSearch now normalizes results to dicts with 'image' and 'thumbnail'; GoogleImageSearch extracts knowledge panel image first and returns up to max_results. Character reference downloader was simplified to always use a search engine instance (Google when 'premium', otherwise DuckDuckGo), removed the browser-based DuckDuckGo fallback and unused imports, and adjusted arg/variable names for clarity. --- preprocessor/config/config.py | 4 +- preprocessor/config/step_configs.py | 2 +- preprocessor/series_configs/sejm_demo.json | 1 + .../image_search/duckduckgo_image_search.py | 6 ++- .../image_search/google_image_search.py | 17 +++--- .../characters/reference_downloader.py | 52 +++---------------- 6 files changed, 27 insertions(+), 55 deletions(-) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 3c161656d..63121939a 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -218,8 +218,8 @@ class ImageHashSettings(OutputDirMixin): @dataclass(frozen=True) class ImageScraperSettings(BaseAPISettings): max_results_to_scrape: int = 50 - min_image_height: int = 600 - min_image_width: int = 800 + min_image_height: int = 60 + min_image_width: int = 60 page_navigation_timeout: int = 30000 request_delay_max: float = 6.0 request_delay_min: float = 3.0 diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 8e766330e..b0ec137ae 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -229,7 +229,7 @@ class CharacterScraperConfig(BaseModel): class CharacterReferenceConfig(BaseModel): images_per_character: int = Field(default=5, ge=0, le=20) max_parallel_episodes: int = Field(default=4, ge=1, le=8) - search_engine: str = "duckduckgo" + search_engine: str = "normal" 
search_query_template: str = "Serial {series_name} {char_name} postać" diff --git a/preprocessor/series_configs/sejm_demo.json b/preprocessor/series_configs/sejm_demo.json index 670d5091b..5fe624a80 100644 --- a/preprocessor/series_configs/sejm_demo.json +++ b/preprocessor/series_configs/sejm_demo.json @@ -9,6 +9,7 @@ "scraping": { "character_references": { "images_per_character": 2, + "search_engine": "premium", "search_query_template": "{char_name} pose\u0142" }, "characters": { diff --git a/preprocessor/services/characters/image_search/duckduckgo_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_image_search.py index e9363c915..a9fb18a83 100644 --- a/preprocessor/services/characters/image_search/duckduckgo_image_search.py +++ b/preprocessor/services/characters/image_search/duckduckgo_image_search.py @@ -16,4 +16,8 @@ def name(self) -> str: def search(self, query: str) -> List[Dict[str, str]]: with DDGS() as ddgs: results = ddgs.images(query, max_results=self._max_results) - return list(results) + return [ + {'image': r.get('thumbnail') or r.get('image', ''), 'thumbnail': r.get('thumbnail', '')} + for r in results + if r.get('thumbnail') or r.get('image') + ] diff --git a/preprocessor/services/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py index 02396edb3..e8f63caa0 100644 --- a/preprocessor/services/characters/image_search/google_image_search.py +++ b/preprocessor/services/characters/image_search/google_image_search.py @@ -45,9 +45,14 @@ def __call_api(self, query: str) -> Dict[str, Any]: return response.json() def __extract_image_data(self, raw_results: Dict[str, Any]) -> List[Dict[str, str]]: - results = raw_results.get('results', [])[:self._max_results] - return [ - {'image': r['url'], 'thumbnail': ''} - for r in results - if r.get('url') - ] + images: List[Dict[str, str]] = [] + + kp_url = raw_results.get('knowledge_panel', {}).get('image', {}).get('url', '') + if 
kp_url: + images.append({'image': kp_url, 'thumbnail': kp_url}) + + for r in raw_results.get('results', []): + if r.get('url'): + images.append({'image': r['url'], 'thumbnail': ''}) + + return images[:self._max_results] diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index 9d1f61b2a..4e001a84f 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -11,7 +11,6 @@ List, Optional, ) -from urllib.parse import quote_plus import cv2 from insightface.app import FaceAnalysis @@ -20,7 +19,6 @@ BrowserContext, Page, Playwright, - Response, sync_playwright, ) @@ -28,6 +26,7 @@ from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( BaseImageSearch, + DuckDuckGoImageSearch, GoogleImageSearch, ) from preprocessor.services.core.base_processor import ( @@ -58,13 +57,13 @@ def __init__(self, args: Dict[str, Any]) -> None: self.__max_results: int = settings.image_scraper.max_results_to_scrape self.__min_width: int = settings.image_scraper.min_image_width self.__min_height: int = settings.image_scraper.min_image_height - self.__search_mode: str = self._args.get('search_mode', 'normal') + self.__search_engine_name: str = self._args.get('search_engine', 'normal') self.__force_rerun: bool = self._args.get('force_rerun', False) self.__search_query_template: str = self._args.get( 'search_query_template', 'Serial {series_name} {char_name} postać', ) - self.__search_engine: Optional[BaseImageSearch] = self.__create_search_engine() + self.__search_engine: BaseImageSearch = self.__create_search_engine() self.__face_app: Optional[FaceAnalysis] = None self.__playwright: Optional[Playwright] = None self.__browser_context: Optional[BrowserContext] = None @@ -143,13 +142,13 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) if 
saved_count == 0: self.__mark_exhausted(output_folder, char_name) - def __create_search_engine(self) -> Optional[BaseImageSearch]: - if self.__search_mode == 'premium': + def __create_search_engine(self) -> BaseImageSearch: + if self.__search_engine_name == 'premium': return GoogleImageSearch( api_key=settings.image_scraper.google_search_key, max_results=self.__max_results, ) - return None + return DuckDuckGoImageSearch(max_results=self.__max_results) def __prepare_output_folder(self, char_name: str) -> Path: output_folder = self.__output_dir / char_name.replace(' ', '_').lower() @@ -161,10 +160,7 @@ def __execute_search_with_retries( ) -> int: for attempt in range(settings.image_scraper.retry_attempts): try: - if self.__search_engine is not None: - results = self.__search_engine.search(query) - else: - results = self.__search_via_browser(query) + results = self.__search_engine.search(query) return self.__download_and_process_images(results, output_folder, saved_count) except Exception as e: if isinstance(e, KeyboardInterrupt): @@ -172,40 +168,6 @@ def __execute_search_with_retries( self.__handle_retry_logic(e, attempt, char_name) return saved_count - def __search_via_browser(self, query: str) -> List[Dict[str, Any]]: - i_js_responses: List[Response] = [] - page = self.__browser_context.new_page() - - def _on_response(response: Response) -> None: - if '/i.js' in response.url and response.status == 200: - i_js_responses.append(response) - - page.on('response', _on_response) - - try: - url = f'https://duckduckgo.com/?q={quote_plus(query)}&iax=images&ia=images' - page.goto(url, wait_until='networkidle', timeout=20000) - - results: List[Dict[str, Any]] = [] - for response in i_js_responses: - try: - data = response.json() - for item in data.get('results', []): - img_url = item.get('image', '') - if img_url: - results.append({'image': img_url}) - if len(results) >= self.__max_results: - break - except Exception as e: - self.logger.debug(f'Failed to parse DDG i.js 
response: {e}') - - if not results: - raise ValueError('No results found') - - return results - finally: - page.close() - def __handle_retry_logic(self, error: Exception, attempt: int, char_name: str) -> None: if attempt < settings.image_scraper.retry_attempts - 1: delay = settings.image_scraper.retry_delay * (2 ** attempt) From f26e38fbfdfbbbf1ae50c606d4ef3ed6733218dc Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 15 Mar 2026 20:26:28 +0100 Subject: [PATCH 79/89] Refactor image search and reference downloader Switch image scraping to SerpAPI and browser-based Bing, unify search interface to iterators, and overhaul reference downloader pipeline. Updates include: increase max results (50->100), adjust request/retry delays, rename env key to SERPAPI_API_KEY (settings property serpapi_key), and change sejm_demo search_engine from "premium" to "normal". Added BrowserBingImageSearch (browser scraping), updated DuckDuckGo to yield results with pre-search delay, replaced RapidAPI Google search with serpapi.GoogleSearch, and removed the old serpapi_image_search module. reference_downloader was rewritten to initialize the chosen search engine after launching the browser, stream results, collect candidate images, perform consensus-based face embedding filtering/scoring, decode images via PIL, and save top candidates (removed OG extraction and previous simple validation/sorting logic). 
--- preprocessor/config/config.py | 14 +- preprocessor/series_configs/sejm_demo.json | 2 +- .../characters/image_search/__init__.py | 4 +- .../image_search/bing_image_search.py | 62 ++++++ .../image_search/duckduckgo_image_search.py | 27 ++- .../image_search/google_image_search.py | 51 ++--- .../characters/image_search/image_search.py | 4 +- ...arch.py => serpapi_image_search.py.delete} | 0 .../characters/reference_downloader.py | 189 +++++++++++------- 9 files changed, 226 insertions(+), 127 deletions(-) create mode 100644 preprocessor/services/characters/image_search/bing_image_search.py rename preprocessor/services/characters/image_search/{serpapi_image_search.py => serpapi_image_search.py.delete} (100%) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 63121939a..66e75b3df 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -217,24 +217,24 @@ class ImageHashSettings(OutputDirMixin): @dataclass(frozen=True) class ImageScraperSettings(BaseAPISettings): - max_results_to_scrape: int = 50 + max_results_to_scrape: int = 100 min_image_height: int = 60 min_image_width: int = 60 page_navigation_timeout: int = 30000 - request_delay_max: float = 6.0 - request_delay_min: float = 3.0 + request_delay_max: float = 5.0 + request_delay_min: float = 2.0 retry_attempts: int = 3 - retry_delay: float = 3.0 + retry_delay: float = 5.0 @property - def google_search_key(self) -> Optional[str]: + def serpapi_key(self) -> Optional[str]: return self.api_key @classmethod def from_env(cls) -> 'ImageScraperSettings': api_key = None - if os.getenv('RAPIDAPI_GOOGLE_SEARCH_KEY'): - api_key = SecretStr(os.getenv('RAPIDAPI_GOOGLE_SEARCH_KEY', '')) + if os.getenv('SERPAPI_API_KEY'): + api_key = SecretStr(os.getenv('SERPAPI_API_KEY', '')) return cls(_api_key=api_key) diff --git a/preprocessor/series_configs/sejm_demo.json b/preprocessor/series_configs/sejm_demo.json index 5fe624a80..2164929bb 100644 --- 
a/preprocessor/series_configs/sejm_demo.json +++ b/preprocessor/series_configs/sejm_demo.json @@ -9,7 +9,7 @@ "scraping": { "character_references": { "images_per_character": 2, - "search_engine": "premium", + "search_engine": "normal", "search_query_template": "{char_name} pose\u0142" }, "characters": { diff --git a/preprocessor/services/characters/image_search/__init__.py b/preprocessor/services/characters/image_search/__init__.py index af3687fe7..bcfc66e90 100644 --- a/preprocessor/services/characters/image_search/__init__.py +++ b/preprocessor/services/characters/image_search/__init__.py @@ -1,6 +1,6 @@ +from preprocessor.services.characters.image_search.bing_image_search import BrowserBingImageSearch from preprocessor.services.characters.image_search.duckduckgo_image_search import DuckDuckGoImageSearch from preprocessor.services.characters.image_search.google_image_search import GoogleImageSearch from preprocessor.services.characters.image_search.image_search import BaseImageSearch -from preprocessor.services.characters.image_search.serpapi_image_search import SerpApiImageSearch -__all__ = ['BaseImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch', 'SerpApiImageSearch'] +__all__ = ['BaseImageSearch', 'BrowserBingImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch'] diff --git a/preprocessor/services/characters/image_search/bing_image_search.py b/preprocessor/services/characters/image_search/bing_image_search.py new file mode 100644 index 000000000..54d922386 --- /dev/null +++ b/preprocessor/services/characters/image_search/bing_image_search.py @@ -0,0 +1,62 @@ +from typing import ( + Any, + Dict, + Iterator, + List, +) +from urllib.parse import quote + +from patchright.sync_api import BrowserContext + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + +_SEARCH_URL = 'https://www.bing.com/images/search' +_RESULT_WAIT_MS = 3000 +_SCROLL_STEPS = 3 +_SCROLL_PAUSE_MS = 1500 + + +class 
BrowserBingImageSearch(BaseImageSearch): + def __init__(self, browser_context: BrowserContext, max_results: int = 100) -> None: + super().__init__(max_results) + self.__browser_context = browser_context + + @property + def name(self) -> str: + return 'Bing Images (Browser)' + + def search(self, query: str) -> Iterator[Dict[str, str]]: + page = self.__browser_context.new_page() + try: + url = f'{_SEARCH_URL}?q={quote(query)}&count={self._max_results}&form=HDRSC2' + page.goto(url, wait_until='domcontentloaded', timeout=30000) + page.wait_for_timeout(_RESULT_WAIT_MS) + self.__scroll_to_load_more(page) + yield from self.__extract_results(page) + finally: + page.close() + + @staticmethod + def __scroll_to_load_more(page: Any) -> None: + for _ in range(_SCROLL_STEPS): + page.evaluate('window.scrollBy(0, window.innerHeight)') + page.wait_for_timeout(_SCROLL_PAUSE_MS) + + def __extract_results(self, page: Any) -> Iterator[Dict[str, str]]: + raw: List[Dict[str, str]] = page.evaluate("""() => { + const out = []; + for (const el of document.querySelectorAll('.iusc')) { + try { + const m = JSON.parse(el.getAttribute('m') || '{}'); + if (m.murl) out.push({image: m.murl, thumbnail: m.turl || ''}); + } catch(e) {} + } + if (out.length === 0) { + for (const img of document.querySelectorAll('img.mimg, img[data-src]')) { + const src = img.getAttribute('data-src') || img.src || ''; + if (src && src.startsWith('http')) out.push({image: src, thumbnail: src}); + } + } + return out; + }""") + yield from raw[:self._max_results] diff --git a/preprocessor/services/characters/image_search/duckduckgo_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_image_search.py index a9fb18a83..49eae02bb 100644 --- a/preprocessor/services/characters/image_search/duckduckgo_image_search.py +++ b/preprocessor/services/characters/image_search/duckduckgo_image_search.py @@ -1,6 +1,8 @@ +import random +import time from typing import ( Dict, - List, + Iterator, ) from ddgs import DDGS 
@@ -9,15 +11,24 @@ class DuckDuckGoImageSearch(BaseImageSearch): + def __init__( + self, + max_results: int = 50, + pre_search_delay_min: float = 8.0, + pre_search_delay_max: float = 15.0, + ) -> None: + super().__init__(max_results) + self.__pre_search_delay_min = pre_search_delay_min + self.__pre_search_delay_max = pre_search_delay_max + @property def name(self) -> str: return 'DuckDuckGo' - def search(self, query: str) -> List[Dict[str, str]]: + def search(self, query: str) -> Iterator[Dict[str, str]]: + time.sleep(random.uniform(self.__pre_search_delay_min, self.__pre_search_delay_max)) with DDGS() as ddgs: - results = ddgs.images(query, max_results=self._max_results) - return [ - {'image': r.get('thumbnail') or r.get('image', ''), 'thumbnail': r.get('thumbnail', '')} - for r in results - if r.get('thumbnail') or r.get('image') - ] + for r in ddgs.images(query, region='pl-pl', max_results=self._max_results): + url = r.get('image') or r.get('thumbnail', '') + if url: + yield {'image': url, 'thumbnail': r.get('thumbnail', '')} diff --git a/preprocessor/services/characters/image_search/google_image_search.py b/preprocessor/services/characters/image_search/google_image_search.py index e8f63caa0..59fb15726 100644 --- a/preprocessor/services/characters/image_search/google_image_search.py +++ b/preprocessor/services/characters/image_search/google_image_search.py @@ -1,58 +1,43 @@ from typing import ( Any, Dict, - List, + Iterator, ) -import requests +from serpapi import GoogleSearch from preprocessor.services.characters.image_search.image_search import BaseImageSearch -_RAPIDAPI_HOST = 'google-search116.p.rapidapi.com' -_RAPIDAPI_URL = f'https://{_RAPIDAPI_HOST}/' - class GoogleImageSearch(BaseImageSearch): def __init__(self, api_key: str, max_results: int = 50) -> None: super().__init__(max_results) if not api_key: - raise ValueError('RapidAPI key is required for Google Image Search') + raise ValueError('SerpAPI key is required for Google Image Search') 
self.__api_key = api_key @property def name(self) -> str: - return 'Google Search API (RapidAPI)' + return 'Google Images (SerpAPI)' - def search(self, query: str) -> List[Dict[str, str]]: - raw_results = self.__call_api(query) - return self.__extract_image_data(raw_results) + def search(self, query: str) -> Iterator[Dict[str, str]]: + params = self.__build_search_params(query) + raw_results = GoogleSearch(params).get_dict() + yield from self.__iter_image_data(raw_results) - def __call_api(self, query: str) -> Dict[str, Any]: - headers = { - 'x-rapidapi-key': self.__api_key, - 'x-rapidapi-host': _RAPIDAPI_HOST, - } - params = { - 'query': query, - 'limit': str(self._max_results), + def __build_search_params(self, query: str) -> Dict[str, str]: + return { + 'engine': 'google_images', + 'q': query, 'hl': 'pl', 'gl': 'pl', + 'api_key': self.__api_key, } - response = requests.get(_RAPIDAPI_URL, headers=headers, params=params, timeout=15) - response.raise_for_status() - return response.json() - - def __extract_image_data(self, raw_results: Dict[str, Any]) -> List[Dict[str, str]]: - images: List[Dict[str, str]] = [] - - kp_url = raw_results.get('knowledge_panel', {}).get('image', {}).get('url', '') - if kp_url: - images.append({'image': kp_url, 'thumbnail': kp_url}) - - for r in raw_results.get('results', []): - if r.get('url'): - images.append({'image': r['url'], 'thumbnail': ''}) - return images[:self._max_results] + def __iter_image_data(self, raw_results: Dict[str, Any]) -> Iterator[Dict[str, str]]: + for img in raw_results.get('images_results', [])[:self._max_results]: + url = img.get('original') or img.get('thumbnail', '') + if url: + yield {'image': url, 'thumbnail': img.get('thumbnail', '')} diff --git a/preprocessor/services/characters/image_search/image_search.py b/preprocessor/services/characters/image_search/image_search.py index 00662c92e..3107cdde3 100644 --- a/preprocessor/services/characters/image_search/image_search.py +++ 
b/preprocessor/services/characters/image_search/image_search.py @@ -4,7 +4,7 @@ ) from typing import ( Dict, - List, + Iterator, ) @@ -18,5 +18,5 @@ def name(self) -> str: pass @abstractmethod - def search(self, query: str) -> List[Dict[str, str]]: + def search(self, query: str) -> Iterator[Dict[str, str]]: pass diff --git a/preprocessor/services/characters/image_search/serpapi_image_search.py b/preprocessor/services/characters/image_search/serpapi_image_search.py.delete similarity index 100% rename from preprocessor/services/characters/image_search/serpapi_image_search.py rename to preprocessor/services/characters/image_search/serpapi_image_search.py.delete diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index 4e001a84f..f1f04a2e2 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -1,5 +1,6 @@ from __future__ import annotations +import io import json import logging from pathlib import Path @@ -8,10 +9,14 @@ from typing import ( Any, Dict, + Iterator, List, Optional, + Tuple, ) +import warnings +from PIL import Image import cv2 from insightface.app import FaceAnalysis import numpy as np @@ -26,7 +31,7 @@ from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( BaseImageSearch, - DuckDuckGoImageSearch, + BrowserBingImageSearch, GoogleImageSearch, ) from preprocessor.services.core.base_processor import ( @@ -63,7 +68,7 @@ def __init__(self, args: Dict[str, Any]) -> None: 'search_query_template', 'Serial {series_name} {char_name} postać', ) - self.__search_engine: BaseImageSearch = self.__create_search_engine() + self.__search_engine: Optional[BaseImageSearch] = None self.__face_app: Optional[FaceAnalysis] = None self.__playwright: Optional[Playwright] = None self.__browser_context: Optional[BrowserContext] = None @@ -120,6 +125,7 
@@ def _load_resources(self) -> bool: args=['--no-sandbox', '--disable-dev-shm-usage', '--disable-gpu'], ignore_default_args=['--enable-automation'], ) + self.__search_engine = self.__create_search_engine() return True def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) -> None: @@ -130,6 +136,8 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) if saved_count >= self.__images_per_character: return + assert self.__search_engine is not None + search_query = self.__search_query_template.format( series_name=self.__series_name, char_name=char_name, ) @@ -145,10 +153,13 @@ def _process_item(self, item: ProcessingItem, missing_outputs: List[OutputSpec]) def __create_search_engine(self) -> BaseImageSearch: if self.__search_engine_name == 'premium': return GoogleImageSearch( - api_key=settings.image_scraper.google_search_key, + api_key=settings.image_scraper.serpapi_key, max_results=self.__max_results, ) - return DuckDuckGoImageSearch(max_results=self.__max_results) + return BrowserBingImageSearch( + browser_context=self.__browser_context, + max_results=self.__max_results, + ) def __prepare_output_folder(self, char_name: str) -> Path: output_folder = self.__output_dir / char_name.replace(' ', '_').lower() @@ -177,26 +188,110 @@ def __handle_retry_logic(self, error: Exception, attempt: int, char_name: str) - self.logger.warning(f'All retry attempts failed for {char_name}: {error}') def __download_and_process_images( - self, results: List[Dict[str, Any]], output_folder: Path, saved_count: int, + self, results: Iterator[Dict[str, Any]], output_folder: Path, saved_count: int, ) -> int: - sorted_results = self.__sort_results_by_extension(results) + needed = self.__images_per_character - saved_count + raw = self.__collect_raw_candidates(results, needed) + scored = self.__filter_by_consensus(raw) + if len(scored) < needed: + scored = self.__score_all(raw) + return self.__save_best_candidates(scored, output_folder, 
saved_count) + + def __collect_raw_candidates( + self, results: Iterator[Dict[str, Any]], needed: int, + ) -> List[Tuple[np.ndarray, List[Any]]]: + raw: List[Tuple[np.ndarray, List[Any]]] = [] + processed = 0 page = self.__browser_context.new_page() try: - for res in sorted_results: - if saved_count >= self.__images_per_character: + for res in results: + if processed >= self.__max_results: break - + processed += 1 img_url = res.get('image', '') try: img = self.__download_image_via_browser(img_url, page) - if img is not None and self.__validate_and_save_image(img, img_url, output_folder, saved_count): - saved_count += 1 + if img is None: + continue + h, w = img.shape[:2] + if w < self.__min_width or h < self.__min_height: + continue + faces = self.__face_app.get(img) + if faces: + raw.append((img, list(faces))) except Exception as e: self.logger.debug(f'Error processing image {img_url}: {e}') + + if len(raw) >= needed + 1 and len(self.__filter_by_consensus(raw)) >= needed: + break finally: page.close() + return raw + + def __filter_by_consensus( + self, candidates: List[Tuple[np.ndarray, List[Any]]], + ) -> List[Tuple[np.ndarray, float]]: + if not candidates: + return [] + + consensus = self.__find_consensus_embedding(candidates) + if consensus is None: + return [] + + threshold = settings.character.reference_matching_threshold + scored: List[Tuple[np.ndarray, float]] = [] + for img, faces in candidates: + best_det = max( + ( + f.det_score for f in faces + if float(np.dot(consensus, f.normed_embedding)) >= threshold + ), + default=None, + ) + if best_det is not None: + scored.append((img, float(best_det))) + return scored + + def __find_consensus_embedding( + self, candidates: List[Tuple[np.ndarray, List[Any]]], + ) -> Optional[np.ndarray]: + threshold = settings.character.reference_matching_threshold + _, first_faces = candidates[0] + others = [faces for _, faces in candidates[1:]] + + best_embedding: Optional[np.ndarray] = None + best_count = 0 + + for anchor in 
first_faces: + count = 1 + for other_faces in others: + sims = [float(np.dot(anchor.normed_embedding, f.normed_embedding)) for f in other_faces] + if sims and max(sims) >= threshold: + count += 1 + if count > best_count: + best_count = count + best_embedding = anchor.normed_embedding + + return best_embedding + + def __score_all( + self, candidates: List[Tuple[np.ndarray, List[Any]]], + ) -> List[Tuple[np.ndarray, float]]: + return [ + (img, float(max(f.det_score for f in faces))) + for img, faces in candidates + ] + + def __save_best_candidates( + self, candidates: List[Tuple[np.ndarray, float]], output_folder: Path, saved_count: int, + ) -> int: + needed = self.__images_per_character - saved_count + best = sorted(candidates, key=lambda x: x[1], reverse=True)[:needed] + for img, _ in best: + cv2.imwrite(str(output_folder / f'{saved_count:02d}.jpg'), img) + saved_count += 1 return saved_count def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np.ndarray]: @@ -210,14 +305,10 @@ def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np. if not response or response.status != 200: return None - content_type = response.headers.get('content-type', '') - if 'image' in content_type: - return self.__decode_image_bytes(response.body(), img_url) + if 'image' not in response.headers.get('content-type', ''): + return None - if 'text/html' in content_type: - return self.__extract_og_image(page, img_url) - - return None + return self.__decode_image_bytes(response.body(), img_url) except TimeoutError: self.logger.debug(f'Timeout downloading image {img_url}') @@ -229,36 +320,16 @@ def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np. self.logger.debug(f'Failed to download image {img_url}: {msg}') return None - def __extract_og_image(self, page: Page, source_url: str) -> Optional[np.ndarray]: - try: - og_image_url = page.evaluate( - '() => document.querySelector("meta[property=\'og:image\']")?.content ?? 
""', - ) - if not og_image_url: - return None - - response = page.goto( - og_image_url, - timeout=settings.image_scraper.page_navigation_timeout, - wait_until='domcontentloaded', - ) - if not response or response.status != 200: - return None - if 'image' not in response.headers.get('content-type', ''): - return None - return self.__decode_image_bytes(response.body(), og_image_url) - except Exception as e: - self.logger.debug(f'Failed to extract og:image from {source_url}: {e}') - return None - def __decode_image_bytes(self, img_bytes: bytes, img_url: str) -> Optional[np.ndarray]: if not img_bytes: return None - img_array = np.asarray(bytearray(img_bytes), dtype=np.uint8) - img = cv2.imdecode(img_array, cv2.IMREAD_COLOR) - - if img is None or img.size == 0: + try: + with warnings.catch_warnings(): + warnings.simplefilter('ignore') + pil_img = Image.open(io.BytesIO(img_bytes)).convert('RGB') + img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) + except Exception: self.logger.debug(f'Failed to decode image from {img_url}') return None @@ -268,26 +339,6 @@ def __decode_image_bytes(self, img_bytes: bytes, img_url: str) -> Optional[np.nd return img - def __validate_and_save_image( - self, img: np.ndarray, img_url: str, output_folder: Path, saved_count: int, - ) -> bool: - h, w = img.shape[:2] - if w < self.__min_width or h < self.__min_height: - return False - - try: - face_count = len(self.__face_app.get(img)) - except Exception as face_err: - self.logger.debug(f'Face detection failed for {img_url}: {face_err}') - return False - - if face_count != 1: - return False - - filename = f'{saved_count:02d}.jpg' - cv2.imwrite(str(output_folder / filename), img) - return True - def __mark_exhausted(self, output_folder: Path, char_name: str) -> None: exhausted_marker = output_folder / '.exhausted' exhausted_marker.touch() @@ -301,16 +352,6 @@ def __log_final_results(self, char_name: str, saved_count: int) -> None: else: self.logger.warning(f'{char_name}: No suitable images 
found') - @staticmethod - def __sort_results_by_extension(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]: - return sorted( - results, - key=lambda x: ( - 0 if x.get('image', '').lower().endswith(('.jpg', '.jpeg')) else 1, - 1 if x.get('image', '').lower().endswith('.png') else 2, - ), - ) - @staticmethod def __apply_random_delay() -> None: delay = random.uniform( From c4a8a46d76a9d64dcc0189e83b1baba034e94d36 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 16 Mar 2026 18:51:02 +0100 Subject: [PATCH 80/89] Use browser-based DuckDuckGo, refactor image scraping Introduce a browser-driven DuckDuckGo image search (BrowserDuckDuckGoImageSearch) and remove the old DDGS-based DuckDuckGo implementation. Improve Bing browser scraper: add page timeouts, hard alarm timeout, safer scrolling, and return lists instead of yielding generators. Add image_download_timeout setting to config and use it for page navigation when downloading images. Refactor CharacterReferenceDownloader: change candidate collection to return a consensus embedding early when confident, implement consensus clustering/selection, replace previous consensus & scoring helpers with consensus-aware scoring, and guard against insufficient consensus. Update exports to expose the new browser-based search classes and remove the deprecated implementation. 
--- preprocessor/config/config.py | 1 + preprocessor/services/characters/__init__.py | 6 +- .../characters/image_search/__init__.py | 4 +- .../image_search/bing_image_search.py | 53 ++++++++--- .../duckduckgo_browser_image_search.py | 58 ++++++++++++ .../image_search/duckduckgo_image_search.py | 34 ------- .../characters/reference_downloader.py | 92 +++++++++---------- 7 files changed, 150 insertions(+), 98 deletions(-) create mode 100644 preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py delete mode 100644 preprocessor/services/characters/image_search/duckduckgo_image_search.py diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 66e75b3df..cc023f4ee 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -217,6 +217,7 @@ class ImageHashSettings(OutputDirMixin): @dataclass(frozen=True) class ImageScraperSettings(BaseAPISettings): + image_download_timeout: int = 8000 max_results_to_scrape: int = 100 min_image_height: int = 60 min_image_width: int = 60 diff --git a/preprocessor/services/characters/__init__.py b/preprocessor/services/characters/__init__.py index 0aa7baa42..17fa35797 100644 --- a/preprocessor/services/characters/__init__.py +++ b/preprocessor/services/characters/__init__.py @@ -2,13 +2,15 @@ from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( BaseImageSearch, - DuckDuckGoImageSearch, + BrowserBingImageSearch, + BrowserDuckDuckGoImageSearch, GoogleImageSearch, ) __all__ = [ 'BaseImageSearch', - 'DuckDuckGoImageSearch', + 'BrowserBingImageSearch', + 'BrowserDuckDuckGoImageSearch', 'FaceClusterer', 'FaceDetector', 'GoogleImageSearch', diff --git a/preprocessor/services/characters/image_search/__init__.py b/preprocessor/services/characters/image_search/__init__.py index bcfc66e90..241b34030 100644 --- a/preprocessor/services/characters/image_search/__init__.py +++ 
b/preprocessor/services/characters/image_search/__init__.py @@ -1,6 +1,6 @@ from preprocessor.services.characters.image_search.bing_image_search import BrowserBingImageSearch -from preprocessor.services.characters.image_search.duckduckgo_image_search import DuckDuckGoImageSearch +from preprocessor.services.characters.image_search.duckduckgo_browser_image_search import BrowserDuckDuckGoImageSearch from preprocessor.services.characters.image_search.google_image_search import GoogleImageSearch from preprocessor.services.characters.image_search.image_search import BaseImageSearch -__all__ = ['BaseImageSearch', 'BrowserBingImageSearch', 'DuckDuckGoImageSearch', 'GoogleImageSearch'] +__all__ = ['BaseImageSearch', 'BrowserBingImageSearch', 'BrowserDuckDuckGoImageSearch', 'GoogleImageSearch'] diff --git a/preprocessor/services/characters/image_search/bing_image_search.py b/preprocessor/services/characters/image_search/bing_image_search.py index 54d922386..fdfd5b1d9 100644 --- a/preprocessor/services/characters/image_search/bing_image_search.py +++ b/preprocessor/services/characters/image_search/bing_image_search.py @@ -1,3 +1,5 @@ +import signal +import time from typing import ( Any, Dict, @@ -11,9 +13,15 @@ from preprocessor.services.characters.image_search.image_search import BaseImageSearch _SEARCH_URL = 'https://www.bing.com/images/search' -_RESULT_WAIT_MS = 3000 +_PAGE_TIMEOUT_MS = 12000 +_LOAD_WAIT_S = 2.0 _SCROLL_STEPS = 3 -_SCROLL_PAUSE_MS = 1500 +_SCROLL_PAUSE_S = 1.0 +_HARD_TIMEOUT_S = 25 + + +class _SearchTimeout(Exception): + pass class BrowserBingImageSearch(BaseImageSearch): @@ -26,23 +34,44 @@ def name(self) -> str: return 'Bing Images (Browser)' def search(self, query: str) -> Iterator[Dict[str, str]]: + yield from self.__fetch_with_timeout(query) + + def __fetch_with_timeout(self, query: str) -> List[Dict[str, str]]: page = self.__browser_context.new_page() + page.set_default_timeout(_PAGE_TIMEOUT_MS) + + old_handler = signal.signal(signal.SIGALRM, 
self.__raise_timeout) + signal.alarm(_HARD_TIMEOUT_S) try: url = f'{_SEARCH_URL}?q={quote(query)}&count={self._max_results}&form=HDRSC2' - page.goto(url, wait_until='domcontentloaded', timeout=30000) - page.wait_for_timeout(_RESULT_WAIT_MS) - self.__scroll_to_load_more(page) - yield from self.__extract_results(page) + page.goto(url, wait_until='commit', timeout=_PAGE_TIMEOUT_MS) + time.sleep(_LOAD_WAIT_S) + self.__scroll_for_more(page) + return self.__extract_results(page) + except Exception: + return [] finally: - page.close() + signal.alarm(0) + signal.signal(signal.SIGALRM, old_handler) + try: + page.close() + except Exception: + pass + + @staticmethod + def __raise_timeout(signum: int, frame: Any) -> None: + raise _SearchTimeout() @staticmethod - def __scroll_to_load_more(page: Any) -> None: + def __scroll_for_more(page: Any) -> None: for _ in range(_SCROLL_STEPS): - page.evaluate('window.scrollBy(0, window.innerHeight)') - page.wait_for_timeout(_SCROLL_PAUSE_MS) + try: + page.evaluate('window.scrollBy(0, window.innerHeight * 3)') + time.sleep(_SCROLL_PAUSE_S) + except Exception: + break - def __extract_results(self, page: Any) -> Iterator[Dict[str, str]]: + def __extract_results(self, page: Any) -> List[Dict[str, str]]: raw: List[Dict[str, str]] = page.evaluate("""() => { const out = []; for (const el of document.querySelectorAll('.iusc')) { @@ -59,4 +88,4 @@ def __extract_results(self, page: Any) -> Iterator[Dict[str, str]]: } return out; }""") - yield from raw[:self._max_results] + return raw[:self._max_results] diff --git a/preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py new file mode 100644 index 000000000..21f10ded7 --- /dev/null +++ b/preprocessor/services/characters/image_search/duckduckgo_browser_image_search.py @@ -0,0 +1,58 @@ +from typing import ( + Any, + Dict, + Iterator, + List, +) +from urllib.parse import quote + +from 
patchright.sync_api import BrowserContext + +from preprocessor.services.characters.image_search.image_search import BaseImageSearch + +_SEARCH_URL = 'https://duckduckgo.com/' +_NETWORK_IDLE_TIMEOUT = 15000 +_SCROLL_STEPS = 3 +_SCROLL_PAUSE_MS = 1500 + + +class BrowserDuckDuckGoImageSearch(BaseImageSearch): + def __init__(self, browser_context: BrowserContext, max_results: int = 100) -> None: + super().__init__(max_results) + self.__browser_context = browser_context + + @property + def name(self) -> str: + return 'DuckDuckGo Images (Browser)' + + def search(self, query: str) -> Iterator[Dict[str, str]]: + page = self.__browser_context.new_page() + collected: List[Dict[str, str]] = [] + + def _on_response(response: Any) -> None: + if 'duckduckgo.com/i.js' not in response.url: + return + try: + body = response.json() + for item in body.get('results', []): + url = item.get('image') or item.get('thumbnail', '') + if url: + collected.append({'image': url, 'thumbnail': item.get('thumbnail', '')}) + except Exception: + pass + + page.on('response', _on_response) + try: + url = f'{_SEARCH_URL}?q={quote(query)}&iax=images&ia=images' + page.goto(url, wait_until='networkidle', timeout=_NETWORK_IDLE_TIMEOUT) + self.__scroll_for_more(page, collected) + yield from collected[:self._max_results] + finally: + page.close() + + def __scroll_for_more(self, page: Any, collected: List[Dict[str, str]]) -> None: + for _ in range(_SCROLL_STEPS): + if len(collected) >= self._max_results: + break + page.evaluate('window.scrollBy(0, window.innerHeight * 3)') + page.wait_for_timeout(_SCROLL_PAUSE_MS) diff --git a/preprocessor/services/characters/image_search/duckduckgo_image_search.py b/preprocessor/services/characters/image_search/duckduckgo_image_search.py deleted file mode 100644 index 49eae02bb..000000000 --- a/preprocessor/services/characters/image_search/duckduckgo_image_search.py +++ /dev/null @@ -1,34 +0,0 @@ -import random -import time -from typing import ( - Dict, - Iterator, -) - 
-from ddgs import DDGS - -from preprocessor.services.characters.image_search.image_search import BaseImageSearch - - -class DuckDuckGoImageSearch(BaseImageSearch): - def __init__( - self, - max_results: int = 50, - pre_search_delay_min: float = 8.0, - pre_search_delay_max: float = 15.0, - ) -> None: - super().__init__(max_results) - self.__pre_search_delay_min = pre_search_delay_min - self.__pre_search_delay_max = pre_search_delay_max - - @property - def name(self) -> str: - return 'DuckDuckGo' - - def search(self, query: str) -> Iterator[Dict[str, str]]: - time.sleep(random.uniform(self.__pre_search_delay_min, self.__pre_search_delay_max)) - with DDGS() as ddgs: - for r in ddgs.images(query, region='pl-pl', max_results=self._max_results): - url = r.get('image') or r.get('thumbnail', '') - if url: - yield {'image': url, 'thumbnail': r.get('thumbnail', '')} diff --git a/preprocessor/services/characters/reference_downloader.py b/preprocessor/services/characters/reference_downloader.py index f1f04a2e2..a5d42e4eb 100644 --- a/preprocessor/services/characters/reference_downloader.py +++ b/preprocessor/services/characters/reference_downloader.py @@ -191,15 +191,15 @@ def __download_and_process_images( self, results: Iterator[Dict[str, Any]], output_folder: Path, saved_count: int, ) -> int: needed = self.__images_per_character - saved_count - raw = self.__collect_raw_candidates(results, needed) - scored = self.__filter_by_consensus(raw) - if len(scored) < needed: - scored = self.__score_all(raw) + raw, consensus = self.__collect_raw_candidates(results, needed) + if consensus is None: + return saved_count + scored = self.__score_by_consensus(raw, consensus) return self.__save_best_candidates(scored, output_folder, saved_count) def __collect_raw_candidates( self, results: Iterator[Dict[str, Any]], needed: int, - ) -> List[Tuple[np.ndarray, List[Any]]]: + ) -> Tuple[List[Tuple[np.ndarray, List[Any]]], Optional[np.ndarray]]: raw: List[Tuple[np.ndarray, List[Any]]] = [] 
processed = 0 @@ -223,23 +223,49 @@ def __collect_raw_candidates( except Exception as e: self.logger.debug(f'Error processing image {img_url}: {e}') - if len(raw) >= needed + 1 and len(self.__filter_by_consensus(raw)) >= needed: - break + consensus = self.__find_confident_consensus(raw, needed) + if consensus is not None and len(self.__score_by_consensus(raw, consensus)) >= needed: + return raw, consensus finally: page.close() - return raw + return raw, self.__find_confident_consensus(raw, needed) - def __filter_by_consensus( - self, candidates: List[Tuple[np.ndarray, List[Any]]], - ) -> List[Tuple[np.ndarray, float]]: - if not candidates: - return [] + def __find_confident_consensus( + self, candidates: List[Tuple[np.ndarray, List[Any]]], needed: int, + ) -> Optional[np.ndarray]: + if len(candidates) < needed: + return None - consensus = self.__find_consensus_embedding(candidates) - if consensus is None: - return [] + threshold = settings.character.reference_matching_threshold + clusters: List[Tuple[np.ndarray, List[int]]] = [] + + for img_idx, (_, faces) in enumerate(candidates): + for face in faces: + matched = False + for cluster_emb, img_indices in clusters: + if float(np.dot(cluster_emb, face.normed_embedding)) >= threshold: + if img_idx not in img_indices: + img_indices.append(img_idx) + matched = True + break + if not matched: + clusters.append((face.normed_embedding, [img_idx])) + + if not clusters: + return None + clusters.sort(key=lambda x: len(x[1]), reverse=True) + best_count = len(clusters[0][1]) + second_count = len(clusters[1][1]) if len(clusters) > 1 else 0 + + if best_count > second_count and best_count >= needed: + return clusters[0][0] + return None + + def __score_by_consensus( + self, candidates: List[Tuple[np.ndarray, List[Any]]], consensus: np.ndarray, + ) -> List[Tuple[np.ndarray, float]]: threshold = settings.character.reference_matching_threshold scored: List[Tuple[np.ndarray, float]] = [] for img, faces in candidates: @@ -254,36 +280,6 
@@ def __filter_by_consensus( scored.append((img, float(best_det))) return scored - def __find_consensus_embedding( - self, candidates: List[Tuple[np.ndarray, List[Any]]], - ) -> Optional[np.ndarray]: - threshold = settings.character.reference_matching_threshold - _, first_faces = candidates[0] - others = [faces for _, faces in candidates[1:]] - - best_embedding: Optional[np.ndarray] = None - best_count = 0 - - for anchor in first_faces: - count = 1 - for other_faces in others: - sims = [float(np.dot(anchor.normed_embedding, f.normed_embedding)) for f in other_faces] - if sims and max(sims) >= threshold: - count += 1 - if count > best_count: - best_count = count - best_embedding = anchor.normed_embedding - - return best_embedding - - def __score_all( - self, candidates: List[Tuple[np.ndarray, List[Any]]], - ) -> List[Tuple[np.ndarray, float]]: - return [ - (img, float(max(f.det_score for f in faces))) - for img, faces in candidates - ] - def __save_best_candidates( self, candidates: List[Tuple[np.ndarray, float]], output_folder: Path, saved_count: int, ) -> int: @@ -298,8 +294,8 @@ def __download_image_via_browser(self, img_url: str, page: Page) -> Optional[np. try: response = page.goto( img_url, - timeout=settings.image_scraper.page_navigation_timeout, - wait_until='domcontentloaded', + timeout=settings.image_scraper.image_download_timeout, + wait_until='commit', ) if not response or response.status != 200: From 2dc3ffa76e0083034a36728fa6cafd33f2c481ab Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Tue, 17 Mar 2026 10:53:53 +0100 Subject: [PATCH 81/89] Add chunked Whisper transcription for long audio Enable chunked transcription for long audio files by adding a max_chunk_duration_seconds config (default 1800s) and wiring it through the step config and pipeline. 
WhisperEngine now checks audio duration via ffprobe and, if needed, splits input with ffmpeg into chunks, transcribes each chunk, adjusts segment ids/timestamps (and word timestamps), and concatenates text and segments. Also minor defaults: set max_parallel_episodes=1 in defaults and pipeline. Note: this adds ffmpeg/ffprobe usage and temporary file handling. --- preprocessor/app/pipeline_factory.py | 1 + preprocessor/config/step_configs.py | 1 + preprocessor/config/step_defaults.py | 1 + .../transcription/engines/whisper_engine.py | 99 ++++++++++++++++++- preprocessor/steps/text/transcription_step.py | 1 + 5 files changed, 98 insertions(+), 5 deletions(-) diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index cc277eeb4..c5f366cfb 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -266,6 +266,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t model=series_config.processing.transcription.model, language=series_config.processing.transcription.language, device=series_config.processing.transcription.device, + max_parallel_episodes=1, ), ) diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index b0ec137ae..68f030b72 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -83,6 +83,7 @@ class TranscriptionConfig(BaseModel): beam_size: int = Field(default=10, ge=1) device: str = 'cuda' language: str = 'pl' + max_chunk_duration_seconds: int = Field(default=1800, ge=60) max_parallel_episodes: int = Field(default=2, ge=1, le=4) mode: str = 'whisper' model: str = 'large-v3-turbo' diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index 3cb41b2cf..32b537965 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -39,6 +39,7 @@ def get_configs(series_name: str) -> Dict[str, object]: device='cuda', beam_size=5, 
temperature=0.0, + max_parallel_episodes=1, ), 'separate_sounds': SoundSeparationConfig(), 'analyze_text': TextAnalysisConfig(language='pl'), diff --git a/preprocessor/services/transcription/engines/whisper_engine.py b/preprocessor/services/transcription/engines/whisper_engine.py index 385c813c9..876e727cd 100644 --- a/preprocessor/services/transcription/engines/whisper_engine.py +++ b/preprocessor/services/transcription/engines/whisper_engine.py @@ -1,8 +1,12 @@ import gc +import json from pathlib import Path +import subprocess +import tempfile from typing import ( Any, Dict, + List, Optional, ) @@ -22,12 +26,14 @@ def __init__( device: str = 'cuda', beam_size: int = 10, temperature: float = 0.0, + max_chunk_duration_seconds: int = 1800, ) -> None: self.__model_name = model_name self.__language = language self.__device = device self.__beam_size = beam_size self.__temperature = temperature + self.__max_chunk_duration_seconds = max_chunk_duration_seconds if device != 'cuda': raise ValueError(f'Whisper acceleration requires CUDA, got: {device}') @@ -56,8 +62,72 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: if not self.__model: raise RuntimeError('Whisper model not loaded.') - language_code = WhisperUtils.get_language_code(self.__language) + duration = self.__get_duration(audio_path) + if duration > self.__max_chunk_duration_seconds: + n_chunks = int(duration // self.__max_chunk_duration_seconds) + 1 + console.print( + f'[yellow]Long audio ({duration/3600:.1f}h), splitting into {n_chunks} chunks ' + f'of {self.__max_chunk_duration_seconds//60}min each[/yellow]', + ) + result = self.__transcribe_chunked(audio_path, duration) + else: + result = self.__transcribe_single(audio_path) + + console.print(f'[green]Transcription completed: {audio_path.name}[/green]') + return result + + def __transcribe_chunked(self, audio_path: Path, total_duration: float) -> Dict[str, Any]: + chunk_starts = list(range(0, int(total_duration), self.__max_chunk_duration_seconds)) 
+ all_segments: List[Dict[str, Any]] = [] + text_parts: List[str] = [] + language: Optional[str] = None + + id_offset = 0 + with tempfile.TemporaryDirectory() as tmpdir: + for i, start in enumerate(chunk_starts): + end = min(start + self.__max_chunk_duration_seconds, total_duration) + chunk_path = Path(tmpdir) / f'chunk_{i:04d}.wav' + + console.print( + f'[cyan]Chunk {i+1}/{len(chunk_starts)}: ' + f'{start/3600:.2f}h - {end/3600:.2f}h[/cyan]', + ) + self.__extract_audio_chunk(audio_path, chunk_path, start, end) + + chunk_result = self.__transcribe_single(chunk_path) + + if language is None: + language = chunk_result.get('language') + + offset = float(start) + chunk_segments = chunk_result.get('segments', []) + for seg in chunk_segments: + adjusted_seg = { + **seg, + 'id': seg['id'] + id_offset, + 'start': seg['start'] + offset, + 'end': seg['end'] + offset, + } + if adjusted_seg.get('words'): + adjusted_seg['words'] = [ + {**w, 'start': w['start'] + offset, 'end': w['end'] + offset} + for w in adjusted_seg['words'] + ] + all_segments.append(adjusted_seg) + + id_offset += len(chunk_segments) + text_parts.append(chunk_result.get('text', '')) + + result: Dict[str, Any] = {'text': ''.join(text_parts), 'segments': all_segments} + if language: + result['language'] = language + return result + + def __transcribe_single(self, audio_path: Path) -> Dict[str, Any]: + if not self.__model: + raise RuntimeError('Whisper model not loaded.') + language_code = WhisperUtils.get_language_code(self.__language) segments, info = self.__model.transcribe( str(audio_path), language=language_code, @@ -65,11 +135,9 @@ def transcribe(self, audio_path: Path) -> Dict[str, Any]: word_timestamps=True, condition_on_previous_text=False, temperature=self.__temperature, + vad_filter=True, ) - - result = WhisperUtils.build_transcription_result(segments, language=info.language) - console.print(f'[green]Transcription completed: {audio_path.name}[/green]') - return result + return 
WhisperUtils.build_transcription_result(segments, language=info.language) def __load_model(self) -> WhisperModel: compute_type = 'float16' @@ -78,3 +146,24 @@ def __load_model(self) -> WhisperModel: model = WhisperModel(self.__model_name, device=self.__device, compute_type=compute_type) console.print('[green]Whisper model loaded[/green]') return model + + @staticmethod + def __get_duration(path: Path) -> float: + result = subprocess.run( + ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', str(path)], + capture_output=True, text=True, check=True, + ) + return float(json.loads(result.stdout)['format']['duration']) + + @staticmethod + def __extract_audio_chunk(video_path: Path, output_path: Path, start: float, end: float) -> None: + subprocess.run( + [ + 'ffmpeg', '-y', + '-ss', str(start), '-to', str(end), + '-i', str(video_path), + '-vn', '-acodec', 'pcm_f32le', '-ar', '16000', '-ac', '1', + str(output_path), + ], + capture_output=True, check=True, + ) diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 2daa06ecf..29dbfe3af 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -93,6 +93,7 @@ def __create_engine(self, context: ExecutionContext) -> TranscriptionEngine: device=self.config.device, beam_size=self.config.beam_size, temperature=self.config.temperature, + max_chunk_duration_seconds=self.config.max_chunk_duration_seconds, ) def __transcribe_and_save( From 07617a63c4a9436b7b0ca2220d6622c9b9daf0ff Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Wed, 18 Mar 2026 11:32:17 +0100 Subject: [PATCH 82/89] Remove output_data dirs from preprocessor Dockerfile Stop creating /app/output_data/characters, /app/output_data/scraped_pages, and /app/output_data/processing_metadata in the Dockerfile mkdir step. 
These runtime/output directories are likely managed or mounted elsewhere, so only model directories (including /models/emotion_model) are created during image build. --- preprocessor/Dockerfile | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/preprocessor/Dockerfile b/preprocessor/Dockerfile index 358109848..fcc5e1271 100644 --- a/preprocessor/Dockerfile +++ b/preprocessor/Dockerfile @@ -64,10 +64,7 @@ RUN mkdir -p \ /models/whisper \ /models/insightface \ /models/ultralytics \ - /models/emotion_model \ - /app/output_data/characters \ - /app/output_data/scraped_pages \ - /app/output_data/processing_metadata + /models/emotion_model COPY bot /app/bot COPY preprocessor /app/preprocessor From 9a519ee02ff5583125eb896d829d71c8561cc110 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 27 Mar 2026 12:03:33 +0100 Subject: [PATCH 83/89] Add script to split double-episode videos Introduce preprocessor/scripts/split_double_episodes.py: a CLI tool to detect and split double-episode video files and renumber episodes per season. It probes video duration with ffprobe, uses a TransNetV2 wrapper for scene detection, and refines split points by scanning for black frames with ffmpeg's blackdetect. Files are renamed using an SxxExx pattern (appends _SPECIAL for single-episode specials) and output to season subdirectories; splitting is performed with ffmpeg (hevc_nvenc settings) while specials are copied. The script supports dry-run mode and configurable options for scene threshold, minimum scene length, and black-frame scan window. 
--- preprocessor/scripts/split_double_episodes.py | 260 ++++++++++++++++++ preprocessor/series_configs/README.md | 74 +++++ .../pingwiny_z_madagaskaru.json | 37 +++ 3 files changed, 371 insertions(+) create mode 100644 preprocessor/scripts/split_double_episodes.py create mode 100644 preprocessor/series_configs/README.md create mode 100644 preprocessor/series_configs/pingwiny_z_madagaskaru.json diff --git a/preprocessor/scripts/split_double_episodes.py b/preprocessor/scripts/split_double_episodes.py new file mode 100644 index 000000000..967e1ef83 --- /dev/null +++ b/preprocessor/scripts/split_double_episodes.py @@ -0,0 +1,260 @@ +import argparse +import json +import math +from pathlib import Path +import re +import shutil +import subprocess +import sys +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent)) + +from preprocessor.services.media.scene_detection import TransNetWrapper # noqa: E402 # pylint: disable=wrong-import-position + +_VIDEO_EXTENSIONS: Tuple[str, ...] 
= ('.mkv', '.mp4', '.avi') +_EP_PATTERN = re.compile(r'(S\d{2})E(\d{2})') +_BLACK_PATTERN = re.compile(r'black_start:([\d.]+)\s+black_end:([\d.]+)') + + +def _probe_duration(video_path: Path) -> float: + result = subprocess.run( + ['ffprobe', '-v', 'quiet', '-print_format', 'json', '-show_format', str(video_path)], + stdout=subprocess.PIPE, + stderr=subprocess.DEVNULL, + check=True, + text=True, + ) + return float(json.loads(result.stdout)['format']['duration']) + + +def _detect_scenes(video_path: Path, threshold: float, min_scene_len: int) -> List[Dict[str, Any]]: + wrapper = TransNetWrapper() + wrapper.load_model() + try: + return wrapper.detect_scenes(video_path, threshold=threshold, min_scene_len=min_scene_len) + finally: + wrapper.cleanup() + + +def _scene_cut_timestamps(scenes: List[Dict[str, Any]]) -> List[float]: + return [ + float(s['start']['seconds']) if isinstance(s.get('start'), dict) else float(s.get('start', 0)) + for s in scenes[1:] + ] + + +def _nearest_cut(cuts: List[float], target: float) -> float: + return min(cuts, key=lambda t: abs(t - target)) + + +def _detect_black_frames( + video_path: Path, + cut: float, + half_window: float, + black_duration: float = 0.02, + pix_threshold: float = 0.10, +) -> List[Tuple[float, float]]: + scan_start = max(0.0, cut - half_window) + result = subprocess.run( + [ + 'ffmpeg', + '-ss', str(scan_start), + '-t', str(half_window * 2), + '-i', str(video_path), + '-vf', f'blackdetect=d={black_duration}:pix_th={pix_threshold}', + '-an', '-f', 'null', '-', + ], + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + text=True, + check=False, + ) + return [ + (float(m.group(1)) + scan_start, float(m.group(2)) + scan_start) + for m in _BLACK_PATTERN.finditer(result.stderr) + ] + + +def _adjust_for_black_frames( + cut: float, + black_intervals: List[Tuple[float, float]], + max_distance: float = 5.0, +) -> float: + best_interval: Optional[Tuple[float, float]] = None + best_dist = math.inf + + for black_start, black_end in 
black_intervals: + if black_start <= cut <= black_end: + dist = 0.0 + elif black_end < cut: + dist = cut - black_end + else: + dist = black_start - cut + + if dist <= max_distance and dist < best_dist: + best_dist = dist + best_interval = (black_start, black_end) + + return best_interval[1] if best_interval is not None else cut + + +def _classify_file(video_path: Path, half_window: float) -> Tuple[bool, float]: + midpoint = _probe_duration(video_path) / 2.0 + black_intervals = _detect_black_frames(video_path, midpoint, half_window) + adjusted = _adjust_for_black_frames(midpoint, black_intervals) + return adjusted != midpoint or bool(black_intervals), adjusted + + +def _rename_episode(filename: str, new_ep: int, special: bool = False) -> str: + match = _EP_PATTERN.search(filename) + if not match: + raise ValueError(f'No SxxExx pattern in filename: {filename}') + season = match.group(1) + suffix = '_SPECIAL' if special else '' + replacement = f'{season}E{new_ep:02d}{suffix}' + return filename[:match.start()] + replacement + filename[match.end():] + + +def _ffmpeg_split(video_path: Path, cut_time: float, ep1_path: Path, ep2_path: Path) -> None: + codec = ['-c:v', 'hevc_nvenc', '-preset', 'p4', '-cq', '18', '-c:a', 'copy'] + subprocess.run( + ['ffmpeg', '-y', '-i', str(video_path), '-t', str(cut_time)] + codec + [str(ep1_path)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + subprocess.run( + ['ffmpeg', '-y', '-ss', str(cut_time), '-i', str(video_path)] + codec + [str(ep2_path)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + + +def _split_double( + video: Path, + approx_cut: float, + half_window: float, + threshold: float, + min_scene_len: int, + output_dir: Path, + ep_counter: int, +) -> int: + scenes = _detect_scenes(video, threshold, min_scene_len) + cuts = _scene_cut_timestamps(scenes) + raw_cut = _nearest_cut(cuts, approx_cut) if cuts else approx_cut + black_intervals = _detect_black_frames(video, 
raw_cut, half_window) + final_cut = _adjust_for_black_frames(raw_cut, black_intervals) + + ep1_name = _rename_episode(video.name, ep_counter) + ep2_name = _rename_episode(video.name, ep_counter + 1) + + direction = '' + if final_cut != raw_cut: + arrow = 'forward' if final_cut > raw_cut else 'backward' + direction = f' ({arrow} {raw_cut:.3f}s -> {final_cut:.3f}s)' + + print(f' [SPLIT] {video.name} cut={final_cut:.3f}s{direction}') + print(f' E{ep_counter:02d} -> {ep1_name}') + print(f' E{ep_counter + 1:02d} -> {ep2_name}') + + _ffmpeg_split(video, final_cut, output_dir / ep1_name, output_dir / ep2_name) + return ep_counter + 2 + + +def _process_season( + season_dir: Path, + output_dir: Path, + half_window: float, + threshold: float, + min_scene_len: int, + dry_run: bool, +) -> None: + videos = sorted(p for p in season_dir.iterdir() if p.suffix.lower() in _VIDEO_EXTENSIONS) + if not videos: + print(f'[{season_dir.name}] no videos found') + return + + print(f'\n[{season_dir.name}] classifying {len(videos)} file(s)...') + classifications: List[Tuple[Path, bool, float]] = [] + for video in videos: + is_double, cut = _classify_file(video, half_window) + label = 'DOUBLE' if is_double else 'SPECIAL' + cut_info = f' cut={cut:.3f}s' if is_double else '' + print(f' [{label}] {video.name}{cut_info}') + classifications.append((video, is_double, cut)) + + if dry_run: + specials = [v.name for v, is_double, _ in classifications if not is_double] + if specials: + print(f' --- SPECIALS: {specials}') + return + + output_dir.mkdir(parents=True, exist_ok=True) + ep_counter = 1 + + for video, is_double, approx_cut in classifications: + if is_double: + ep_counter = _split_double(video, approx_cut, half_window, threshold, min_scene_len, output_dir, ep_counter) + else: + special_name = _rename_episode(video.name, ep_counter, special=True) + print(f' [COPY ] {video.name}') + print(f' E{ep_counter:02d} -> {special_name}') + shutil.copy2(str(video), str(output_dir / special_name)) + 
ep_counter += 1 + + +def main() -> None: + parser = argparse.ArgumentParser( + description='Split double-episode files and renumber sequentially per season.', + ) + parser.add_argument( + 'season_dirs', nargs='+', type=Path, + help='Season directory/directories to process', + ) + parser.add_argument( + '--output-dir', '-o', type=Path, required=True, + help='Root output directory (S01/S02/... subdirs created automatically)', + ) + parser.add_argument( + '--threshold', type=float, default=0.5, + help='TransNetV2 scene detection threshold (default: 0.5)', + ) + parser.add_argument( + '--min-scene-len', type=int, default=10, + help='Minimum scene length in frames (default: 10)', + ) + parser.add_argument( + '--black-window', type=float, default=15.0, + help='Half-window in seconds for symmetric black frame scan (default: 15)', + ) + parser.add_argument( + '--dry-run', action='store_true', + help='Classify only — no TransNetV2, no splitting, no copying', + ) + + args = parser.parse_args() + + for season_dir in args.season_dirs: + if not season_dir.is_dir(): + print(f'Not a directory, skipping: {season_dir}', file=sys.stderr) + continue + _process_season( + season_dir, + args.output_dir / season_dir.name, + args.black_window, + args.threshold, + args.min_scene_len, + args.dry_run, + ) + + +if __name__ == '__main__': + main() diff --git a/preprocessor/series_configs/README.md b/preprocessor/series_configs/README.md new file mode 100644 index 000000000..ce277fb8c --- /dev/null +++ b/preprocessor/series_configs/README.md @@ -0,0 +1,74 @@ +# Series Configs + +Każda seria to `{series_name}.json` zawierający **tylko różnice** względem `defaults.json`. 
+ +## Wymagane pola + +```json +{ + "series_name": "nazwa_serii", + "display_name": "Nazwa Wyświetlana", + "indexing": { "elasticsearch": { "index_name": "nazwa_serii_clips" } }, + "scraping": { + "episodes": { "urls": ["https://..."] }, + "characters": { "urls": ["https://..."] } + } +} +``` + +> `series_name` musi zgadzać się z nazwą pliku i katalogu `input_data/`. +> `urls` są **zawsze wymagane** przez parser — nawet jeśli scraper jest w `skip_steps`. Jeśli dane masz ręcznie, wpisz URL źródłowy skąd pochodzą (dla dokumentacji). + +## Pipeline mode + +| Wartość | Opis | +|---|---| +| `"full"` (domyślny) | Uruchamia wszystkie kroki | +| `"selective"` | Pomija kroki z listy `skip_steps` | + +## skip_steps + +| ID | Co pomija | +|---|---| +| `episode_scraper` | Scrapowanie listy odcinków | +| `character_scraper` | Scrapowanie listy postaci | +| `character_reference` | Pobieranie zdjęć referencyjnych postaci | +| `transcription` | Transkrypcja audio | +| `index_to_elasticsearch` | Wysyłanie do Elasticsearch | +| `generate_archives` | Generowanie archiwów ZIP | + +Jednorazowe pominięcie bez zmiany configa: `run-all --series X --skip index_to_elasticsearch`. + +## Transkrypcja + +```json +"processing": { "transcription": { "mode": "elevenlabs" } } +``` + +| `mode` | Opis | +|---|---| +| `"whisper"` (domyślny) | Lokalny model Whisper (CUDA) | +| `"elevenlabs"` / `"11labs"` | API ElevenLabs (`ELEVENLABS_API_KEY`) | + +Import gotowych transkrypcji (format 11labs): +```json +"processing": { + "transcription_import": { "format_type": "11labs_segmented", "source_dir": "/transcriptions/nazwa_serii" } +} +``` + +## Zdjęcia referencyjne postaci + +```json +"scraping": { "character_references": { "images_per_character": 2, "search_engine": "google" } } +``` + +`images_per_character: 0` pomija pobieranie. Domyślna wyszukiwarka: `"duckduckgo"` (bez API). `"google"` wymaga SerpAPI. 
+ +## Elasticsearch + +```json +"indexing": { "elasticsearch": { "index_name": "nazwa_serii_clips", "host": "localhost:9200", "append": false, "dry_run": false } } +``` + +`dry_run: true` — generuje dokumenty ale nie wysyła. `append: true` — dopisuje do istniejącego indeksu. diff --git a/preprocessor/series_configs/pingwiny_z_madagaskaru.json b/preprocessor/series_configs/pingwiny_z_madagaskaru.json new file mode 100644 index 000000000..b8fc8a534 --- /dev/null +++ b/preprocessor/series_configs/pingwiny_z_madagaskaru.json @@ -0,0 +1,37 @@ +{ + "_comment": "Konfiguracja dla Pingwin\u00f3w z Madagaskaru - tylko zmiany wzgl\u0119dem defaults.json", + "_note": "Metadane odcink\u00f3w i bohater\u00f3w dostarczone r\u0119cznie. Transkrypcje przez ElevenLabs. Indeksowanie do ES r\u0119czne.", + "display_name": "Pingwiny z Madagaskaru", + "indexing": { + "elasticsearch": { + "index_name": "pingwiny_z_madagaskaru_clips" + } + }, + "pipeline_mode": "selective", + "processing": { + "transcription": { + "mode": "elevenlabs" + } + }, + "scraping": { + "character_references": { + "images_per_character": 2 + }, + "characters": { + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_postaci_serialu_Pingwiny_z_Madagaskaru" + ] + }, + "episodes": { + "urls": [ + "https://pl.wikipedia.org/wiki/Lista_odcink%C3%B3w_serialu_Pingwiny_z_Madagaskaru" + ] + } + }, + "series_name": "pingwiny_z_madagaskaru", + "skip_steps": [ + "episode_scraper", + "character_scraper", + "index_to_elasticsearch" + ] +} From d770797da5987c2c1a53a0f5fe1942c59258801b Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Fri, 27 Mar 2026 15:47:34 +0100 Subject: [PATCH 84/89] Add scribe comparison script and ElevenLabs tweaks Add a new compare_scribe_models script to submit and compare ElevenLabs scribe_v1/scribe_v2 transcriptions (with optional Whisper via Docker), poll for results, and save JSON/SRT outputs. Change default ElevenLabs model in config to scribe_v2. 
Increase diarization speaker cap (num_speakers=32) in ElevenLabs engine requests. Broaden sound-event detection regex to match parentheses or square brackets. Accept both '11labs' and 'elevenlabs' modes when creating the ElevenLabs engine. Add .gitignore entry for preprocessor scripts output. --- .gitignore | 1 + preprocessor/config/config.py | 2 +- preprocessor/scripts/compare_scribe_models.py | 278 ++++++++++++++++++ preprocessor/scripts/deploy_to_nas.py | 1 + preprocessor/scripts/split_double_episodes.py | 1 + .../engines/elevenlabs_engine.py | 1 + .../transcription/sound_classification.py | 2 +- preprocessor/steps/text/transcription_step.py | 2 +- 8 files changed, 285 insertions(+), 3 deletions(-) create mode 100644 preprocessor/scripts/compare_scribe_models.py diff --git a/.gitignore b/.gitignore index 6123a9ff2..8984478cd 100644 --- a/.gitignore +++ b/.gitignore @@ -42,3 +42,4 @@ cookies.txt test_episodes.json /models /tmp +/preprocessor/scripts/scribe_compare diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index cc023f4ee..96e48ae56 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -130,7 +130,7 @@ class ElevenLabsSettings(BaseAPISettings): diarize: bool = True language_code: str = 'pol' max_attempts: int = 60 - model_id: str = 'scribe_v1' + model_id: str = 'scribe_v2' polling_interval: int = 20 @classmethod diff --git a/preprocessor/scripts/compare_scribe_models.py b/preprocessor/scripts/compare_scribe_models.py new file mode 100644 index 000000000..95fe43745 --- /dev/null +++ b/preprocessor/scripts/compare_scribe_models.py @@ -0,0 +1,278 @@ +# pylint: skip-file +import argparse +import json +import os +from pathlib import Path +import subprocess +import tempfile +import time +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +from elevenlabs.client import ElevenLabs +from elevenlabs.core import ApiError + +_ELEVENLABS_MODELS = ('scribe_v1', 'scribe_v2') +_WHISPER_MODEL = 
'large-v3-turbo' +_POLLING_INTERVAL = 20 +_MAX_ATTEMPTS = 120 +_ADDITIONAL_FORMATS: List[Dict[str, Any]] = [ + {'format': 'srt'}, + { + 'format': 'segmented_json', + 'include_speakers': True, + 'include_timestamps': True, + 'segment_on_silence_longer_than_s': 0.5, + 'max_segment_duration_s': 10.0, + 'max_segment_chars': 200, + }, +] + +_WHISPER_DOCKER_SCRIPT = """ +import json, sys +from pathlib import Path +from faster_whisper import WhisperModel + +audio_path = sys.argv[1] +out_path = sys.argv[2] +model_name = sys.argv[3] + +print(f'[whisper] Loading {model_name} on cuda...') +model = WhisperModel( + model_name, + device='cuda', + compute_type='float16', + download_root='/models/huggingface', +) + +print(f'[whisper] Transcribing {Path(audio_path).name}...') +segments_iter, info = model.transcribe( + audio_path, + language='pl', + beam_size=10, + temperature=0.0, + vad_filter=True, +) + +segments = [] +text_parts = [] +for seg in segments_iter: + text = seg.text.strip() + text_parts.append(text) + segments.append({ + 'text': text, + 'start': round(seg.start, 3), + 'end': round(seg.end, 3), + 'words': [ + {'text': w.word, 'start': round(w.start, 3), 'end': round(w.end, 3)} + for w in (seg.words or []) + ], + }) + +result = {'text': ' '.join(text_parts), 'language_code': info.language, 'segments': segments} +Path(out_path).write_text(json.dumps(result, ensure_ascii=False, indent=2), encoding='utf-8') +print(f'[whisper] Done. 
{len(segments)} segments saved to {out_path}') +""" + + +def _extract_audio(video_path: Path, audio_path: Path) -> None: + print(f'Extracting audio from {video_path.name}...') + subprocess.run( + ['ffmpeg', '-y', '-i', str(video_path), '-vn', '-acodec', 'aac', '-b:a', '192k', str(audio_path)], + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + check=True, + ) + print(f'Audio extracted: {audio_path.name} ({audio_path.stat().st_size / 1024 / 1024:.1f} MB)') + + +def _submit_job(client: ElevenLabs, audio_path: Path, model_id: str, language_code: str, diarize: bool) -> str: + print(f'[{model_id}] Submitting transcription job...') + with open(audio_path, 'rb') as f: + audio_data = f.read() + + response = client.speech_to_text.convert( + file=audio_data, + model_id=model_id, + language_code=language_code, + tag_audio_events=True, + timestamps_granularity='character', + diarize=diarize, + use_multi_channel=False, + additional_formats=_ADDITIONAL_FORMATS, + webhook=True, + ) + job_id = response.transcription_id + print(f'[{model_id}] Job submitted. 
ID: {job_id}') + return job_id + + +def _poll_job(client: ElevenLabs, model_id: str, job_id: str) -> Optional[Any]: + print(f'[{model_id}] Polling for results (ID: {job_id})...') + for attempt in range(1, _MAX_ATTEMPTS + 1): + try: + result = client.speech_to_text.transcripts.get(transcription_id=job_id) + print(f'[{model_id}] Done after {attempt} poll(s).') + return result + except ApiError as e: + if e.status_code == 404: + print(f'[{model_id}] Not ready yet (attempt {attempt}/{_MAX_ATTEMPTS}), waiting {_POLLING_INTERVAL}s...') + time.sleep(_POLLING_INTERVAL) + else: + raise + raise TimeoutError(f'[{model_id}] Timeout after {_MAX_ATTEMPTS} attempts') + + +def _elevenlabs_result_to_dict(result: Any) -> Dict[str, Any]: + data: Dict[str, Any] = { + 'text': result.text, + 'language_code': result.language_code, + 'segments': [], + 'srt': None, + } + + if not result.additional_formats: + return data + + for fmt in result.additional_formats: + if fmt.requested_format == 'srt': + data['srt'] = fmt.content + elif fmt.requested_format == 'segmented_json': + segmented = json.loads(fmt.content) + for seg in segmented.get('segments', []): + words = seg.get('words', []) + if not words: + continue + non_spacing = [w for w in words if w.get('type') != 'spacing'] + segment: Dict[str, Any] = {'text': seg.get('text', '').strip(), 'words': words} + if non_spacing: + segment['start'] = non_spacing[0].get('start') + segment['end'] = non_spacing[-1].get('end') + segment['speaker'] = non_spacing[0].get('speaker_id') + data['segments'].append(segment) + + return data + + +def _transcribe_whisper_docker(audio_path: Path, json_out: Path) -> None: + print(f'[whisper_{_WHISPER_MODEL}] Running via Docker (model download may take a moment on first run)...') + output_dir = audio_path.parent.resolve() + + audio_in_container = f'/compare_output/{audio_path.name}' + json_in_container = f'/compare_output/{json_out.name}' + + cmd = [ + 'docker', 'run', '--rm', '--gpus', 'all', + '--entrypoint', 
'python', + '-v', f'ranchbot-ai-models:/models', + '-v', f'{output_dir}:/compare_output', + 'ranczo-preprocessor:latest', + '-c', _WHISPER_DOCKER_SCRIPT, + audio_in_container, json_in_container, _WHISPER_MODEL, + ] + + subprocess.run(cmd, check=True) + + +def _save_elevenlabs_result(data: Dict[str, Any], output_dir: Path, model_label: str, stem: str) -> Tuple[Path, Optional[Path]]: + json_path = output_dir / f'{stem}_{model_label}.json' + srt_content = data.pop('srt', None) + json_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding='utf-8') + + srt_path = None + if srt_content: + srt_path = output_dir / f'{stem}_{model_label}.srt' + srt_path.write_text(srt_content, encoding='utf-8') + + return json_path, srt_path + + +def _json_exists(output_dir: Path, model_label: str, stem: str) -> bool: + return (output_dir / f'{stem}_{model_label}.json').exists() + + +def main() -> None: + parser = argparse.ArgumentParser(description='Compare scribe_v1, scribe_v2 and Whisper transcription quality.') + parser.add_argument('video', type=Path, help='Path to the video file') + parser.add_argument('--output-dir', '-o', type=Path, default=None, help='Output directory (default: same as video)') + parser.add_argument('--language', default='pol', help='ElevenLabs language code (default: pol)') + parser.add_argument('--no-diarize', action='store_true', help='Disable speaker diarization') + parser.add_argument('--no-whisper', action='store_true', help='Skip Whisper transcription') + args = parser.parse_args() + + video_path: Path = args.video.resolve() + if not video_path.exists(): + raise FileNotFoundError(f'Video file not found: {video_path}') + + output_dir: Path = (args.output_dir or video_path.parent).resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + stem = video_path.stem + diarize = not args.no_diarize + whisper_label = f'whisper_{_WHISPER_MODEL}' + + elevenlabs_to_run = [m for m in _ELEVENLABS_MODELS if not _json_exists(output_dir, m, stem)] + 
whisper_needed = not args.no_whisper and not _json_exists(output_dir, whisper_label, stem) + need_audio = bool(elevenlabs_to_run) or whisper_needed + + for m in _ELEVENLABS_MODELS: + if _json_exists(output_dir, m, stem): + print(f'[{m}] Already exists, skipping API call.') + if not whisper_needed and not args.no_whisper: + print(f'[{whisper_label}] Already exists, skipping.') + + job_ids: Dict[str, str] = {} + audio_temp_dir: Optional[tempfile.TemporaryDirectory] = None # type: ignore[type-arg] + audio_path: Optional[Path] = None + + if need_audio: + audio_temp_dir = tempfile.TemporaryDirectory() + audio_path = Path(audio_temp_dir.name) / f'{stem}.aac' + _extract_audio(video_path, audio_path) + + if elevenlabs_to_run: + api_key = os.getenv('ELEVEN_API_KEY', '') + if not api_key: + raise ValueError('ELEVEN_API_KEY environment variable is not set.') + client = ElevenLabs(api_key=api_key) + + assert audio_path is not None + for model in elevenlabs_to_run: + job_ids[model] = _submit_job(client, audio_path, model, args.language, diarize) + + print(f'\n{len(job_ids)} job(s) submitted. 
Polling for results...\n') + for model in elevenlabs_to_run: + result = _poll_job(client, model, job_ids[model]) + data = _elevenlabs_result_to_dict(result) + json_path, srt_path = _save_elevenlabs_result(data, output_dir, model, stem) + print(f'[{model}] Saved: {json_path.name} ({len(data["segments"])} segments, {len(data["text"])} chars)') + if srt_path: + print(f'[{model}] Saved: {srt_path.name}') + + if whisper_needed: + assert audio_path is not None + whisper_audio = output_dir / f'_whisper_tmp_{stem}.aac' + import shutil + shutil.copy2(audio_path, whisper_audio) + try: + json_out = output_dir / f'{stem}_{whisper_label}.json' + _transcribe_whisper_docker(whisper_audio, json_out) + if json_out.exists(): + data = json.loads(json_out.read_text(encoding='utf-8')) + print(f'[{whisper_label}] Saved: {json_out.name} ({len(data["segments"])} segments, {len(data["text"])} chars)') + finally: + whisper_audio.unlink(missing_ok=True) + + if audio_temp_dir: + audio_temp_dir.cleanup() + + print(f'\nDone. 
Compare files in: {output_dir}') + + +if __name__ == '__main__': + main() diff --git a/preprocessor/scripts/deploy_to_nas.py b/preprocessor/scripts/deploy_to_nas.py index df06747a4..171dc5146 100644 --- a/preprocessor/scripts/deploy_to_nas.py +++ b/preprocessor/scripts/deploy_to_nas.py @@ -1,3 +1,4 @@ +# pylint: skip-file import argparse from concurrent.futures import ( ThreadPoolExecutor, diff --git a/preprocessor/scripts/split_double_episodes.py b/preprocessor/scripts/split_double_episodes.py index 967e1ef83..5b856a34d 100644 --- a/preprocessor/scripts/split_double_episodes.py +++ b/preprocessor/scripts/split_double_episodes.py @@ -1,3 +1,4 @@ +# pylint: skip-file import argparse import json import math diff --git a/preprocessor/services/transcription/engines/elevenlabs_engine.py b/preprocessor/services/transcription/engines/elevenlabs_engine.py index e585961b7..15d18f368 100644 --- a/preprocessor/services/transcription/engines/elevenlabs_engine.py +++ b/preprocessor/services/transcription/engines/elevenlabs_engine.py @@ -74,6 +74,7 @@ def __submit_job(self, audio_path: Path) -> str: tag_audio_events=True, timestamps_granularity='character', diarize=self.__diarize, + num_speakers=32, use_multi_channel=False, additional_formats=self.__additional_formats, webhook=True, diff --git a/preprocessor/services/transcription/sound_classification.py b/preprocessor/services/transcription/sound_classification.py index 9a85e8cc1..be0ab12bf 100644 --- a/preprocessor/services/transcription/sound_classification.py +++ b/preprocessor/services/transcription/sound_classification.py @@ -15,7 +15,7 @@ def is_sound_event(word: Dict[str, Any]) -> bool: return True text = word.get(WordKeys.TEXT, word.get(WordKeys.WORD, '')).strip() - return bool(re.match(r'^\(.*\)$', text)) + return bool(re.match(r'^[\(\[].*[\)\]]$', text)) def classify_segment(segment: Dict[str, Any]) -> str: diff --git a/preprocessor/steps/text/transcription_step.py b/preprocessor/steps/text/transcription_step.py index 
29dbfe3af..95519d7c6 100644 --- a/preprocessor/steps/text/transcription_step.py +++ b/preprocessor/steps/text/transcription_step.py @@ -82,7 +82,7 @@ def _load_from_cache( ) def __create_engine(self, context: ExecutionContext) -> TranscriptionEngine: - if self.config.mode == '11labs': + if self.config.mode in {'11labs', 'elevenlabs'}: context.logger.info('Creating ElevenLabs transcription engine') return ElevenLabsEngine(logger=context.logger) From 04c8a56a72ea2fd450bfe0638a68fbc96430990f Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 29 Mar 2026 13:51:28 +0200 Subject: [PATCH 85/89] Add series face clustering and cluster-based refs Introduce series-wide face clustering and support using labeled cluster folders as character reference sources. Key changes: - Add SeriesFaceClusteringStep to extract embeddings from all frames, cluster them, create numbered cluster folders and write a _cluster_index.json. - Add ClusterFolderManager service to create cluster folders, extract dominant face vectors from cluster folders, and manage labeled folder checks. - Extend CharacterReferenceProcessorStep to support a "clusters" reference_source: validate labeled cluster folders, extract per-character vectors from labeled folders, and emit metadata; preserve existing web-based processing. - Add configuration fields: CharacterReferencesConfig.source (default 'web') and CharacterReferenceProcessorConfig.reference_source (Literal["web","clusters"]). SeriesConfig parsing uses the new source field. - Wire pipeline_factory to conditionally run series clustering and adjust step phases/dependencies when reference_source == "clusters"; register steps in correct order. - Export ClusterFolderManager from services.characters.__init__ and add SeriesFaceClusteringConfig placeholder. Defaults retain existing web scraping behavior; the new cluster flow enables manual labeling of clusters for improved character reference vectors. 
--- preprocessor/app/pipeline_factory.py | 109 +++++++----- preprocessor/config/series_config.py | 2 + preprocessor/config/step_configs.py | 6 + preprocessor/scripts/deploy_to_nas.py | 41 ++++- preprocessor/series_configs/defaults.json | 3 +- preprocessor/services/characters/__init__.py | 2 + .../characters/cluster_folder_manager.py | 158 ++++++++++++++++++ .../services/characters/face_clusterer.py | 35 +++- .../character_reference_processor_step.py | 140 +++++++++++++++- .../vision/series_face_clustering_step.py | 120 +++++++++++++ 10 files changed, 561 insertions(+), 55 deletions(-) create mode 100644 preprocessor/services/characters/cluster_folder_manager.py create mode 100644 preprocessor/steps/vision/series_face_clustering_step.py diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index c5f366cfb..8c387a193 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -25,6 +25,7 @@ ObjectDetectionConfig, ResolutionAnalysisConfig, SceneDetectionConfig, + SeriesFaceClusteringConfig, SoundEventEmbeddingConfig, SoundEventsConfig, SoundSeparationConfig, @@ -72,6 +73,7 @@ from preprocessor.steps.vision.face_clustering_step import FaceClusteringStep from preprocessor.steps.vision.image_hashing_step import ImageHashStep from preprocessor.steps.vision.object_detection_step import ObjectDetectionStep +from preprocessor.steps.vision.series_face_clustering_step import SeriesFaceClusteringStep # Phase Definitions SCRAPING = Phase("SCRAPING", color="blue") @@ -82,6 +84,7 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=too-many-locals,too-many-statements series_config = SeriesConfig.load(series_name) + _reference_source = series_config.scraping.character_references.source # ========================================================= # SCRAPING PHASE @@ -127,43 +130,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t ), ) - 
character_references = StepBuilder( - phase=SCRAPING, - step_class=CharacterReferenceStep, - description="Downloads character reference images from the web", - produces=[ - DirectoryOutput( - pattern="character_faces", - subdir="", - expected_file_pattern="**/*.jpg", - min_files=1, - min_size_per_file_bytes=1024, - ), - ], - needs=[characters_metadata], - config=CharacterReferenceConfig( - search_engine=series_config.scraping.character_references.search_engine, - images_per_character=series_config.scraping.character_references.images_per_character, - search_query_template=series_config.scraping.character_references.search_query_template, - ), - ) - - character_reference_vectors = StepBuilder( - phase=SCRAPING, - step_class=CharacterReferenceProcessorStep, - description="Processes character reference images into face embedding vectors", - produces=[ - DirectoryOutput( - pattern="character_references_processed", - subdir="", - expected_file_pattern="**/face_vector.npy", - min_files=1, - min_size_per_file_bytes=100, - ), - ], - needs=[character_references], - config=CharacterReferenceProcessorConfig(), - ) # ========================================================= # PROCESSING PHASE: VIDEO @@ -406,6 +372,70 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t # ========================================================= # PROCESSING PHASE: VISION # ========================================================= + series_face_clusters = StepBuilder( + phase=PROCESSING, + step_class=SeriesFaceClusteringStep, + description="Clusters all faces across the series into numbered folders for manual labeling", + produces=[ + JsonFileOutput( + pattern="_cluster_index.json", + subdir="character_clusters", + min_size_bytes=10, + ), + ], + needs=[exported_frames], + config=SeriesFaceClusteringConfig(), + ) + + _character_ref_vectors_output = DirectoryOutput( + pattern="character_references_processed", + subdir="", + expected_file_pattern="**/face_vector.npy", 
+ min_files=1, + min_size_per_file_bytes=100, + ) + + if _reference_source == "clusters": + character_reference_vectors = StepBuilder( + phase=PROCESSING, + step_class=CharacterReferenceProcessorStep, + description="Builds character face vectors from manually labeled cluster frames", + produces=[_character_ref_vectors_output], + needs=[series_face_clusters], + config=CharacterReferenceProcessorConfig(reference_source="clusters"), + ) + _character_ref_steps = [character_reference_vectors] + else: + character_references = StepBuilder( + phase=SCRAPING, + step_class=CharacterReferenceStep, + description="Downloads character reference images from the web", + produces=[ + DirectoryOutput( + pattern="character_faces", + subdir="", + expected_file_pattern="**/*.jpg", + min_files=1, + min_size_per_file_bytes=1024, + ), + ], + needs=[characters_metadata], + config=CharacterReferenceConfig( + search_engine=series_config.scraping.character_references.search_engine, + images_per_character=series_config.scraping.character_references.images_per_character, + search_query_template=series_config.scraping.character_references.search_query_template, + ), + ) + character_reference_vectors = StepBuilder( + phase=SCRAPING, + step_class=CharacterReferenceProcessorStep, + description="Processes character reference images into face embedding vectors", + produces=[_character_ref_vectors_output], + needs=[character_references], + config=CharacterReferenceProcessorConfig(reference_source="web"), + ) + _character_ref_steps = [character_references, character_reference_vectors] + image_hashes = StepBuilder( phase=PROCESSING, step_class=ImageHashStep, @@ -574,8 +604,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(episodes_metadata) pipeline.register(characters_metadata) - pipeline.register(character_references) - pipeline.register(character_reference_vectors) pipeline.register(resolution_analysis) pipeline.register(transcoded_videos) @@ 
-595,6 +623,9 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(image_hashes) pipeline.register(video_embeddings) + pipeline.register(series_face_clusters) + for _step in _character_ref_steps: + pipeline.register(_step) pipeline.register(object_detections) pipeline.register(character_detections) pipeline.register(emotion_data) diff --git a/preprocessor/config/series_config.py b/preprocessor/config/series_config.py index 31024b9d5..dfa224881 100644 --- a/preprocessor/config/series_config.py +++ b/preprocessor/config/series_config.py @@ -41,6 +41,7 @@ class CharacterReferencesConfig: images_per_character: int search_engine: str search_query_template: str + source: str = 'clusters' @dataclass @@ -155,6 +156,7 @@ def __load_from_dict(data: Dict[str, Any]) -> 'SeriesConfig': search_engine=data['scraping']['character_references']['search_engine'], images_per_character=data['scraping']['character_references']['images_per_character'], search_query_template=data['scraping']['character_references']['search_query_template'], + source=data['scraping']['character_references'].get('source', 'web'), ), ), processing=SeriesConfig.__build_processing_config(data), diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index 68f030b72..a6b51cbf9 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -2,6 +2,7 @@ from typing import ( Dict, List, + Literal, Optional, ) @@ -194,6 +195,10 @@ class FaceClusteringConfig(BaseModel): max_parallel_episodes: int = Field(default=1, ge=1, le=8) +class SeriesFaceClusteringConfig(BaseModel): + prefetch_workers: int = Field(default=8, ge=1, le=32) + + class ObjectDetectionConfig(BaseModel): batch_size: int = Field(default=8, ge=1) conf_threshold: float = Field(default=0.3, ge=0.0, le=1.0) @@ -235,4 +240,5 @@ class CharacterReferenceConfig(BaseModel): class CharacterReferenceProcessorConfig(BaseModel): + reference_source: 
Literal["web", "clusters"] = "clusters" similarity_threshold: float = Field(default=0.45, ge=0.0, le=1.0) diff --git a/preprocessor/scripts/deploy_to_nas.py b/preprocessor/scripts/deploy_to_nas.py index 171dc5146..c86baab87 100644 --- a/preprocessor/scripts/deploy_to_nas.py +++ b/preprocessor/scripts/deploy_to_nas.py @@ -4,6 +4,7 @@ ThreadPoolExecutor, as_completed, ) +import os from pathlib import Path import shutil import sys @@ -57,13 +58,28 @@ def _print_summary(total: int, copied: int, skipped: int, failed: int, dry_run: print(f" Failed : {failed}") +def _is_changed(src: Path, dst: Path) -> bool: + if not dst.exists(): + return True + src_stat = os.stat(src) + dst_stat = os.stat(dst) + if src_stat.st_size != dst_stat.st_size: + return True + return src_stat.st_mtime > dst_stat.st_mtime + 1 + + def _filter_files_to_copy( - pairs: List[Tuple[Path, Path]], overwrite: bool, + pairs: List[Tuple[Path, Path]], overwrite: bool, diff_only: bool, ) -> Tuple[List[Tuple[Path, Path]], int]: to_copy = [] skipped = 0 for src, dst in pairs: - if not overwrite and dst.exists(): + if diff_only: + if _is_changed(src, dst): + to_copy.append((src, dst)) + else: + skipped += 1 + elif not overwrite and dst.exists(): skipped += 1 else: to_copy.append((src, dst)) @@ -101,6 +117,7 @@ def deploy( dry_run: bool, workers: int, overwrite: bool, + diff_only: bool, ) -> int: source_series_dir = source_base / series target_series_dir = target_base / series @@ -109,9 +126,15 @@ def deploy( print(f"ERROR: Source directory not found: {source_series_dir}") return 1 + mode_flags = f"{'DRY RUN' if dry_run else 'COPY'} | workers={workers}" + if diff_only: + mode_flags += " | diff-only" + elif overwrite: + mode_flags += " | overwrite" + print(f"Source : {source_series_dir}") print(f"Target : {target_series_dir}") - print(f"Mode : {'DRY RUN' if dry_run else 'COPY'} | workers={workers} | overwrite={overwrite}") + print(f"Mode : {mode_flags}") print() pairs = _collect_files(source_series_dir, 
target_series_dir) @@ -119,9 +142,10 @@ def deploy( print("No files found to copy.") return 0 - to_copy, skipped = _filter_files_to_copy(pairs, overwrite) + to_copy, skipped = _filter_files_to_copy(pairs, overwrite, diff_only) + skip_reason = "unchanged (size+mtime)" if diff_only else "already exist, use --overwrite to replace" print(f"Files to copy : {len(to_copy)}") - print(f"Files skipped : {skipped} (already exist, use --overwrite to replace)") + print(f"Files skipped : {skipped} ({skip_reason})") print() if not to_copy: @@ -165,6 +189,11 @@ def main() -> None: action="store_true", help="Show what would be copied without actually copying", ) + parser.add_argument( + "--diff-only", + action="store_true", + help="Only copy files that are missing or differ from target (by size or modification time)", + ) parser.add_argument( "--workers", type=int, @@ -177,7 +206,7 @@ def main() -> None: source_base = _resolve_source_base(args.source_path) target_base = Path(args.target_path) - sys.exit(deploy(source_base, target_base, args.series, args.dry_run, args.workers, args.overwrite)) + sys.exit(deploy(source_base, target_base, args.series, args.dry_run, args.workers, args.overwrite, args.diff_only)) if __name__ == "__main__": diff --git a/preprocessor/series_configs/defaults.json b/preprocessor/series_configs/defaults.json index 8f694f511..126d6aadf 100644 --- a/preprocessor/series_configs/defaults.json +++ b/preprocessor/series_configs/defaults.json @@ -37,7 +37,8 @@ "character_references": { "images_per_character": 3, "search_engine": "duckduckgo", - "search_query_template": "Serial {series_name} {char_name} posta\u0107" + "search_query_template": "Serial {series_name} {char_name} posta\u0107", + "source": "clusters" }, "characters": { "parser_mode": "normal" diff --git a/preprocessor/services/characters/__init__.py b/preprocessor/services/characters/__init__.py index 17fa35797..0140041a3 100644 --- a/preprocessor/services/characters/__init__.py +++ 
b/preprocessor/services/characters/__init__.py @@ -1,3 +1,4 @@ +from preprocessor.services.characters.cluster_folder_manager import ClusterFolderManager from preprocessor.services.characters.face_clusterer import FaceClusterer from preprocessor.services.characters.face_detection import FaceDetector from preprocessor.services.characters.image_search import ( @@ -11,6 +12,7 @@ 'BaseImageSearch', 'BrowserBingImageSearch', 'BrowserDuckDuckGoImageSearch', + 'ClusterFolderManager', 'FaceClusterer', 'FaceDetector', 'GoogleImageSearch', diff --git a/preprocessor/services/characters/cluster_folder_manager.py b/preprocessor/services/characters/cluster_folder_manager.py new file mode 100644 index 000000000..e82a9ae7b --- /dev/null +++ b/preprocessor/services/characters/cluster_folder_manager.py @@ -0,0 +1,158 @@ +from collections import defaultdict +import hashlib +from pathlib import Path +import shutil +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, +) + +import cv2 +from insightface.app import FaceAnalysis +import numpy as np + +from preprocessor.services.core.logging import ErrorHandlingLogger + + +class ClusterFolderManager: + @staticmethod + def create_cluster_folders( + face_data: List[Dict[str, Any]], + labels: np.ndarray, + output_dir: Path, + logger: Optional[ErrorHandlingLogger] = None, + ) -> int: + groups: Dict[int, List[Dict[str, Any]]] = defaultdict(list) + for face_info, label in zip(face_data, labels): + if int(label) == -1: + continue + groups[int(label)].append(face_info) + + sorted_clusters = sorted(groups.items(), key=lambda x: len(x[1]), reverse=True) + output_dir.mkdir(parents=True, exist_ok=True) + + for rank, (_, faces) in enumerate(sorted_clusters): + cluster_dir = output_dir / str(rank) + cluster_dir.mkdir(exist_ok=True) + + ranked_frames = ClusterFolderManager._rank_frames_by_centrality(faces) + for frame_rank, (frame_path, _) in enumerate(ranked_frames): + hash8 = hashlib.sha256(str(frame_path).encode()).hexdigest()[:8] + 
dest_name = f"{frame_rank:04d}_{frame_path.stem}_{hash8}{frame_path.suffix}" + dest_path = cluster_dir / dest_name + if not dest_path.exists(): + shutil.copy2(frame_path, dest_path) + + cluster_count = len(sorted_clusters) + if logger: + logger.info(f"Created {cluster_count} cluster folders in {output_dir}") + return cluster_count + + @staticmethod + def _rank_frames_by_centrality( + faces: List[Dict[str, Any]], + ) -> List[Tuple[Path, float]]: + vectors = np.array([f['vector'] for f in faces]) + centroid = np.mean(vectors, axis=0) + norm = np.linalg.norm(centroid) + if norm > 1e-6: + centroid /= norm + + frame_max_sim: Dict[Path, float] = {} + for face_info in faces: + frame_path: Path = face_info['frame_path'] + sim = float(np.dot(face_info['vector'], centroid)) + if frame_path not in frame_max_sim or sim > frame_max_sim[frame_path]: + frame_max_sim[frame_path] = sim + + return sorted(frame_max_sim.items(), key=lambda x: x[1], reverse=True) + + @staticmethod + def get_labeled_folders(cluster_dir: Path) -> Dict[str, Path]: + if not cluster_dir.exists(): + return {} + return { + d.name: d + for d in sorted(cluster_dir.iterdir()) + if d.is_dir() and not d.name.isdigit() + } + + @staticmethod + def is_complete( + cluster_dir: Path, + character_names: List[str], + ) -> Tuple[bool, List[str]]: + labeled = ClusterFolderManager.get_labeled_folders(cluster_dir) + normalized_labels = {ClusterFolderManager._normalize_name(n) for n in labeled} + missing = [ + name for name in character_names + if ClusterFolderManager._normalize_name(name) not in normalized_labels + ] + return len(missing) == 0, missing + + @staticmethod + def extract_face_vector( + cluster_folder: Path, + face_app: FaceAnalysis, + logger: Optional[ErrorHandlingLogger] = None, + ) -> Optional[np.ndarray]: + frame_files = sorted(cluster_folder.glob('*.jpg')) + if not frame_files: + if logger: + logger.warning(f"No frames in {cluster_folder}") + return None + + all_embeddings: List[np.ndarray] = [] + for 
frame_path in frame_files: + img = cv2.imread(str(frame_path)) + if img is None: + continue + for face in face_app.get(img): + all_embeddings.append(face.normed_embedding) + + if not all_embeddings: + if logger: + logger.warning(f"No faces detected in {cluster_folder}") + return None + + vectors = np.array(all_embeddings) + dominant = ClusterFolderManager._find_dominant_embedding(vectors) + if dominant is None: + return None + + norm = np.linalg.norm(dominant) + if norm < 1e-6: + return None + return dominant / norm + + @staticmethod + def _find_dominant_embedding(vectors: np.ndarray) -> Optional[np.ndarray]: + if len(vectors) == 1: + return vectors[0].copy() + + centroid = np.mean(vectors, axis=0) + norm = np.linalg.norm(centroid) + if norm < 1e-6: + return None + centroid = centroid / norm + + for _ in range(3): + sims = vectors @ centroid + threshold = float(np.percentile(sims, 30)) + mask = sims >= threshold + if mask.sum() < 1: + break + centroid = np.mean(vectors[mask], axis=0) + norm = np.linalg.norm(centroid) + if norm < 1e-6: + break + centroid = centroid / norm + + return centroid + + @staticmethod + def _normalize_name(name: str) -> str: + return name.lower().replace(' ', '_').replace('-', '_') diff --git a/preprocessor/services/characters/face_clusterer.py b/preprocessor/services/characters/face_clusterer.py index f859fc059..4cd7cfbbc 100644 --- a/preprocessor/services/characters/face_clusterer.py +++ b/preprocessor/services/characters/face_clusterer.py @@ -1,10 +1,17 @@ -from collections import defaultdict +from collections import ( + defaultdict, + deque, +) +from concurrent.futures import ThreadPoolExecutor import gc +from itertools import islice from pathlib import Path from typing import ( Any, Dict, + Generator, List, + Optional, Tuple, ) @@ -21,11 +28,12 @@ class FaceClusterer: def extract_face_embeddings( frame_files: List[Path], face_app: FaceAnalysis, + prefetch_workers: int = 4, ) -> List[Dict[str, Any]]: face_data: List[Dict[str, Any]] = [] 
- for frame_path in frame_files: - img = cv2.imread(str(frame_path)) + prefetch_size = prefetch_workers * 4 + for frame_path, img in FaceClusterer._prefetch_images(frame_files, prefetch_workers, prefetch_size): if img is None: continue @@ -47,6 +55,27 @@ def extract_face_embeddings( return face_data + @staticmethod + def _prefetch_images( + frame_files: List[Path], + workers: int, + prefetch_size: int, + ) -> Generator[Tuple[Path, Optional[np.ndarray]], None, None]: + with ThreadPoolExecutor(max_workers=workers) as pool: + it = iter(frame_files) + pending: deque = deque( + (path, pool.submit(cv2.imread, str(path))) + for path in islice(it, prefetch_size) + ) + while pending: + path, future = pending.popleft() + try: + next_path = next(it) + pending.append((next_path, pool.submit(cv2.imread, str(next_path)))) + except StopIteration: + pass + yield path, future.result() + @staticmethod def cluster_embeddings( face_data: List[Dict[str, Any]], diff --git a/preprocessor/steps/vision/character_reference_processor_step.py b/preprocessor/steps/vision/character_reference_processor_step.py index bf39298ca..6de6a65f8 100644 --- a/preprocessor/steps/vision/character_reference_processor_step.py +++ b/preprocessor/steps/vision/character_reference_processor_step.py @@ -1,11 +1,16 @@ -# pylint: disable=duplicate-code +from datetime import datetime +import json from pathlib import Path from typing import ( List, Tuple, ) +from insightface.app import FaceAnalysis +import numpy as np + from preprocessor.config.output_paths import get_base_output_dir +from preprocessor.config.settings_instance import settings from preprocessor.config.step_configs import CharacterReferenceProcessorConfig from preprocessor.core.artifacts import SourceVideo from preprocessor.core.base_step import PipelineStep @@ -14,6 +19,11 @@ DirectoryOutput, OutputDescriptor, ) +from preprocessor.services.characters import ( + FaceClusterer, + FaceDetector, +) +from 
preprocessor.services.characters.cluster_folder_manager import ClusterFolderManager from preprocessor.services.scraping.reference_processor import CharacterReferenceProcessor @@ -50,16 +60,84 @@ def _load_from_cache( def _process( self, input_data: SourceVideo, context: ExecutionContext, ) -> SourceVideo: - characters_dir, output_dir = self.__resolve_paths(context) - self.__validate_input_directory(characters_dir) - self.__run_reference_processor(characters_dir, output_dir, context) - return input_data + if self.config.reference_source == "clusters": + return self.__process_from_clusters(input_data, context) + return self.__process_from_web(input_data, context) @staticmethod def __resolve_paths(context: ExecutionContext) -> Tuple[Path, Path]: base_dir = get_base_output_dir(context.series_name) return base_dir / 'character_faces', base_dir / 'character_references_processed' + def __process_from_web( + self, + input_data: SourceVideo, + context: ExecutionContext, + ) -> SourceVideo: + characters_dir, output_dir = self.__resolve_paths(context) + self.__validate_web_input_directory(characters_dir) + self.__run_reference_processor(characters_dir, output_dir, context) + return input_data + + def __process_from_clusters( + self, + input_data: SourceVideo, + context: ExecutionContext, + ) -> SourceVideo: + cluster_dir = context.base_output_dir / 'character_clusters' + _, output_dir = self.__resolve_paths(context) + + character_names = self.__load_character_names(context) + is_complete, missing = ClusterFolderManager.is_complete(cluster_dir, character_names) + + if not is_complete: + context.logger.warning( + f"Cluster labeling incomplete. Missing characters: {missing}", + ) + raise RuntimeError( + f"Not all characters have labeled cluster folders. 
Missing: {missing}", + ) + + labeled_folders = ClusterFolderManager.get_labeled_folders(cluster_dir) + context.logger.info( + f"Processing {len(labeled_folders)} labeled cluster folders into face vectors...", + ) + + face_app = None + try: + face_app = FaceDetector.init() + for char_name, folder in labeled_folders.items(): + self.__process_cluster_character( + char_name, folder, output_dir, face_app, context, + ) + finally: + if face_app is not None: + FaceClusterer.cleanup_gpu_memory() + + context.logger.info(f"Cluster-based face vectors saved to: {output_dir}") + return input_data + + def __process_cluster_character( + self, + char_name: str, + cluster_folder: Path, + output_dir: Path, + face_app: FaceAnalysis, + context: ExecutionContext, + ) -> None: + vector = ClusterFolderManager.extract_face_vector( + cluster_folder, face_app, context.logger, + ) + if vector is None: + context.logger.warning(f"Could not extract face vector for '{char_name}', skipping") + return + + char_out = output_dir / char_name + char_out.mkdir(parents=True, exist_ok=True) + np.save(char_out / 'face_vector.npy', vector) + self.__save_cluster_metadata(char_out, char_name, cluster_folder, vector) + context.logger.info(f"Saved face vector for '{char_name}'") + def __run_reference_processor( self, characters_dir: Path, @@ -84,9 +162,59 @@ def __run_reference_processor( context.logger.info(f"Character reference vectors saved to: {output_dir}") @staticmethod - def __validate_input_directory(characters_dir: Path) -> None: + def __load_character_names(context: ExecutionContext) -> List[str]: + characters_json = context.base_output_dir / f'{context.series_name}_characters.json' + if not characters_json.exists(): + raise FileNotFoundError( + f"Characters JSON not found: {characters_json}. 
" + f"Run characters_metadata step first.", + ) + with open(characters_json, 'r', encoding='utf-8') as f: + data = json.load(f) + return [c['name'] for c in data.get('characters', []) if c.get('name')] + + @staticmethod + def __save_cluster_metadata( + char_out: Path, + char_name: str, + cluster_folder: Path, + vector: np.ndarray, + ) -> None: + metadata = { + 'character_name': char_name, + 'source': 'clusters', + 'cluster_folder': str(cluster_folder), + 'processed_at': datetime.now().isoformat(), + 'face_vector_dim': int(vector.shape[0]), + 'processing_params': { + 'face_model': settings.face_recognition.model_name, + }, + } + with open(char_out / 'metadata.json', 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=2) + + @staticmethod + def __validate_web_input_directory(characters_dir: Path) -> None: if not characters_dir.exists(): raise FileNotFoundError( f"Character faces directory not found: {characters_dir}. " f"Run character_reference step first.", ) + + def _check_cache_validity( + self, + output_path: Path, + context: ExecutionContext, + episode_id: str, + cache_description: str, + ) -> bool: + if output_path.exists() and not context.force_rerun: + vectors = list(output_path.rglob('face_vector.npy')) + if vectors: + if not context.is_step_completed(self.name, episode_id): + context.mark_step_completed(self.name, episode_id) + context.logger.info( + f'Skipping {episode_id} ({cache_description}, {len(vectors)} vectors found)', + ) + return True + return False diff --git a/preprocessor/steps/vision/series_face_clustering_step.py b/preprocessor/steps/vision/series_face_clustering_step.py new file mode 100644 index 000000000..8eaeef56c --- /dev/null +++ b/preprocessor/steps/vision/series_face_clustering_step.py @@ -0,0 +1,120 @@ +from pathlib import Path +from typing import ( + Any, + Dict, + List, +) + +from preprocessor.config.settings_instance import settings +from preprocessor.config.step_configs import 
SeriesFaceClusteringConfig +from preprocessor.core.artifacts import SourceVideo +from preprocessor.core.base_step import PipelineStep +from preprocessor.core.context import ExecutionContext +from preprocessor.core.output_descriptors import ( + JsonFileOutput, + OutputDescriptor, +) +from preprocessor.services.characters import ( + FaceClusterer, + FaceDetector, +) +from preprocessor.services.characters.cluster_folder_manager import ClusterFolderManager +from preprocessor.services.io.files import FileOperations + + +class SeriesFaceClusteringStep(PipelineStep[SourceVideo, SourceVideo, SeriesFaceClusteringConfig]): + @property + def is_global(self) -> bool: + return True + + def get_output_descriptors(self) -> List[OutputDescriptor]: + return [ + JsonFileOutput( + pattern='_cluster_index.json', + subdir='character_clusters', + min_size_bytes=10, + ), + ] + + def _get_cache_path(self, input_data: SourceVideo, context: ExecutionContext) -> Path: + return context.base_output_dir / 'character_clusters' / '_cluster_index.json' + + def _load_from_cache( + self, cache_path: Path, input_data: SourceVideo, context: ExecutionContext, + ) -> SourceVideo: + context.logger.info(f"Series character clusters already exist: {cache_path.parent}") + return input_data + + def _process(self, input_data: SourceVideo, context: ExecutionContext) -> SourceVideo: + frames_root = context.base_output_dir / 'frames' + output_dir = context.base_output_dir / 'character_clusters' + + frame_files = self.__collect_frame_files(frames_root) + if not frame_files: + context.logger.warning(f"No frames found in {frames_root}") + return input_data + + context.logger.info( + f"Extracting face embeddings from {len(frame_files)} frames across the series...", + ) + + face_app = None + try: + face_app = FaceDetector.init() + face_data = FaceClusterer.extract_face_embeddings( + frame_files, face_app, self.config.prefetch_workers, + ) + + if not face_data: + context.logger.warning("No faces detected across the 
series") + return input_data + + context.logger.info(f"Clustering {len(face_data)} face embeddings series-wide...") + + clustering = settings.face_clustering + labels = FaceClusterer.cluster_embeddings( + face_data, clustering.min_cluster_size, clustering.min_samples, + ) + + cluster_count = ClusterFolderManager.create_cluster_folders( + face_data=face_data, + labels=labels, + output_dir=output_dir, + logger=context.logger, + ) + + self.__write_cluster_index(output_dir, context.series_name, cluster_count, face_data, frame_files) + + context.logger.info( + f"Series clustering complete: {cluster_count} clusters → {output_dir}", + ) + finally: + if face_app is not None: + FaceClusterer.cleanup_gpu_memory() + + return input_data + + @staticmethod + def __write_cluster_index( + output_dir: Path, + series_name: str, + cluster_count: int, + face_data: List[Dict[str, Any]], + frame_files: List[Path], + ) -> None: + index_data = { + 'series_name': series_name, + 'cluster_count': cluster_count, + 'total_faces': len(face_data), + 'total_frames': len(frame_files), + } + FileOperations.atomic_write_json(output_dir / '_cluster_index.json', index_data) + + @staticmethod + def __collect_frame_files(frames_root: Path) -> List[Path]: + if not frames_root.exists(): + return [] + return sorted([ + f for f in frames_root.rglob('*.jpg') + if f.is_file() and 'frame_' in f.name + ]) From ef0afb955c151475eb8638d4d18024a99c900522 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 29 Mar 2026 14:44:47 +0200 Subject: [PATCH 86/89] Remove legacy face clustering step and validator Delete the per-episode FaceClusteringStep and its FaceClusterValidator and remove the FaceClusteringConfig and ClusterData artifact. Update pipeline_factory and vision step exports to no longer import or register the removed step/config. Adjust validation wiring and EpisodeStats to stop referencing the removed validator. 
Refactor cluster folder handling and face data: ClusterFolderManager now creates separate 'frames' and 'faces' subfolders, ranks frames with bbox info, saves cropped face images via a new _save_face_crop helper, and falls back to legacy folder layout when reading frames. FaceClusterer now includes bbox tuples in extracted face entries to support cropping. These changes prepare for series-level clustering and consolidate face-clustering responsibilities. --- preprocessor/app/pipeline_factory.py | 18 -- preprocessor/config/config.py | 7 +- preprocessor/config/step_configs.py | 4 - preprocessor/config/step_defaults.py | 2 - preprocessor/core/artifacts.py | 5 - .../characters/cluster_folder_manager.py | 68 ++++++-- .../services/characters/face_clusterer.py | 8 + .../services/characters/face_detection.py | 14 +- .../services/validation/episode_stats.py | 3 +- .../validation/validators/__init__.py | 2 - .../validators/face_cluster_validator.py | 44 ----- preprocessor/steps/vision/__init__.py | 3 +- .../steps/vision/face_clustering_step.py | 159 ------------------ .../vision/series_face_clustering_step.py | 6 +- 14 files changed, 79 insertions(+), 264 deletions(-) delete mode 100644 preprocessor/services/validation/validators/face_cluster_validator.py delete mode 100644 preprocessor/steps/vision/face_clustering_step.py diff --git a/preprocessor/app/pipeline_factory.py b/preprocessor/app/pipeline_factory.py index 8c387a193..7504f08a2 100644 --- a/preprocessor/app/pipeline_factory.py +++ b/preprocessor/app/pipeline_factory.py @@ -18,7 +18,6 @@ EmotionDetectionConfig, EpisodeNameEmbeddingConfig, EpisodeScraperConfig, - FaceClusteringConfig, FrameExportConfig, FullEpisodeEmbeddingConfig, ImageHashConfig, @@ -70,7 +69,6 @@ from preprocessor.steps.vision.character_reference_processor_step import CharacterReferenceProcessorStep from preprocessor.steps.vision.embeddings_step import VideoEmbeddingStep from preprocessor.steps.vision.emotion_detection_step import 
EmotionDetectionStep -from preprocessor.steps.vision.face_clustering_step import FaceClusteringStep from preprocessor.steps.vision.image_hashing_step import ImageHashStep from preprocessor.steps.vision.object_detection_step import ObjectDetectionStep from preprocessor.steps.vision.series_face_clustering_step import SeriesFaceClusteringStep @@ -496,20 +494,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t config=EmotionDetectionConfig(), ) - face_clusters = StepBuilder( - phase=PROCESSING, - step_class=FaceClusteringStep, - description="Face clustering using HDBSCAN", - produces=[ - JsonFileOutput( - pattern="{season}/{episode}.json", - min_size_bytes=10, - ), - ], - needs=[exported_frames], - config=FaceClusteringConfig(), - ) - object_detections = StepBuilder( phase=PROCESSING, step_class=ObjectDetectionStep, @@ -543,7 +527,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t video_embeddings, character_detections, emotion_data, - face_clusters, object_detections, ], config=DocumentGenerationConfig(), @@ -629,7 +612,6 @@ def build_pipeline(series_name: str) -> PipelineDefinition: # pylint: disable=t pipeline.register(object_detections) pipeline.register(character_detections) pipeline.register(emotion_data) - pipeline.register(face_clusters) pipeline.register(elastic_documents) pipeline.register(episode_archives) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index 96e48ae56..fce3f9729 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -168,6 +168,7 @@ class EmbeddingSettings(OutputDirMixin): @dataclass(frozen=True) class FaceRecognitionSettings: + det_thresh: float = 0.55 detection_size: Tuple[int, int] = (1280, 1280) model_name: str = 'buffalo_l' @@ -176,8 +177,10 @@ class FaceRecognitionSettings: class FaceClusteringSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'face_clusters' - min_cluster_size: int = 5 - min_samples: int = 3 
+ min_cluster_size: int = 80 + min_face_px: int = 60 + min_samples: int = 15 + min_det_score: float = 0.55 save_noise: bool = True diff --git a/preprocessor/config/step_configs.py b/preprocessor/config/step_configs.py index a6b51cbf9..a4fa7eca6 100644 --- a/preprocessor/config/step_configs.py +++ b/preprocessor/config/step_configs.py @@ -191,10 +191,6 @@ class EmotionDetectionConfig(BaseModel): max_parallel_episodes: int = Field(default=2, ge=1, le=4) -class FaceClusteringConfig(BaseModel): - max_parallel_episodes: int = Field(default=1, ge=1, le=8) - - class SeriesFaceClusteringConfig(BaseModel): prefetch_workers: int = Field(default=8, ge=1, le=32) diff --git a/preprocessor/config/step_defaults.py b/preprocessor/config/step_defaults.py index 32b537965..56a65f20c 100644 --- a/preprocessor/config/step_defaults.py +++ b/preprocessor/config/step_defaults.py @@ -6,7 +6,6 @@ DocumentGenerationConfig, ElasticsearchConfig, EmotionDetectionConfig, - FaceClusteringConfig, FrameExportConfig, ImageHashConfig, ObjectDetectionConfig, @@ -60,7 +59,6 @@ def get_configs(series_name: str) -> Dict[str, object]: ), 'character_detection': CharacterDetectionConfig(threshold=0.7), 'emotion_detection': EmotionDetectionConfig(), - 'face_clustering': FaceClusteringConfig(), 'object_detection': ObjectDetectionConfig(), 'generate_elastic_documents': DocumentGenerationConfig(), 'generate_archives': ArchiveConfig(), diff --git a/preprocessor/core/artifacts.py b/preprocessor/core/artifacts.py index 242f14379..e269ecd7c 100644 --- a/preprocessor/core/artifacts.py +++ b/preprocessor/core/artifacts.py @@ -116,11 +116,6 @@ class EmotionData(EpisodeArtifact): path: Path -@dataclass(frozen=True) -class ClusterData(EpisodeArtifact): - path: Path - - @dataclass(frozen=True) class ObjectDetectionData(EpisodeArtifact): path: Path diff --git a/preprocessor/services/characters/cluster_folder_manager.py b/preprocessor/services/characters/cluster_folder_manager.py index e82a9ae7b..cd4ebed12 100644 --- 
a/preprocessor/services/characters/cluster_folder_manager.py +++ b/preprocessor/services/characters/cluster_folder_manager.py @@ -35,40 +35,74 @@ def create_cluster_folders( output_dir.mkdir(parents=True, exist_ok=True) for rank, (_, faces) in enumerate(sorted_clusters): - cluster_dir = output_dir / str(rank) - cluster_dir.mkdir(exist_ok=True) - - ranked_frames = ClusterFolderManager._rank_frames_by_centrality(faces) - for frame_rank, (frame_path, _) in enumerate(ranked_frames): - hash8 = hashlib.sha256(str(frame_path).encode()).hexdigest()[:8] - dest_name = f"{frame_rank:04d}_{frame_path.stem}_{hash8}{frame_path.suffix}" - dest_path = cluster_dir / dest_name - if not dest_path.exists(): - shutil.copy2(frame_path, dest_path) + ClusterFolderManager.__populate_cluster_dir(output_dir / str(rank), faces) cluster_count = len(sorted_clusters) if logger: logger.info(f"Created {cluster_count} cluster folders in {output_dir}") return cluster_count + @staticmethod + def __populate_cluster_dir( + cluster_dir: Path, + faces: List[Dict[str, Any]], + ) -> None: + frames_dir = cluster_dir / 'frames' + faces_dir = cluster_dir / 'faces' + frames_dir.mkdir(parents=True, exist_ok=True) + faces_dir.mkdir(parents=True, exist_ok=True) + + for frame_rank, (frame_path, _, bbox) in enumerate( + ClusterFolderManager._rank_frames_by_centrality(faces), + ): + hash8 = hashlib.sha256(str(frame_path).encode()).hexdigest()[:8] + dest_name = f"{frame_rank:04d}_{frame_path.stem}_{hash8}{frame_path.suffix}" + + frame_dest = frames_dir / dest_name + if not frame_dest.exists(): + shutil.copy2(frame_path, frame_dest) + + face_dest = faces_dir / dest_name + if not face_dest.exists(): + ClusterFolderManager._save_face_crop(frame_path, bbox, face_dest) + + @staticmethod + def _save_face_crop( + frame_path: Path, + bbox: Tuple[int, int, int, int], + dest_path: Path, + ) -> None: + img = cv2.imread(str(frame_path)) + if img is None: + return + x1, y1, x2, y2 = bbox + crop = img[y1:y2, x1:x2] + if crop.size 
> 0: + cv2.imwrite(str(dest_path), crop) + @staticmethod def _rank_frames_by_centrality( faces: List[Dict[str, Any]], - ) -> List[Tuple[Path, float]]: + ) -> List[Tuple[Path, float, Tuple[int, int, int, int]]]: vectors = np.array([f['vector'] for f in faces]) centroid = np.mean(vectors, axis=0) norm = np.linalg.norm(centroid) if norm > 1e-6: centroid /= norm - frame_max_sim: Dict[Path, float] = {} + frame_best: Dict[Path, Tuple[float, Tuple[int, int, int, int]]] = {} for face_info in faces: frame_path: Path = face_info['frame_path'] sim = float(np.dot(face_info['vector'], centroid)) - if frame_path not in frame_max_sim or sim > frame_max_sim[frame_path]: - frame_max_sim[frame_path] = sim + bbox: Tuple[int, int, int, int] = face_info['bbox'] + if frame_path not in frame_best or sim > frame_best[frame_path][0]: + frame_best[frame_path] = (sim, bbox) - return sorted(frame_max_sim.items(), key=lambda x: x[1], reverse=True) + return sorted( + [(path, sim, bbox) for path, (sim, bbox) in frame_best.items()], + key=lambda x: x[1], + reverse=True, + ) @staticmethod def get_labeled_folders(cluster_dir: Path) -> Dict[str, Path]: @@ -99,7 +133,9 @@ def extract_face_vector( face_app: FaceAnalysis, logger: Optional[ErrorHandlingLogger] = None, ) -> Optional[np.ndarray]: - frame_files = sorted(cluster_folder.glob('*.jpg')) + frames_dir = cluster_folder / 'frames' + search_dir = frames_dir if frames_dir.exists() else cluster_folder + frame_files = sorted(search_dir.glob('*.jpg')) if not frame_files: if logger: logger.warning(f"No frames in {cluster_folder}") diff --git a/preprocessor/services/characters/face_clusterer.py b/preprocessor/services/characters/face_clusterer.py index 4cd7cfbbc..c92c8834f 100644 --- a/preprocessor/services/characters/face_clusterer.py +++ b/preprocessor/services/characters/face_clusterer.py @@ -29,6 +29,8 @@ def extract_face_embeddings( frame_files: List[Path], face_app: FaceAnalysis, prefetch_workers: int = 4, + min_det_score: float = 0.0, + 
min_face_px: int = 0, ) -> List[Dict[str, Any]]: face_data: List[Dict[str, Any]] = [] @@ -38,6 +40,9 @@ def extract_face_embeddings( continue for face_idx, face in enumerate(face_app.get(img)): + if face.det_score < min_det_score: + continue + bbox = face.bbox.astype(int) x1 = max(0, bbox[0]) y1 = max(0, bbox[1]) @@ -46,11 +51,14 @@ def extract_face_embeddings( if x2 <= x1 or y2 <= y1: continue + if (x2 - x1) < min_face_px or (y2 - y1) < min_face_px: + continue face_data.append({ 'vector': face.normed_embedding, 'frame_path': frame_path, 'face_idx': face_idx, + 'bbox': (x1, y1, x2, y2), }) return face_data diff --git a/preprocessor/services/characters/face_detection.py b/preprocessor/services/characters/face_detection.py index ee1b59dcf..639866106 100644 --- a/preprocessor/services/characters/face_detection.py +++ b/preprocessor/services/characters/face_detection.py @@ -57,7 +57,7 @@ def detect_characters_in_frame( return detected @staticmethod - def init() -> FaceAnalysis: + def init(det_thresh: Optional[float] = None) -> FaceAnalysis: model_root = os.getenv('INSIGHTFACE_HOME', os.path.expanduser('~/.insightface')) FaceDetector.__check_cuda_availability() @@ -67,10 +67,10 @@ def init() -> FaceAnalysis: warnings.filterwarnings('ignore', category=UserWarning, module='onnxruntime') warnings.filterwarnings('ignore', category=FutureWarning, module='insightface') - face_app = FaceDetector.__init_face_app(model_root, providers) + face_app = FaceDetector.__init_face_app(model_root, providers, det_thresh) FaceDetector.__verify_active_providers(face_app) - FaceDetector.__print_init_success(model_root) + FaceDetector.__print_init_success(model_root, det_thresh) return face_app @staticmethod @@ -156,8 +156,10 @@ def __build_providers_config() -> List[Tuple[str, Dict[str, Any]]]: def __init_face_app( model_root: str, providers: List[Tuple[str, Dict[str, Any]]], + det_thresh_override: Optional[float], ) -> FaceAnalysis: model_name = settings.face_recognition.model_name + 
det_thresh = det_thresh_override if det_thresh_override is not None else settings.character.face_detection_threshold console.print(f'[cyan]Loading {model_name} face detection model (GPU-only)...[/cyan]') try: @@ -165,7 +167,7 @@ def __init_face_app( face_app.prepare( ctx_id=0, det_size=settings.face_recognition.detection_size, - det_thresh=settings.character.face_detection_threshold, + det_thresh=det_thresh, ) return face_app except Exception as e: @@ -183,10 +185,10 @@ def __verify_active_providers(face_app: FaceAnalysis) -> None: raise RuntimeError('CUDA required but not available for face detection') @staticmethod - def __print_init_success(model_root: str) -> None: + def __print_init_success(model_root: str, det_thresh_override: Optional[float]) -> None: model_name = settings.face_recognition.model_name det_size = settings.face_recognition.detection_size - det_thresh = settings.character.face_detection_threshold + det_thresh = det_thresh_override if det_thresh_override is not None else settings.character.face_detection_threshold console.print(f'[green]Face detection initialized ({model_name})[/green]') console.print('[dim] Device: GPU (CUDA)[/dim]') diff --git a/preprocessor/services/validation/episode_stats.py b/preprocessor/services/validation/episode_stats.py index ba7fb3cc0..cd7a04b57 100644 --- a/preprocessor/services/validation/episode_stats.py +++ b/preprocessor/services/validation/episode_stats.py @@ -47,7 +47,6 @@ def collect_stats(self) -> None: from preprocessor.services.validation.validators import ( CharacterValidator, ElasticValidator, - FaceClusterValidator, FrameValidator, ImageHashValidator, ObjectValidator, @@ -59,7 +58,7 @@ def collect_stats(self) -> None: validators = [ TranscriptionValidator(), FrameValidator(), VideoValidator(), SceneValidator(), ImageHashValidator(), CharacterValidator(), - FaceClusterValidator(), ObjectValidator(), ElasticValidator(), + ObjectValidator(), ElasticValidator(), ] for validator in validators: diff --git 
a/preprocessor/services/validation/validators/__init__.py b/preprocessor/services/validation/validators/__init__.py index 98c636063..1b986667c 100644 --- a/preprocessor/services/validation/validators/__init__.py +++ b/preprocessor/services/validation/validators/__init__.py @@ -1,7 +1,6 @@ from preprocessor.services.validation.validators.base_validator import BaseValidator from preprocessor.services.validation.validators.character_validator import CharacterValidator from preprocessor.services.validation.validators.elastic_validator import ElasticValidator -from preprocessor.services.validation.validators.face_cluster_validator import FaceClusterValidator from preprocessor.services.validation.validators.frame_validator import FrameValidator from preprocessor.services.validation.validators.image_hash_validator import ImageHashValidator from preprocessor.services.validation.validators.object_validator import ObjectValidator @@ -13,7 +12,6 @@ 'BaseValidator', 'CharacterValidator', 'ElasticValidator', - 'FaceClusterValidator', 'FrameValidator', 'ImageHashValidator', 'ObjectValidator', diff --git a/preprocessor/services/validation/validators/face_cluster_validator.py b/preprocessor/services/validation/validators/face_cluster_validator.py deleted file mode 100644 index bbed519fa..000000000 --- a/preprocessor/services/validation/validators/face_cluster_validator.py +++ /dev/null @@ -1,44 +0,0 @@ -from __future__ import annotations - -from typing import ( - Any, - Dict, -) - -from preprocessor.config.settings_instance import settings -from preprocessor.services.io.path_service import PathService -from preprocessor.services.validation.episode_stats import EpisodeStats -from preprocessor.services.validation.validators.base_validator import BaseValidator - - -class FaceClusterValidator(BaseValidator): - def validate(self, stats: EpisodeStats) -> None: - clusters_file = PathService(stats.series_name).get_episode_file_path( - stats.episode_info, 
settings.output_subdirs.face_clusters, - ) - - if not clusters_file.exists(): - return - - if not self._validate_json_with_error(stats, clusters_file, 'Missing face clusters file', 'Invalid face clusters JSON'): - return - - data = self._load_json_safely(clusters_file) - if data: - self.__parse_cluster_stats(stats, data) - - def __parse_cluster_stats(self, stats: EpisodeStats, data: Dict[str, Any]) -> None: - clusters = data.get('clusters', {}) - - if isinstance(clusters, (dict, list)): - stats.face_clusters_count = len(clusters) - items = clusters.values() if isinstance(clusters, dict) else clusters - total_faces = sum(item.get('face_count', 0) for item in items) - else: - self._add_warning(stats, 'Unexpected clusters format in face clustering metadata') - return - - noise_info = data.get('noise', {}) - total_faces += noise_info.get('face_count', 0) - - stats.face_clusters_total_faces = total_faces diff --git a/preprocessor/steps/vision/__init__.py b/preprocessor/steps/vision/__init__.py index f18724d88..5cdd3a929 100644 --- a/preprocessor/steps/vision/__init__.py +++ b/preprocessor/steps/vision/__init__.py @@ -1,8 +1,7 @@ from preprocessor.steps.vision.character_detection_step import CharacterDetectorStep from preprocessor.steps.vision.embeddings_step import VideoEmbeddingStep from preprocessor.steps.vision.emotion_detection_step import EmotionDetectionStep -from preprocessor.steps.vision.face_clustering_step import FaceClusteringStep from preprocessor.steps.vision.image_hashing_step import ImageHashStep from preprocessor.steps.vision.object_detection_step import ObjectDetectionStep -__all__ = ['CharacterDetectorStep', 'EmotionDetectionStep', 'FaceClusteringStep', 'ImageHashStep', 'ObjectDetectionStep', 'VideoEmbeddingStep'] +__all__ = ['CharacterDetectorStep', 'EmotionDetectionStep', 'ImageHashStep', 'ObjectDetectionStep', 'VideoEmbeddingStep'] diff --git a/preprocessor/steps/vision/face_clustering_step.py b/preprocessor/steps/vision/face_clustering_step.py 
deleted file mode 100644 index 5b2712bcb..000000000 --- a/preprocessor/steps/vision/face_clustering_step.py +++ /dev/null @@ -1,159 +0,0 @@ -# pylint: disable=duplicate-code -from pathlib import Path -from typing import ( - Any, - Dict, - List, - Optional, -) - -from insightface.app import FaceAnalysis - -from preprocessor.config.settings_instance import settings -from preprocessor.config.step_configs import FaceClusteringConfig -from preprocessor.core.artifacts import ( - ClusterData, - FrameCollection, -) -from preprocessor.core.base_step import PipelineStep -from preprocessor.core.context import ExecutionContext -from preprocessor.core.output_descriptors import ( - JsonFileOutput, - OutputDescriptor, -) -from preprocessor.services.characters import FaceDetector -from preprocessor.services.characters.face_clusterer import FaceClusterer -from preprocessor.services.io.files import FileOperations - - -class FaceClusteringStep(PipelineStep[FrameCollection, ClusterData, FaceClusteringConfig]): - def __init__(self, config: FaceClusteringConfig) -> None: - super().__init__(config) - self.__face_app: Optional[FaceAnalysis] = None - - @property - def supports_batch_processing(self) -> bool: - return True - - def setup_resources(self, context: ExecutionContext) -> None: - if self.__face_app is None: - context.logger.info('Loading Face Clustering model...') - self.__face_app = FaceDetector.init() - - def teardown_resources(self, context: ExecutionContext) -> None: - if self.__face_app: - context.logger.info('Face Clustering model unloaded') - self.__face_app = None - FaceClusterer.cleanup_gpu_memory() - - def cleanup(self) -> None: - self.__face_app = None - - def execute_batch( - self, input_data: List[FrameCollection], context: ExecutionContext, - ) -> List[ClusterData]: - return self._execute_with_threadpool( - input_data, context, self.config.max_parallel_episodes, self.execute, - ) - - def _process( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> 
ClusterData: - output_path = self._get_cache_path(input_data, context) - face_app = self.__face_app - - frame_files = self.__extract_frame_files(input_data) - if not frame_files: - context.logger.warning(f'No frame files found in {input_data.directory}') - self.__write_empty_output(output_path, input_data, context) - return self.__build_result(input_data, output_path) - - face_data = FaceClusterer.extract_face_embeddings(frame_files, face_app) - if not face_data: - context.logger.warning(f'No faces detected in episode {input_data.episode_id}') - self.__write_empty_output(output_path, input_data, context) - return self.__build_result(input_data, output_path) - - clustering = settings.face_clustering - labels = FaceClusterer.cluster_embeddings( - face_data, clustering.min_cluster_size, clustering.min_samples, - ) - - output_data = FaceClusterer.build_cluster_output( - face_data=face_data, - labels=labels, - save_noise=clustering.save_noise, - episode_id=input_data.episode_id, - series_name=context.series_name, - min_cluster_size=clustering.min_cluster_size, - min_samples=clustering.min_samples, - model_name=settings.face_recognition.model_name, - total_frames=len(frame_files), - ) - FileOperations.atomic_write_json(output_path, output_data) - - return self.__build_result(input_data, output_path) - - def get_output_descriptors(self) -> List[OutputDescriptor]: - return [ - JsonFileOutput( - subdir='clusters/faces', - pattern='{season}/{episode}.json', - min_size_bytes=10, - ), - ] - - def _get_cache_path( - self, input_data: FrameCollection, context: ExecutionContext, - ) -> Path: - return self._resolve_output_path( - 0, context, self.__create_path_variables(input_data), - ) - - def _load_from_cache( - self, cache_path: Path, input_data: FrameCollection, context: ExecutionContext, - ) -> ClusterData: - return self.__build_result(input_data, cache_path) - - def __write_empty_output( - self, - output_path: Path, - input_data: FrameCollection, - context: ExecutionContext, 
- ) -> None: - empty_data: Dict[str, Any] = { - 'episode_id': input_data.episode_id, - 'series_name': context.series_name, - 'statistics': { - 'total_faces_detected': 0, - 'total_clusters': 0, - 'noise_faces': 0, - 'frames_processed': 0, - 'frames_with_faces': 0, - }, - 'clusters': {}, - 'noise': {}, - } - FileOperations.atomic_write_json(output_path, empty_data) - - @staticmethod - def __build_result(input_data: FrameCollection, output_path: Path) -> ClusterData: - return ClusterData( - episode_id=input_data.episode_id, - episode_info=input_data.episode_info, - path=output_path, - ) - - @staticmethod - def __create_path_variables(input_data: FrameCollection) -> Dict[str, str]: - return { - 'season': f'S{input_data.episode_info.season:02d}', - 'episode': input_data.episode_info.episode_code(), - } - - @staticmethod - def __extract_frame_files(input_data: FrameCollection) -> List[Path]: - return sorted([ - f for f in input_data.directory.glob('*.jpg') - if f.is_file() and 'frame_' in f.name - ]) diff --git a/preprocessor/steps/vision/series_face_clustering_step.py b/preprocessor/steps/vision/series_face_clustering_step.py index 8eaeef56c..e0d5e9d79 100644 --- a/preprocessor/steps/vision/series_face_clustering_step.py +++ b/preprocessor/steps/vision/series_face_clustering_step.py @@ -58,11 +58,14 @@ def _process(self, input_data: SourceVideo, context: ExecutionContext) -> Source f"Extracting face embeddings from {len(frame_files)} frames across the series...", ) + clustering = settings.face_clustering face_app = None try: - face_app = FaceDetector.init() + face_app = FaceDetector.init(det_thresh=clustering.min_det_score) face_data = FaceClusterer.extract_face_embeddings( frame_files, face_app, self.config.prefetch_workers, + min_det_score=clustering.min_det_score, + min_face_px=clustering.min_face_px, ) if not face_data: @@ -71,7 +74,6 @@ def _process(self, input_data: SourceVideo, context: ExecutionContext) -> Source context.logger.info(f"Clustering {len(face_data)} 
face embeddings series-wide...") - clustering = settings.face_clustering labels = FaceClusterer.cluster_embeddings( face_data, clustering.min_cluster_size, clustering.min_samples, ) From 439156f6834b8638c291a9626a3db2b497dee0b4 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sun, 29 Mar 2026 15:40:01 +0200 Subject: [PATCH 87/89] Update config.py --- preprocessor/config/config.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index fce3f9729..cb84f8af9 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -168,7 +168,6 @@ class EmbeddingSettings(OutputDirMixin): @dataclass(frozen=True) class FaceRecognitionSettings: - det_thresh: float = 0.55 detection_size: Tuple[int, int] = (1280, 1280) model_name: str = 'buffalo_l' @@ -177,10 +176,10 @@ class FaceRecognitionSettings: class FaceClusteringSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'face_clusters' - min_cluster_size: int = 80 - min_face_px: int = 60 - min_samples: int = 15 - min_det_score: float = 0.55 + min_cluster_size: int = 5 + min_face_px: int = 0 + min_samples: int = 3 + min_det_score: float = 0.0 save_noise: bool = True From 95126623c7939086f57c48282a5cf6d090455ebd Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Mon, 30 Mar 2026 15:54:47 +0200 Subject: [PATCH 88/89] Face clustering: stricter filters, noise & labels Tighten clustering thresholds and improve cluster output handling. Increased min_face_px to 40 and min_det_score to 0.4 to reduce false positives. ClusterFolderManager now collects faces labeled as noise and writes them to an `_noise` folder instead of skipping them. SeriesFaceClusteringStep now imports json and creates empty character label folders from the series' _characters.json (logging the count) to aid downstream workflows; it still writes the cluster index as before. 
--- preprocessor/config/config.py | 4 ++-- .../characters/cluster_folder_manager.py | 9 +++++++-- .../steps/vision/series_face_clustering_step.py | 17 +++++++++++++++++ 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/preprocessor/config/config.py b/preprocessor/config/config.py index cb84f8af9..15cb84c2c 100644 --- a/preprocessor/config/config.py +++ b/preprocessor/config/config.py @@ -177,9 +177,9 @@ class FaceClusteringSettings(OutputDirMixin): OUTPUT_SUBDIR: ClassVar[str] = 'face_clusters' min_cluster_size: int = 5 - min_face_px: int = 0 + min_face_px: int = 40 min_samples: int = 3 - min_det_score: float = 0.0 + min_det_score: float = 0.4 save_noise: bool = True diff --git a/preprocessor/services/characters/cluster_folder_manager.py b/preprocessor/services/characters/cluster_folder_manager.py index cd4ebed12..fe1d7eb92 100644 --- a/preprocessor/services/characters/cluster_folder_manager.py +++ b/preprocessor/services/characters/cluster_folder_manager.py @@ -26,10 +26,12 @@ def create_cluster_folders( logger: Optional[ErrorHandlingLogger] = None, ) -> int: groups: Dict[int, List[Dict[str, Any]]] = defaultdict(list) + noise: List[Dict[str, Any]] = [] for face_info, label in zip(face_data, labels): if int(label) == -1: - continue - groups[int(label)].append(face_info) + noise.append(face_info) + else: + groups[int(label)].append(face_info) sorted_clusters = sorted(groups.items(), key=lambda x: len(x[1]), reverse=True) output_dir.mkdir(parents=True, exist_ok=True) @@ -37,6 +39,9 @@ def create_cluster_folders( for rank, (_, faces) in enumerate(sorted_clusters): ClusterFolderManager.__populate_cluster_dir(output_dir / str(rank), faces) + if noise: + ClusterFolderManager.__populate_cluster_dir(output_dir / '_noise', noise) + cluster_count = len(sorted_clusters) if logger: logger.info(f"Created {cluster_count} cluster folders in {output_dir}") diff --git a/preprocessor/steps/vision/series_face_clustering_step.py 
b/preprocessor/steps/vision/series_face_clustering_step.py index e0d5e9d79..974521f82 100644 --- a/preprocessor/steps/vision/series_face_clustering_step.py +++ b/preprocessor/steps/vision/series_face_clustering_step.py @@ -1,3 +1,4 @@ +import json from pathlib import Path from typing import ( Any, @@ -86,6 +87,7 @@ def _process(self, input_data: SourceVideo, context: ExecutionContext) -> Source ) self.__write_cluster_index(output_dir, context.series_name, cluster_count, face_data, frame_files) + self.__create_character_label_folders(output_dir, context) context.logger.info( f"Series clustering complete: {cluster_count} clusters → {output_dir}", @@ -112,6 +114,21 @@ def __write_cluster_index( } FileOperations.atomic_write_json(output_dir / '_cluster_index.json', index_data) + @staticmethod + def __create_character_label_folders(output_dir: Path, context: ExecutionContext) -> None: + characters_json = context.base_output_dir / f'{context.series_name}_characters.json' + if not characters_json.exists(): + return + with open(characters_json, 'r', encoding='utf-8') as f: + data = json.load(f) + names = [c['name'] for c in data.get('characters', []) if c.get('name')] + for name in names: + folder = output_dir / name + if not folder.exists(): + folder.mkdir(parents=True) + if names: + context.logger.info(f"Created {len(names)} empty character label folders") + @staticmethod def __collect_frame_files(frames_root: Path) -> List[Path]: if not frames_root.exists(): From e3132efd4e1e6d8e5fd751699d5c8f0d2b4e9c98 Mon Sep 17 00:00:00 2001 From: dam2452 <81230036+dam2452@users.noreply.github.com> Date: Sat, 4 Apr 2026 14:00:09 +0200 Subject: [PATCH 89/89] Update cluster_folder_manager.py --- preprocessor/services/characters/cluster_folder_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preprocessor/services/characters/cluster_folder_manager.py b/preprocessor/services/characters/cluster_folder_manager.py index fe1d7eb92..61aad8e4a 100644 --- 
a/preprocessor/services/characters/cluster_folder_manager.py +++ b/preprocessor/services/characters/cluster_folder_manager.py @@ -116,7 +116,7 @@ def get_labeled_folders(cluster_dir: Path) -> Dict[str, Path]: return { d.name: d for d in sorted(cluster_dir.iterdir()) - if d.is_dir() and not d.name.isdigit() + if d.is_dir() and not d.name.isdigit() and not d.name.startswith('_') } @staticmethod