diff --git a/src-tauri/Cargo.lock b/src-tauri/Cargo.lock index 200ebf1c..12659644 100644 --- a/src-tauri/Cargo.lock +++ b/src-tauri/Cargo.lock @@ -17,6 +17,10 @@ dependencies = [ "objc2-core-foundation", "objc2-quartz-core", "ocr-rs", + "oxideav-codec", + "oxideav-container", + "oxideav-core", + "oxideav-sub-image", "rayon", "reqwest 0.12.28", "serde", @@ -3714,6 +3718,44 @@ dependencies = [ "ttf-parser", ] +[[package]] +name = "oxideav-codec" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7d85f9686d29b25a216aae44c9394d8ea232be2f7b00628f69899e0cdbe643c" +dependencies = [ + "oxideav-core", +] + +[[package]] +name = "oxideav-container" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8536c8d8b3a0426e0250a7f4098ad01efe64b771aa375e0b49989cd5d4a0b273" +dependencies = [ + "oxideav-core", +] + +[[package]] +name = "oxideav-core" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "068fc43174e7aa261fab5a399a6bf206ec3daf324e8df7fb17ecbcba2146a663" +dependencies = [ + "bytemuck", + "serde_json", + "thiserror 2.0.18", +] + +[[package]] +name = "oxideav-sub-image" +version = "0.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "670066cd6f07675f4e7cf01008f73cf5634539039e66c2805af3e6180a0e0b81" +dependencies = [ + "oxideav-core", +] + [[package]] name = "pango" version = "0.18.3" diff --git a/src-tauri/Cargo.toml b/src-tauri/Cargo.toml index ec3997bd..b58f1d2a 100644 --- a/src-tauri/Cargo.toml +++ b/src-tauri/Cargo.toml @@ -43,6 +43,10 @@ tiktoken-rs = "0.9" image = "0.25" rayon = "1.10" num_cpus = "1.16" +oxideav-core = "0.1" +oxideav-codec = "0.1" +oxideav-container = "0.1" +oxideav-sub-image = "0.0.6" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "stream", "multipart"] } futures-util = "0.3" zip = "8.2" diff --git 
a/src-tauri/ocr-models/en_PP-OCRv5_mobile_rec_infer.mnn b/src-tauri/ocr-models/en_PP-OCRv5_mobile_rec_infer.mnn new file mode 100644 index 00000000..5e7939b5 Binary files /dev/null and b/src-tauri/ocr-models/en_PP-OCRv5_mobile_rec_infer.mnn differ diff --git a/src-tauri/ocr-models/ppocr_keys_en.txt b/src-tauri/ocr-models/ppocr_keys_en.txt new file mode 100644 index 00000000..7bf3f5fa --- /dev/null +++ b/src-tauri/ocr-models/ppocr_keys_en.txt @@ -0,0 +1,436 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +: +; +< += +> +? +@ +[ +\ +] +_ +` +{ +| +} +^ +~ +© +® +℉ +№ +Ω +℮ +™ +∆ +✓ +✔ +✗ +✘ +✕ +☑ +☒ +● +▪ +▫ +◼ +▶ +◀ +⬆ +¤ +¦ +§ +¨ +ª +« +¬ +¯ +° +² +³ +´ +µ +¶ +¸ +¹ +º +» +¼ +½ +¾ +¿ +× +‐ +‑ +‒ +— +― +‖ +‗ +‘ +’ +‚ +‛ +“ +” +„ +‟ +† +‡ +‣ +․ +… +‧ +‰ +‴ +‵ +‶ +‷ +‸ +‹ +› +※ +‼ +‽ +‾ +− +₤ +₡ +₹ +₽ +₴ +₿ +¢ +€ +£ +¥ +Ⅰ +Ⅱ +Ⅲ +Ⅳ +Ⅴ +Ⅵ +Ⅶ +Ⅷ +Ⅸ +Ⅹ +Ⅺ +Ⅻ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +ⅵ +ⅶ +ⅷ +ⅸ +ⅹ +ⅺ +ⅻ +➀ +➁ +➂ +➃ +➄ +➅ +➆ +➇ +➈ +➉ +➊ +➋ +➌ +➍ +➎ +➏ +➐ +➑ +➒ +➓ +❶ +❷ +❸ +❹ +❺ +❻ +❼ +❽ +❾ +❿ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +↑ +→ +↓ +↕ +← +↔ +⇒ +⇐ +⇔ +∀ +∃ +∄ +∴ +∵ +∝ +∞ +∩ +∪ +∂ +∫ +∬ +∭ +∮ +∯ +∰ +∑ +∏ +√ +∛ +∜ +∱ +∲ +∳ +∶ +∷ +∼ +∖ +∗ +≈ +≠ +≡ +≤ +≥ +⊂ +⊃ +⊥ +⊾ +⊿ +□ +∥ +∋ +ƒ +′ +″ +À +Á +Â +Ã +Ä +Å +Æ +Ç +È +É +Ê +Ë +Ì +Í +Î +Ï +Ð +Ñ +Ò +Ó +Ô +Õ +Ö +Ø +Ù +Ú +Û +Ü +Ý +Þ +à +á +â +ã +ä +å +æ +ç +è +é +ê +ë +ì +í +î +ï +ð +ñ +ò +ó +ô +õ +ö +ø +ù +ú +û +ü +ý +þ +ÿ +Α +Β +Γ +Δ +Ε +Ζ +Η +Θ +Ι +Κ +Λ +Μ +Ν +Ξ +Ο +Π +Ρ +Σ +Τ +Υ +Φ +Χ +Ψ +Ω +α +β +γ +δ +ε +ζ +η +θ +ι +κ +λ +μ +ν +ξ +ο +π +ρ +σ +ς +τ +υ +φ +χ +ψ +ω +Å +ℏ +⌀ +⍺ +⍵ +𝑢 +𝜓 +० +‥ +︽ +﹥ +• +÷ +∕ +∙ +⋅ +· +± +∓ +∟ +∠ +∡ +∢ +℧ +☺ diff --git a/src-tauri/src/commands/mod.rs b/src-tauri/src/commands/mod.rs index 6316bfe4..829c5566 100644 --- a/src-tauri/src/commands/mod.rs +++ b/src-tauri/src/commands/mod.rs @@ -20,6 +20,13 @@ pub(crate) use 
crate::tools::ocr::pipeline as ocr_pipeline; pub(crate) use crate::tools::ocr::preview as ocr_preview; pub(crate) use crate::tools::ocr::subtitles as ocr_subtitles; pub(crate) use crate::tools::power::sleep_inhibit; +pub(crate) use crate::tools::subtitle_ocr::cancel as subtitle_ocr_cancel; +pub(crate) use crate::tools::subtitle_ocr::decode as subtitle_ocr_decode; +pub(crate) use crate::tools::subtitle_ocr::export as subtitle_ocr_export; +pub(crate) use crate::tools::subtitle_ocr::extract as subtitle_ocr_extract; +pub(crate) use crate::tools::subtitle_ocr::import as subtitle_ocr_import; +pub(crate) use crate::tools::subtitle_ocr::ocr as subtitle_ocr_ocr; +pub(crate) use crate::tools::subtitle_ocr::restore as subtitle_ocr_restore; pub(crate) use crate::tools::tokens::count as tokens; pub(crate) use crate::tools::transcode::analysis as transcode_analysis; pub(crate) use crate::tools::transcode::cancel as transcode_cancel; diff --git a/src-tauri/src/lib.rs b/src-tauri/src/lib.rs index dbadfce2..b18f4231 100644 --- a/src-tauri/src/lib.rs +++ b/src-tauri/src/lib.rs @@ -77,6 +77,15 @@ pub fn run() { commands::ocr_pipeline::run_ocr_pipeline, commands::ocr_subtitles::generate_subtitles_from_ocr, commands::ocr_export::export_ocr_subtitles, + commands::subtitle_ocr_import::probe_subtitle_ocr_tracks, + commands::subtitle_ocr_import::resolve_subtitle_ocr_vobsub_pair, + commands::subtitle_ocr_extract::prepare_subtitle_ocr_track, + commands::subtitle_ocr_decode::decode_subtitle_ocr_bitmaps, + commands::subtitle_ocr_ocr::run_subtitle_ocr_pipeline, + commands::subtitle_ocr_restore::collect_missing_subtitle_ocr_bitmap_assets, + commands::subtitle_ocr_restore::restore_subtitle_ocr_bitmap_assets, + commands::subtitle_ocr_export::export_subtitle_ocr_version, + commands::subtitle_ocr_cancel::cancel_subtitle_ocr_operation, commands::ocr_cancel::cancel_ocr_operation, commands::ocr_models::check_ocr_models, // General transcode commands diff --git a/src-tauri/src/shared/validation.rs 
b/src-tauri/src/shared/validation.rs index 8b98d078..99c255f4 100644 --- a/src-tauri/src/shared/validation.rs +++ b/src-tauri/src/shared/validation.rs @@ -2,8 +2,9 @@ use std::path::Path; /// Allowed media file extensions pub(crate) const ALLOWED_MEDIA_EXTENSIONS: &[&str] = &[ - "mkv", "mp4", "avi", "mov", "webm", "m4v", "mks", "mka", "m4a", "mp3", "flac", "wav", "ogg", - "aac", "ac3", "dts", "srt", "ass", "ssa", "vtt", "sub", "sup", "opus", "wma", "eac3", "mxf", + "mkv", "m2ts", "mp4", "avi", "mov", "webm", "m4v", "mks", "mka", "m4a", "mp3", "flac", "wav", + "ogg", "aac", "ac3", "dts", "srt", "ass", "ssa", "vtt", "sub", "sup", "opus", "wma", "eac3", + "mxf", ]; /// Validate that a path exists and is a file with an allowed extension @@ -110,6 +111,27 @@ mod tests { .expect("webm media path should be valid"); } + #[test] + fn validate_media_path_accepts_m2ts_extension() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let file = dir.path().join("clip.M2TS"); + std::fs::write(&file, b"data").expect("failed to create media file"); + + validate_media_path(file.to_string_lossy().as_ref()) + .expect("m2ts media path should be valid"); + } + + #[test] + fn validate_media_path_rejects_vob_extension() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let file = dir.path().join("clip.VOB"); + std::fs::write(&file, b"data").expect("failed to create media file"); + + let error = validate_media_path(file.to_string_lossy().as_ref()) + .expect_err("vob media path should remain unsupported"); + assert!(error.contains("Unsupported file type")); + } + #[test] fn validate_media_path_rejects_unsupported_extension() { let dir = tempfile::tempdir().expect("failed to create tempdir"); diff --git a/src-tauri/src/tools/mod.rs b/src-tauri/src/tools/mod.rs index 2a00cfca..03cbb026 100644 --- a/src-tauri/src/tools/mod.rs +++ b/src-tauri/src/tools/mod.rs @@ -9,6 +9,7 @@ pub(crate) mod mediaflow_api; pub(crate) mod merge; pub(crate) mod ocr; 
pub(crate) mod power; +pub(crate) mod subtitle_ocr; pub(crate) mod tokens; pub(crate) mod transcode; pub(crate) mod transcription; diff --git a/src-tauri/src/tools/ocr/engine.rs b/src-tauri/src/tools/ocr/engine.rs index 613f44aa..fb923769 100644 --- a/src-tauri/src/tools/ocr/engine.rs +++ b/src-tauri/src/tools/ocr/engine.rs @@ -1,6 +1,6 @@ use std::path::{Path, PathBuf}; -use ocr_rs::{Backend, OcrEngine, OcrEngineConfig}; +use ocr_rs::{Backend, DetOptions, OcrEngine, OcrEngineConfig, PrecisionMode}; use tauri::Manager; /// Default OCR models directory (relative to app resources) @@ -10,10 +10,15 @@ pub(super) const DEFAULT_OCR_MODELS_DIR: &str = "ocr-models"; pub(super) const OCR_DET_MODEL: &str = "PP-OCRv5_mobile_det.mnn"; pub(super) const OCR_CHARSET: &str = "ppocr_keys_v5.txt"; +fn get_det_model_for_language(_language: &str) -> &'static str { + OCR_DET_MODEL +} + /// Language to recognition model mapping fn get_rec_model_for_language(language: &str) -> &'static str { match language { - "multi" | "chinese" | "japanese" | "en" => "PP-OCRv5_mobile_rec.mnn", + "en" | "english" => "en_PP-OCRv5_mobile_rec_infer.mnn", + "multi" | "chinese" | "japanese" => "PP-OCRv5_mobile_rec.mnn", "korean" => "korean_PP-OCRv5_mobile_rec_infer.mnn", "latin" => "latin_PP-OCRv5_mobile_rec_infer.mnn", "cyrillic" => "cyrillic_PP-OCRv5_mobile_rec_infer.mnn", @@ -30,6 +35,7 @@ fn get_rec_model_for_language(language: &str) -> &'static str { /// Get charset file for language fn get_charset_for_language(language: &str) -> &'static str { match language { + "en" | "english" => "ppocr_keys_en.txt", "korean" => "ppocr_keys_korean.txt", "latin" => "ppocr_keys_latin.txt", "cyrillic" => "ppocr_keys_cyrillic.txt", @@ -66,7 +72,7 @@ pub(super) fn resolve_ocr_worker_count_for_backend(requested_workers: u32, use_g } } -pub(super) fn resolve_ocr_engine_threads(worker_count: usize) -> i32 { +pub(crate) fn resolve_ocr_engine_threads(worker_count: usize) -> i32 { let physical_cores = num_cpus::get_physical(); 
let fallback_cores = num_cpus::get(); let available_cores = physical_cores.max(fallback_cores).max(1); @@ -76,8 +82,31 @@ pub(super) fn resolve_ocr_engine_threads(worker_count: usize) -> i32 { derived_threads.clamp(minimum_threads, 4) as i32 } +fn ocr_engine_config(use_gpu: bool, engine_threads: i32, enable_parallel: bool) -> OcrEngineConfig { + let backend = if use_gpu { + #[cfg(target_os = "macos")] + { + Backend::Metal + } + #[cfg(not(target_os = "macos"))] + { + Backend::Vulkan + } + } else { + Backend::CPU + }; + + // Keep the mobile detector for throughput, but avoid low-precision inference by default. + OcrEngineConfig::new() + .with_backend(backend) + .with_threads(engine_threads) + .with_precision(PrecisionMode::Normal) + .with_det_options(DetOptions::fast()) + .with_parallel(enable_parallel) +} + /// Create an OCR engine for the given language with specified options. -pub(super) fn create_ocr_engine( +pub(crate) fn create_ocr_engine( models_dir: &Path, language: &str, use_gpu: bool, @@ -85,7 +114,7 @@ pub(super) fn create_ocr_engine( enable_parallel: bool, ) -> Result { // Build model paths - let det_path = models_dir.join(OCR_DET_MODEL); + let det_path = models_dir.join(get_det_model_for_language(language)); let rec_model = get_rec_model_for_language(language); let rec_path = models_dir.join(rec_model); let charset_file = get_charset_for_language(language); @@ -112,29 +141,7 @@ pub(super) fn create_ocr_engine( )); } - // Create OCR engine config based on GPU option - let config = if use_gpu { - #[cfg(target_os = "macos")] - { - OcrEngineConfig::fast() - .with_backend(Backend::Metal) - .with_threads(engine_threads) - .with_parallel(enable_parallel) - } - #[cfg(not(target_os = "macos"))] - { - OcrEngineConfig::fast() - .with_backend(Backend::Vulkan) - .with_threads(engine_threads) - .with_parallel(enable_parallel) - } - } else { - // CPU-only mode: force CPU backend to avoid platform auto-selection issues. 
- OcrEngineConfig::fast() - .with_backend(Backend::CPU) - .with_threads(engine_threads) - .with_parallel(enable_parallel) - }; + let config = ocr_engine_config(use_gpu, engine_threads, enable_parallel); // Create the engine let engine = OcrEngine::new( @@ -149,7 +156,7 @@ pub(super) fn create_ocr_engine( } /// Get the OCR models directory, checking app resources first, then user config -pub(super) fn get_ocr_models_dir(app: &tauri::AppHandle) -> Result { +pub(crate) fn get_ocr_models_dir(app: &tauri::AppHandle) -> Result { // First, check if models are in app resources if let Ok(resource_dir) = app.path().resource_dir() { let models_dir = resource_dir.join(DEFAULT_OCR_MODELS_DIR); @@ -172,16 +179,26 @@ pub(super) fn get_ocr_models_dir(app: &tauri::AppHandle) -> Result Result { + let image = + RgbaImage::from_raw(metadata.width, metadata.height, rgba.to_vec()).ok_or_else(|| { + "Decoded Subtitle OCR bitmap dimensions did not match RGBA data".to_string() + })?; + let image = DynamicImage::ImageRgba8(image); + + let preview_path = write_resized_bitmap_asset( + item_id, + run_id, + metadata, + &image, + PREVIEW_MAX_WIDTH, + PREVIEW_MAX_HEIGHT, + "previews", + )?; + + Ok(DecodedBitmapAssetPaths { preview_path }) +} + +fn write_resized_bitmap_asset( + item_id: &str, + run_id: &str, + metadata: &SubtitleOcrDecodedCue, + image: &DynamicImage, + max_width: u32, + max_height: u32, + variant: &str, +) -> Result { + let resized = if image.width() > max_width || image.height() > max_height { + image.resize(max_width, max_height, FilterType::Triangle) + } else { + image.clone() + }; + let output_dir = subtitle_ocr_bitmap_asset_dir(item_id, run_id, variant); + std::fs::create_dir_all(&output_dir).map_err(|e| { + format!( + "Failed to create Subtitle OCR bitmap asset directory: {}", + e + ) + })?; + let output_path = output_dir.join(format!( + "{}.png", + safe_asset_path_component(&metadata.cache_key) + )); + resized + .save(&output_path) + .map_err(|e| format!("Failed to write 
Subtitle OCR bitmap asset: {}", e))?; + + Ok(output_path.to_string_lossy().to_string()) +} + +fn subtitle_ocr_bitmap_asset_dir(item_id: &str, run_id: &str, variant: &str) -> PathBuf { + std::env::temp_dir() + .join("MediaFlow") + .join("subtitle-ocr") + .join(safe_asset_path_component(item_id)) + .join(safe_asset_path_component(run_id)) + .join(safe_asset_path_component(variant)) +} + +fn safe_asset_path_component(value: &str) -> String { + let sanitized = value + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || ch == '-' || ch == '_' { + ch + } else { + '_' + } + }) + .collect::() + .trim_matches('_') + .to_string(); + + if sanitized.is_empty() { + "subtitle-ocr".to_string() + } else { + sanitized + } +} + +#[cfg(test)] +mod tests { + use super::{ + safe_asset_path_component, subtitle_ocr_bitmap_asset_dir, write_decoded_bitmap_assets, + }; + use crate::tools::subtitle_ocr::SubtitleOcrDecodedCue; + + #[test] + fn safe_asset_path_component_removes_path_separators_and_empty_segments() { + assert_eq!( + safe_asset_path_component("subtitle-ocr:item/../cache:key"), + "subtitle-ocr_item____cache_key" + ); + assert_eq!(safe_asset_path_component(":::"), "subtitle-ocr"); + } + + #[test] + fn subtitle_ocr_bitmap_asset_dir_stays_under_mediaflow_temp_namespace() { + let dir = subtitle_ocr_bitmap_asset_dir("../item", "run/id", "previews"); + let path = dir.to_string_lossy(); + + assert!(path.contains("MediaFlow")); + assert!(path.contains("subtitle-ocr")); + assert!(path.contains("item")); + assert!(path.contains("run_id")); + assert!(path.contains("previews")); + assert!(!path.contains("../")); + } + + #[test] + fn write_decoded_bitmap_assets_populates_only_preview_pngs_under_temp_dir() { + let item_id = "asset-preview-only-item"; + let run_id = format!( + "asset-preview-only-run-{}", + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("system time should be after UNIX epoch") + .as_nanos() + ); + let metadata = SubtitleOcrDecodedCue { + 
cue_id: "cue-1".to_string(), + start_time_ms: 0, + end_time_ms: 1_000, + width: 720, + height: 360, + cache_key: "subtitle-ocr:test/cache".to_string(), + preview_path: None, + }; + let rgba = vec![255; (metadata.width * metadata.height * 4) as usize]; + + let assets = write_decoded_bitmap_assets(item_id, &run_id, &metadata, &rgba) + .expect("bitmap assets should be written"); + let preview = image::open(&assets.preview_path).expect("preview should be readable"); + let preview_path = assets.preview_path; + let run_dir = subtitle_ocr_bitmap_asset_dir(item_id, &run_id, "previews") + .parent() + .expect("preview directory should have a run parent") + .to_path_buf(); + let mut asset_dirs = std::fs::read_dir(&run_dir) + .expect("run directory should be readable") + .map(|entry| { + entry + .expect("asset directory entry should be readable") + .file_name() + .to_string_lossy() + .to_string() + }) + .collect::>(); + asset_dirs.sort(); + + assert!(preview_path.contains("MediaFlow")); + assert!(preview_path.contains("subtitle-ocr")); + assert!(std::path::Path::new(&preview_path).is_file()); + assert_eq!(asset_dirs, vec!["previews".to_string()]); + assert_eq!(preview.width(), metadata.width); + assert_eq!(preview.height(), metadata.height); + + let _ = std::fs::remove_file(preview_path); + let _ = std::fs::remove_dir_all(run_dir); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/cancel.rs b/src-tauri/src/tools/subtitle_ocr/cancel.rs new file mode 100644 index 00000000..2ad8a147 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/cancel.rs @@ -0,0 +1,62 @@ +use crate::shared::process::terminate_process; + +#[tauri::command] +pub(crate) async fn cancel_subtitle_ocr_operation( + item_id: String, + run_id: String, +) -> Result<(), String> { + if let Some(pid) = super::state::mark_cancelled(&item_id, &run_id)? 
{ + terminate_process(pid); + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use serial_test::serial; + + use super::cancel_subtitle_ocr_operation; + + #[tokio::test] + #[serial] + async fn cancel_subtitle_ocr_operation_preserves_registered_outputs_for_owner_cleanup() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("partial.sup"); + std::fs::write(&output, b"partial").expect("failed to write partial output"); + let item_id = "subtitle-item-1".to_string(); + let run_id = "run-1".to_string(); + + let _ = super::super::state::clear_registered_operation(&item_id, &run_id); + let _ = super::super::state::clear_cancelled(&item_id, &run_id); + + super::super::state::begin_operation(&item_id, &run_id).expect("operation should start"); + super::super::state::register_output_paths( + &item_id, + &run_id, + vec![output.to_string_lossy().to_string()], + ) + .expect("output registration should work"); + + cancel_subtitle_ocr_operation(item_id.clone(), run_id.clone()) + .await + .expect("cancel should succeed"); + + assert!(output.exists()); + assert!(super::super::state::is_operation_cancelled( + &item_id, &run_id + )); + assert!(super::super::state::has_registered_operation(&item_id)); + + let paths = super::super::state::take_output_paths(&item_id, &run_id) + .expect("owner should be able to take registered output paths"); + assert_eq!(paths, vec![output.to_string_lossy().to_string()]); + let _ = std::fs::remove_file(&output); + + super::super::state::clear_registered_operation(&item_id, &run_id) + .expect("owner cleanup should clear active operation"); + assert!(!super::super::state::has_registered_operation(&item_id)); + super::super::state::clear_cancelled(&item_id, &run_id) + .expect("test cleanup should clear cancellation flag"); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/decode.rs b/src-tauri/src/tools/subtitle_ocr/decode.rs new file mode 100644 index 00000000..eadf073e --- /dev/null +++ 
b/src-tauri/src/tools/subtitle_ocr/decode.rs @@ -0,0 +1,1098 @@ +use std::collections::VecDeque; +use std::fs::File; +use std::io::Cursor; +use std::path::{Path, PathBuf}; + +use oxideav_core::{Error as OxideavError, Frame, ReadSeek, RuntimeContext, TimeBase, VideoFrame}; + +use crate::shared::validation::validate_media_path; +use crate::tools::subtitle_ocr::SubtitleOcrDecodedCue; + +const DEFAULT_MISSING_CUE_DURATION_MS: u64 = 2_000; +const MIN_NORMALIZED_CUE_DURATION_MS: u64 = 250; +const MAX_NORMALIZED_CUE_DURATION_MS: u64 = 10_000; +const TRANSPARENT_ALPHA_THRESHOLD: u8 = 8; +const LIGHT_PIXEL_LUMA_THRESHOLD: u16 = 192; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub(super) enum BitmapSubtitleSource { + Pgs { + path: PathBuf, + }, + VobSub { + idx_path: PathBuf, + sub_path: PathBuf, + }, +} + +#[derive(Debug, Clone)] +pub(super) struct DecodedBitmapCue { + pub(super) metadata: SubtitleOcrDecodedCue, + pub(super) rgba: Vec, + pub(super) content_hash: u64, +} + +#[tauri::command] +pub(crate) async fn decode_subtitle_ocr_bitmaps( + source_path: String, + idx_path: Option, + sub_path: Option, + item_id: String, + run_id: String, +) -> Result, String> { + if item_id.trim().is_empty() { + return Err("Subtitle OCR item id is required".to_string()); + } + if run_id.trim().is_empty() { + return Err("Subtitle OCR run id is required".to_string()); + } + + let source = + validate_bitmap_subtitle_source(&source_path, idx_path.as_deref(), sub_path.as_deref())?; + super::state::begin_operation(&item_id, &run_id)?; + + let item_id_for_task = item_id.clone(); + let run_id_for_task = run_id.clone(); + let join_result = tokio::task::spawn_blocking(move || { + let mut decoded_metadata = Vec::new(); + decode_bitmap_subtitle_source_with_handler( + &source, + &item_id_for_task, + &run_id_for_task, + |decoded| { + decoded_metadata.push(decoded.metadata); + Ok(()) + }, + )?; + Ok(decoded_metadata) + }) + .await; + + let _ = super::state::clear_registered_operation(&item_id, &run_id); + 
let result = join_result.map_err(|e| format!("Subtitle OCR decode task failed: {}", e))?; + if result.is_ok() { + let _ = super::state::clear_cancelled(&item_id, &run_id); + } + + result +} + +pub(super) fn validate_bitmap_subtitle_source( + source_path: &str, + idx_path: Option<&str>, + sub_path: Option<&str>, +) -> Result { + let idx_path = non_empty_path(idx_path); + let sub_path = non_empty_path(sub_path); + + match (idx_path, sub_path) { + (Some(idx_path), Some(sub_path)) => { + let idx_path = validate_existing_file_with_extension(idx_path, "idx")?; + let sub_path = validate_existing_file_with_extension(sub_path, "sub")?; + ensure_vobsub_pair_matches(&idx_path, &sub_path)?; + return Ok(BitmapSubtitleSource::VobSub { idx_path, sub_path }); + } + (Some(_), None) | (None, Some(_)) => { + return Err("VobSub Subtitle OCR requires both .idx and .sub paths".to_string()); + } + (None, None) => {} + } + + let source = Path::new(source_path); + match lower_extension(source).as_deref() { + Some("sup") => { + validate_media_path(source_path)?; + Ok(BitmapSubtitleSource::Pgs { + path: source.to_path_buf(), + }) + } + Some("idx") => { + let idx_path = validate_existing_file_with_extension(source_path, "idx")?; + let sub_path = sibling_with_extension(source, "sub"); + if !sub_path.exists() { + return Err(format!( + "VobSub .sub sidecar not found: {}", + sub_path.display() + )); + } + validate_existing_file_with_extension(sub_path.to_string_lossy().as_ref(), "sub")?; + ensure_vobsub_pair_matches(&idx_path, &sub_path)?; + Ok(BitmapSubtitleSource::VobSub { idx_path, sub_path }) + } + Some("sub") => { + let sub_path = validate_existing_file_with_extension(source_path, "sub")?; + let idx_path = sibling_with_extension(source, "idx"); + if !idx_path.exists() { + return Err(format!( + "VobSub .idx sidecar not found: {}", + idx_path.display() + )); + } + validate_existing_file_with_extension(idx_path.to_string_lossy().as_ref(), "idx")?; + ensure_vobsub_pair_matches(&idx_path, 
&sub_path)?; + Ok(BitmapSubtitleSource::VobSub { idx_path, sub_path }) + } + Some(ext) => Err(format!("Unsupported Subtitle OCR source type: .{}", ext)), + None => Err("Subtitle OCR source path has no file extension".to_string()), + } +} + +pub(super) fn decode_bitmap_subtitle_source_with_handler( + source: &BitmapSubtitleSource, + item_id: &str, + run_id: &str, + mut handler: F, +) -> Result<(), String> +where + F: FnMut(DecodedBitmapCue) -> Result<(), String>, +{ + decode_bitmap_subtitle_source_with_handler_and_stop( + source, + item_id, + run_id, + &mut handler, + || false, + ) +} + +pub(super) fn decode_bitmap_subtitle_source_with_handler_and_stop( + source: &BitmapSubtitleSource, + item_id: &str, + run_id: &str, + mut handler: F, + mut should_stop: S, +) -> Result<(), String> +where + F: FnMut(DecodedBitmapCue) -> Result<(), String>, + S: FnMut() -> bool, +{ + let mut ctx = RuntimeContext::new(); + oxideav_sub_image::register(&mut ctx); + + let (container_id, source_key, input, missing_palette): ( + &str, + String, + Box, + bool, + ) = match source { + BitmapSubtitleSource::Pgs { path } => { + let file = File::open(path) + .map_err(|e| format!("Failed to open PGS subtitle source: {}", e))?; + ( + "pgs", + path.to_string_lossy().to_string(), + Box::new(file), + false, + ) + } + BitmapSubtitleSource::VobSub { idx_path, sub_path } => { + let idx_text = std::fs::read_to_string(idx_path) + .map_err(|e| format!("Failed to read VobSub .idx source: {}", e))?; + let missing_palette = !has_vobsub_palette_line(&idx_text); + let idx_text = normalize_oxideav_vobsub_idx_text(&idx_text, idx_path); + ( + "vobsub", + format!( + "{}|{}", + idx_path.to_string_lossy(), + sub_path.to_string_lossy() + ), + Box::new(Cursor::new(idx_text.into_bytes())), + missing_palette, + ) + } + }; + + let mut demuxer = ctx + .containers + .open_demuxer(container_id, input, &ctx.codecs) + .map_err(|e| format!("Failed to open Subtitle OCR source: {}", e))?; + let stream = demuxer + .streams() + 
.first() + .ok_or_else(|| "Subtitle OCR source did not contain a subtitle stream".to_string())? + .clone(); + let mut decoder = ctx + .codecs + .first_decoder(&stream.params) + .map_err(|e| format!("Failed to create Subtitle OCR decoder: {}", e))?; + + let mut cue_index = 0usize; + let mut normalizer = StreamingCueTimingNormalizer::new(&mut handler); + loop { + ensure_decode_not_cancelled(item_id, run_id)?; + ensure_decode_not_stopped(&mut should_stop)?; + let packet = match demuxer.next_packet() { + Ok(packet) => packet, + Err(OxideavError::Eof) => break, + Err(error) => return Err(format!("Failed to read Subtitle OCR packet: {}", error)), + }; + ensure_decode_not_stopped(&mut should_stop)?; + + if packet.stream_index != stream.index { + continue; + } + + let start_time_ms = packet + .pts + .map(|pts| timestamp_to_ms(pts, packet.time_base)) + .unwrap_or(0); + let end_time_ms = packet + .duration + .map(|duration| { + start_time_ms.saturating_add(timestamp_to_ms(duration, packet.time_base)) + }) + // OxideAV leaves the final bitmap subtitle packet duration unset when + // the source does not encode a disappearance time. 
+ .unwrap_or(start_time_ms); + + decoder + .send_packet(&packet) + .map_err(|e| format!("Failed to decode Subtitle OCR packet: {}", e))?; + drain_decoder_frames( + decoder.as_mut(), + &mut normalizer, + &mut cue_index, + &mut should_stop, + item_id, + run_id, + &source_key, + start_time_ms, + end_time_ms, + missing_palette, + )?; + } + + ensure_decode_not_stopped(&mut should_stop)?; + decoder + .flush() + .map_err(|e| format!("Failed to flush Subtitle OCR decoder: {}", e))?; + drain_decoder_frames( + decoder.as_mut(), + &mut normalizer, + &mut cue_index, + &mut should_stop, + item_id, + run_id, + &source_key, + 0, + 0, + missing_palette, + )?; + ensure_decode_not_stopped(&mut should_stop)?; + normalizer.finish() +} + +pub(super) fn count_bitmap_subtitle_source_with_stop( + source: &BitmapSubtitleSource, + item_id: &str, + run_id: &str, + mut should_stop: F, +) -> Result +where + F: FnMut() -> bool, +{ + let mut count = 0u32; + decode_bitmap_subtitle_source_with_handler_and_stop( + source, + item_id, + run_id, + |_| { + count = count.saturating_add(1); + Ok(()) + }, + &mut should_stop, + )?; + + Ok(count) +} + +fn ensure_end_after_start(start_time_ms: u64, end_time_ms: u64) -> u64 { + if end_time_ms > start_time_ms { + end_time_ms + } else { + start_time_ms.saturating_add(1) + } +} + +struct StreamingCueTimingNormalizer<'handler, F> +where + F: FnMut(DecodedBitmapCue) -> Result<(), String>, +{ + pending: VecDeque, + previous_positive_duration: Option, + handler: &'handler mut F, +} + +impl StreamingCueTimingNormalizer<'_, F> +where + F: FnMut(DecodedBitmapCue) -> Result<(), String>, +{ + fn new(handler: &mut F) -> StreamingCueTimingNormalizer<'_, F> { + StreamingCueTimingNormalizer { + pending: VecDeque::new(), + previous_positive_duration: None, + handler, + } + } + + fn push(&mut self, cue: DecodedBitmapCue) -> Result<(), String> { + self.flush_ready(Some(cue.metadata.start_time_ms))?; + self.pending.push_back(cue); + self.flush_ready(None) + } + + fn finish(mut 
self) -> Result<(), String> { + while let Some(mut cue) = self.pending.pop_front() { + self.normalize_final_missing_duration(&mut cue); + self.emit(cue)?; + } + + Ok(()) + } + + fn flush_ready(&mut self, later_start_time_ms: Option) -> Result<(), String> { + loop { + let Some(front) = self.pending.front_mut() else { + return Ok(()); + }; + let start_time_ms = front.metadata.start_time_ms; + + if front.metadata.end_time_ms <= start_time_ms { + match later_start_time_ms { + Some(later_start_time_ms) if later_start_time_ms > start_time_ms => { + front.metadata.end_time_ms = + ensure_end_after_start(start_time_ms, later_start_time_ms); + } + _ => return Ok(()), + } + } + + let cue = self + .pending + .pop_front() + .ok_or_else(|| "Subtitle OCR cue timing queue was empty".to_string())?; + self.emit(cue)?; + } + } + + fn normalize_final_missing_duration(&self, cue: &mut DecodedBitmapCue) { + let start_time_ms = cue.metadata.start_time_ms; + if cue.metadata.end_time_ms > start_time_ms { + return; + } + + let duration = self + .previous_positive_duration + .unwrap_or(DEFAULT_MISSING_CUE_DURATION_MS) + .clamp( + MIN_NORMALIZED_CUE_DURATION_MS, + MAX_NORMALIZED_CUE_DURATION_MS, + ); + cue.metadata.end_time_ms = + ensure_end_after_start(start_time_ms, start_time_ms.saturating_add(duration)); + } + + fn emit(&mut self, cue: DecodedBitmapCue) -> Result<(), String> { + let duration = cue + .metadata + .end_time_ms + .saturating_sub(cue.metadata.start_time_ms); + if duration > 0 { + self.previous_positive_duration = Some(duration); + } + + (self.handler)(cue) + } +} + +fn drain_decoder_frames( + decoder: &mut dyn oxideav_core::Decoder, + normalizer: &mut StreamingCueTimingNormalizer<'_, F>, + cue_index: &mut usize, + should_stop: &mut impl FnMut() -> bool, + item_id: &str, + run_id: &str, + source_key: &str, + start_time_ms: u64, + end_time_ms: u64, + missing_palette_vobsub: bool, +) -> Result<(), String> +where + F: FnMut(DecodedBitmapCue) -> Result<(), String>, +{ + loop { + 
ensure_decode_not_cancelled(item_id, run_id)?; + ensure_decode_not_stopped(should_stop)?; + match decoder.receive_frame() { + Ok(Frame::Video(frame)) => { + ensure_decode_not_stopped(should_stop)?; + let cue = decoded_frame_to_cue( + frame, + *cue_index, + item_id, + source_key, + start_time_ms, + end_time_ms, + missing_palette_vobsub, + )?; + ensure_decode_not_stopped(should_stop)?; + *cue_index += 1; + normalizer.push(cue)?; + } + Ok(_) => {} + Err(OxideavError::NeedMore | OxideavError::Eof) => break, + Err(error) => return Err(format!("Failed to receive Subtitle OCR frame: {}", error)), + } + } + + Ok(()) +} + +fn ensure_decode_not_stopped(should_stop: &mut impl FnMut() -> bool) -> Result<(), String> { + if should_stop() { + Err("Subtitle OCR bitmap decode stopped".to_string()) + } else { + Ok(()) + } +} + +fn ensure_decode_not_cancelled(item_id: &str, run_id: &str) -> Result<(), String> { + if super::state::is_operation_cancelled(item_id, run_id) { + Err("Subtitle OCR operation cancelled".to_string()) + } else { + Ok(()) + } +} + +fn decoded_frame_to_cue( + frame: VideoFrame, + cue_index: usize, + item_id: &str, + source_key: &str, + start_time_ms: u64, + end_time_ms: u64, + missing_palette_vobsub: bool, +) -> Result { + let plane = + frame.planes.into_iter().next().ok_or_else(|| { + "Decoded Subtitle OCR frame did not contain an RGBA plane".to_string() + })?; + if plane.stride == 0 || plane.stride % 4 != 0 { + return Err("Decoded Subtitle OCR frame had an invalid RGBA stride".to_string()); + } + if plane.data.len() % plane.stride != 0 { + return Err("Decoded Subtitle OCR frame had incomplete RGBA rows".to_string()); + } + + let width = u32::try_from(plane.stride / 4) + .map_err(|_| "Decoded Subtitle OCR frame width was too large".to_string())?; + let height = u32::try_from(plane.data.len() / plane.stride) + .map_err(|_| "Decoded Subtitle OCR frame height was too large".to_string())?; + let mut rgba = plane.data; + if missing_palette_vobsub { + 
normalize_missing_palette_vobsub_rgba(&mut rgba); + } + let cue_id = format!("{}-cue-{}", item_id, cue_index + 1); + let bitmap_hash = bitmap_content_hash(&rgba); + let cache_key = format!( + "subtitle-ocr:{}:{:016x}", + item_id, + stable_hash64_bytes( + format!( + "{}:{}:{}:{}:{}:{:016x}", + source_key, cue_index, start_time_ms, end_time_ms, width, bitmap_hash + ) + .as_bytes() + ) + ); + + Ok(DecodedBitmapCue { + metadata: SubtitleOcrDecodedCue { + cue_id, + start_time_ms, + end_time_ms, + width, + height, + cache_key, + preview_path: None, + }, + rgba, + content_hash: bitmap_hash, + }) +} + +fn timestamp_to_ms(value: i64, time_base: TimeBase) -> u64 { + let ms = time_base.seconds_of(value) * 1000.0; + if !ms.is_finite() || ms <= 0.0 { + 0 + } else { + ms.round() as u64 + } +} + +fn with_oxideav_idx_path_hint(idx_text: &str, idx_path: &Path) -> String { + if idx_text.lines().any(|line| line.starts_with("# idx-path:")) { + idx_text.to_string() + } else { + format!("# idx-path: {}\n{}", idx_path.display(), idx_text) + } +} + +fn normalize_oxideav_vobsub_idx_text(idx_text: &str, idx_path: &Path) -> String { + let idx_text = with_oxideav_idx_path_hint(idx_text, idx_path); + if has_vobsub_palette_line(&idx_text) { + idx_text + } else { + format!("{}\n{}", fallback_vobsub_palette_line(), idx_text) + } +} + +fn has_vobsub_palette_line(idx_text: &str) -> bool { + idx_text + .lines() + .any(|line| line.trim_start().starts_with("palette:")) +} + +fn fallback_vobsub_palette_line() -> &'static str { + "palette: 000000, 111111, 222222, 333333, 444444, 555555, 666666, 777777, 888888, 999999, aaaaaa, bbbbbb, cccccc, dddddd, eeeeee, ffffff" +} + +fn normalize_missing_palette_vobsub_rgba(rgba: &mut [u8]) { + if !looks_like_light_vobsub_mask_with_transparent_glyphs(rgba) { + return; + } + + for pixel in rgba.chunks_exact_mut(4) { + if pixel[3] <= TRANSPARENT_ALPHA_THRESHOLD { + pixel.copy_from_slice(&[255, 255, 255, 255]); + } else { + pixel.copy_from_slice(&[0, 0, 0, 0]); + } + } 
+} + +fn looks_like_light_vobsub_mask_with_transparent_glyphs(rgba: &[u8]) -> bool { + let mut total = 0usize; + let mut visible = 0usize; + let mut transparent = 0usize; + let mut light_visible = 0usize; + + for pixel in rgba.chunks_exact(4) { + total += 1; + if pixel[3] <= TRANSPARENT_ALPHA_THRESHOLD { + transparent += 1; + continue; + } + + visible += 1; + if pixel_luma(pixel) >= LIGHT_PIXEL_LUMA_THRESHOLD { + light_visible += 1; + } + } + + total > 0 + && transparent > 0 + && visible > transparent + && light_visible.saturating_mul(10) >= visible.saturating_mul(9) +} + +fn pixel_luma(pixel: &[u8]) -> u16 { + ((u16::from(pixel[0]) * 77) + (u16::from(pixel[1]) * 150) + (u16::from(pixel[2]) * 29)) >> 8 +} + +fn validate_existing_file_with_extension(path: &str, extension: &str) -> Result { + let path = Path::new(path); + if !path.exists() { + return Err(format!("File not found: {}", path.display())); + } + if !path.is_file() { + return Err(format!("Not a file: {}", path.display())); + } + match lower_extension(path).as_deref() { + Some(ext) if ext == extension => Ok(path.to_path_buf()), + Some(ext) => Err(format!( + "Expected .{} Subtitle OCR source, got .{}", + extension, ext + )), + None => Err(format!( + "Expected .{} Subtitle OCR source, got path without extension", + extension + )), + } +} + +fn ensure_vobsub_pair_matches(idx_path: &Path, sub_path: &Path) -> Result<(), String> { + if vobsub_paths_match(idx_path, sub_path) { + Ok(()) + } else { + Err(format!( + "VobSub .sub sidecar must match the .idx path: expected {}", + idx_path.with_extension("sub").display() + )) + } +} + +fn vobsub_paths_match(idx_path: &Path, sub_path: &Path) -> bool { + idx_path.parent() == sub_path.parent() + && path_stem_eq_ignore_ascii_case(idx_path, sub_path) + && lower_extension(idx_path).as_deref() == Some("idx") + && lower_extension(sub_path).as_deref() == Some("sub") +} + +fn path_stem_eq_ignore_ascii_case(first: &Path, second: &Path) -> bool { + let Some(first_stem) = 
first.file_stem().and_then(|stem| stem.to_str()) else { + return false; + }; + let Some(second_stem) = second.file_stem().and_then(|stem| stem.to_str()) else { + return false; + }; + + first_stem.eq_ignore_ascii_case(second_stem) +} + +fn sibling_with_extension(path: &Path, extension: &str) -> PathBuf { + let direct = path.with_extension(extension); + find_sibling_with_extension(path, extension).unwrap_or(direct) +} + +fn find_sibling_with_extension(path: &Path, extension: &str) -> Option { + let parent = path.parent()?; + let stem = path.file_stem()?.to_string_lossy(); + + for entry in std::fs::read_dir(parent).ok()?.flatten() { + let candidate = entry.path(); + let Some(candidate_stem) = candidate.file_stem() else { + continue; + }; + let Some(candidate_extension) = candidate.extension() else { + continue; + }; + let candidate_stem = candidate_stem.to_string_lossy(); + let candidate_extension = candidate_extension.to_string_lossy(); + if candidate_stem.eq_ignore_ascii_case(&stem) + && candidate_extension.eq_ignore_ascii_case(extension) + { + return Some(candidate); + } + } + + None +} + +fn lower_extension(path: &Path) -> Option { + path.extension() + .and_then(|extension| extension.to_str()) + .map(|extension| extension.to_ascii_lowercase()) +} + +fn non_empty_path(path: Option<&str>) -> Option<&str> { + path.map(str::trim).filter(|path| !path.is_empty()) +} + +pub(super) fn bitmap_content_hash(rgba: &[u8]) -> u64 { + stable_hash64_bytes(rgba) +} + +pub(super) fn is_empty_subtitle_bitmap_rgba(rgba: &[u8]) -> bool { + if rgba.is_empty() { + return true; + } + + let mut pixels = rgba.chunks_exact(4); + if !pixels.remainder().is_empty() { + return false; + } + + let Some(first_pixel) = pixels.next() else { + return true; + }; + let mut is_uniform = true; + let mut is_transparent = first_pixel[3] <= TRANSPARENT_ALPHA_THRESHOLD; + + for pixel in pixels { + if pixel != first_pixel { + is_uniform = false; + } + if pixel[3] > TRANSPARENT_ALPHA_THRESHOLD { + 
is_transparent = false; + } + if !is_uniform && !is_transparent { + return false; + } + } + + is_uniform || is_transparent +} + +fn stable_hash64_bytes(bytes: &[u8]) -> u64 { + const FNV_OFFSET_BASIS: u64 = 0xcbf29ce484222325; + const FNV_PRIME: u64 = 0x100000001b3; + + let mut hash = FNV_OFFSET_BASIS; + for byte in bytes { + hash ^= *byte as u64; + hash = hash.wrapping_mul(FNV_PRIME); + } + hash +} + +#[cfg(test)] +mod tests { + use std::cell::Cell; + + use super::{ + BitmapSubtitleSource, DecodedBitmapCue, StreamingCueTimingNormalizer, + TRANSPARENT_ALPHA_THRESHOLD, bitmap_content_hash, + decode_bitmap_subtitle_source_with_handler, + decode_bitmap_subtitle_source_with_handler_and_stop, is_empty_subtitle_bitmap_rgba, + validate_bitmap_subtitle_source, + }; + use crate::tools::subtitle_ocr::SubtitleOcrDecodedCue; + + fn decoded_cue(cue_id: &str, start_time_ms: u64, end_time_ms: u64) -> DecodedBitmapCue { + DecodedBitmapCue { + metadata: SubtitleOcrDecodedCue { + cue_id: cue_id.to_string(), + start_time_ms, + end_time_ms, + width: 2, + height: 2, + cache_key: format!("cache-{cue_id}"), + preview_path: None, + }, + rgba: Vec::new(), + content_hash: bitmap_content_hash(&[]), + } + } + + #[test] + fn empty_bitmap_detection_accepts_empty_and_uniform_rgba() { + assert!(is_empty_subtitle_bitmap_rgba(&[])); + assert!(is_empty_subtitle_bitmap_rgba(&[ + 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, + ])); + } + + #[test] + fn empty_bitmap_detection_accepts_quasi_transparent_pixels() { + assert!(is_empty_subtitle_bitmap_rgba(&[ + 255, + 255, + 255, + 0, + 32, + 64, + 128, + TRANSPARENT_ALPHA_THRESHOLD, + ])); + } + + #[test] + fn empty_bitmap_detection_rejects_non_uniform_visible_pixels() { + assert!(!is_empty_subtitle_bitmap_rgba(&[ + 0, 0, 0, 255, 255, 255, 255, 255, + ])); + } + + #[test] + fn empty_bitmap_detection_rejects_black_glyph_on_transparent_background() { + assert!(!is_empty_subtitle_bitmap_rgba(&[ + 0, + 0, + 0, + 0, + 0, + 0, + 0, + TRANSPARENT_ALPHA_THRESHOLD, + 
0, + 0, + 0, + 255, + ])); + } + + #[test] + fn bitmap_content_hash_changes_with_bitmap_content() { + let first = bitmap_content_hash(&[0, 0, 0, 255]); + let duplicate = bitmap_content_hash(&[0, 0, 0, 255]); + let different = bitmap_content_hash(&[255, 255, 255, 255]); + + assert_eq!(first, duplicate); + assert_ne!(first, different); + } + + fn select_pattern_palette_entry(spu: &mut [u8], palette_entry: u8) { + let command_offset = spu + .windows(4) + .position(|window| window == [0x03, 0x01, 0x32, 0x04]) + .expect("demo SPU should contain a palette select command"); + spu[command_offset + 1] = palette_entry & 0x0f; + } + + #[test] + fn validate_bitmap_subtitle_source_accepts_standalone_sup() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let sup = dir.path().join("track.sup"); + std::fs::write(&sup, b"PG").expect("failed to write sup"); + + let source = validate_bitmap_subtitle_source(sup.to_string_lossy().as_ref(), None, None) + .expect("sup source should be valid"); + + assert!(matches!(source, BitmapSubtitleSource::Pgs { .. })); + } + + #[test] + fn validate_bitmap_subtitle_source_accepts_vobsub_pair() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("track.idx"); + let sub = dir.path().join("track.sub"); + std::fs::write(&idx, b"# VobSub index file").expect("failed to write idx"); + std::fs::write(&sub, b"sub").expect("failed to write sub"); + + let source = validate_bitmap_subtitle_source( + idx.to_string_lossy().as_ref(), + Some(idx.to_string_lossy().as_ref()), + Some(sub.to_string_lossy().as_ref()), + ) + .expect("vobsub source should be valid"); + + assert!(matches!(source, BitmapSubtitleSource::VobSub { .. 
})); + } + + #[test] + fn validate_bitmap_subtitle_source_accepts_uppercase_vobsub_pair() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("Movie.IDX"); + let sub = dir.path().join("Movie.SUB"); + std::fs::write(&idx, b"# VobSub index file").expect("failed to write idx"); + std::fs::write(&sub, b"sub").expect("failed to write sub"); + + let source = validate_bitmap_subtitle_source( + idx.to_string_lossy().as_ref(), + Some(idx.to_string_lossy().as_ref()), + Some(sub.to_string_lossy().as_ref()), + ) + .expect("uppercase VobSub pair should be valid"); + + assert!(matches!(source, BitmapSubtitleSource::VobSub { .. })); + } + + #[test] + fn validate_bitmap_subtitle_source_rejects_missing_vobsub_sidecar() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("track.idx"); + std::fs::write(&idx, b"# VobSub index file").expect("failed to write idx"); + + let error = validate_bitmap_subtitle_source(idx.to_string_lossy().as_ref(), None, None) + .expect_err("missing sub sidecar should fail"); + + assert!(error.contains("VobSub .sub sidecar not found")); + } + + #[test] + fn streaming_timing_normalizer_uses_later_start_for_middle_missing_duration() { + let mut emitted = Vec::new(); + let mut handler = |cue| { + emitted.push(cue); + Ok(()) + }; + let mut normalizer = StreamingCueTimingNormalizer::new(&mut handler); + + normalizer + .push(decoded_cue("cue-1", 0, 1_000)) + .expect("first cue should push"); + normalizer + .push(decoded_cue("cue-2", 1_500, 1_500)) + .expect("second cue should push"); + normalizer + .push(decoded_cue("cue-3", 2_500, 3_000)) + .expect("third cue should push"); + normalizer.finish().expect("normalizer should finish"); + + assert_eq!(emitted[1].metadata.end_time_ms, 2_500); + } + + #[test] + fn streaming_timing_normalizer_uses_previous_duration_for_final_missing_duration() { + let mut emitted = Vec::new(); + let mut handler = |cue| { + emitted.push(cue); + 
Ok(()) + }; + let mut normalizer = StreamingCueTimingNormalizer::new(&mut handler); + + normalizer + .push(decoded_cue("cue-1", 0, 1_000)) + .expect("first cue should push"); + normalizer + .push(decoded_cue("cue-2", 3_000, 3_000)) + .expect("second cue should push"); + normalizer.finish().expect("normalizer should finish"); + + assert_eq!(emitted[1].metadata.end_time_ms, 4_000); + } + + #[test] + fn decode_bitmap_subtitle_source_with_handler_decodes_vobsub_demo_spu() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("demo.idx"); + let sub = dir.path().join("demo.sub"); + let spu = oxideav_sub_image::vobsub::build_demo_spu(2, 2, &[1, 1, 1, 1]); + std::fs::write(&sub, spu).expect("failed to write sub"); + std::fs::write( + &idx, + "\ +# VobSub index file +size: 2x2 +palette: ff0000, 00ff00, 0000ff, ffffff, 000000, 808080, c0c0c0, 404040, 200020, 800080, a0a0a0, 010203, 040506, 070809, 0a0b0c, 0d0e0f +timestamp: 00:00:01:500, filepos: 000000000 +", + ) + .expect("failed to write idx"); + let source = BitmapSubtitleSource::VobSub { + idx_path: idx, + sub_path: sub, + }; + let mut decoded_metadata = Vec::new(); + + decode_bitmap_subtitle_source_with_handler(&source, "demo-item", "demo-run", |decoded| { + assert_eq!(decoded.metadata.width, 2); + assert_eq!(decoded.metadata.height, 2); + assert_eq!(decoded.metadata.start_time_ms, 1_500); + assert!(decoded.metadata.end_time_ms > decoded.metadata.start_time_ms); + assert_eq!(decoded.rgba.len(), 16); + decoded_metadata.push(decoded.metadata); + Ok(()) + }) + .expect("demo VobSub fixture should decode"); + + assert_eq!(decoded_metadata.len(), 1); + assert_eq!(decoded_metadata[0].end_time_ms, 3_500); + } + + #[test] + fn decode_bitmap_subtitle_source_with_handler_decodes_vobsub_without_palette_as_visible_rgba() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("demo.idx"); + let sub = dir.path().join("demo.sub"); + let spu = 
oxideav_sub_image::vobsub::build_demo_spu(2, 2, &[1, 1, 1, 1]); + std::fs::write(&sub, spu).expect("failed to write sub"); + std::fs::write( + &idx, + "\ +# VobSub index file +size: 2x2 +timestamp: 00:00:01:500, filepos: 000000000 +", + ) + .expect("failed to write idx"); + let source = BitmapSubtitleSource::VobSub { + idx_path: idx, + sub_path: sub, + }; + let mut decoded_cues = Vec::new(); + + decode_bitmap_subtitle_source_with_handler(&source, "demo-item", "demo-run", |decoded| { + decoded_cues.push(decoded); + Ok(()) + }) + .expect("no-palette VobSub fixture should decode"); + + assert_eq!(decoded_cues.len(), 1); + assert!( + decoded_cues[0] + .rgba + .chunks_exact(4) + .any(|pixel| { pixel[3] > 0 && (pixel[0] > 0 || pixel[1] > 0 || pixel[2] > 0) }) + ); + } + + #[test] + fn decode_vobsub_without_palette_normalizes_transparent_glyphs() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("demo.idx"); + let sub = dir.path().join("demo.sub"); + let indices = [1, 1, 1, 1, 0, 1, 1, 1, 1]; + let mut spu = oxideav_sub_image::vobsub::build_demo_spu(3, 3, &indices); + select_pattern_palette_entry(&mut spu, 0x0f); + std::fs::write(&sub, spu).expect("failed to write sub"); + std::fs::write( + &idx, + "\ +# VobSub index file +size: 3x3 +timestamp: 00:00:01:500, filepos: 000000000 +", + ) + .expect("failed to write idx"); + let source = BitmapSubtitleSource::VobSub { + idx_path: idx, + sub_path: sub, + }; + let mut decoded_cues = Vec::new(); + + decode_bitmap_subtitle_source_with_handler(&source, "demo-item", "demo-run", |decoded| { + decoded_cues.push(decoded); + Ok(()) + }) + .expect("no-palette VobSub fixture should decode"); + + assert_eq!(decoded_cues.len(), 1); + let rgba = &decoded_cues[0].rgba; + assert_eq!(&rgba[0..4], &[0, 0, 0, 0]); + assert_eq!(&rgba[16..20], &[255, 255, 255, 255]); + } + + #[test] + fn decode_bitmap_subtitle_source_with_handler_and_stop_observes_stop_between_frames() { + let dir = 
tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("demo.idx"); + let sub = dir.path().join("demo.sub"); + let first_spu = oxideav_sub_image::vobsub::build_demo_spu(2, 2, &[1, 1, 1, 1]); + let second_offset = first_spu.len(); + let third_offset = second_offset + first_spu.len(); + let mut sub_bytes = Vec::new(); + sub_bytes.extend_from_slice(&first_spu); + sub_bytes.extend_from_slice(&first_spu); + sub_bytes.extend_from_slice(&first_spu); + std::fs::write(&sub, sub_bytes).expect("failed to write sub"); + std::fs::write( + &idx, + format!( + "\ +# VobSub index file +size: 2x2 +palette: ff0000, 00ff00, 0000ff, ffffff, 000000, 808080, c0c0c0, 404040, 200020, 800080, a0a0a0, 010203, 040506, 070809, 0a0b0c, 0d0e0f +timestamp: 00:00:01:000, filepos: 000000000 +timestamp: 00:00:02:000, filepos: {second_offset:09x} +timestamp: 00:00:03:000, filepos: {third_offset:09x} +" + ), + ) + .expect("failed to write idx"); + let source = BitmapSubtitleSource::VobSub { + idx_path: idx, + sub_path: sub, + }; + let stop = Cell::new(false); + let mut decoded_metadata = Vec::new(); + + let error = decode_bitmap_subtitle_source_with_handler_and_stop( + &source, + "demo-item", + "demo-run", + |decoded| { + decoded_metadata.push(decoded.metadata); + stop.set(true); + Ok(()) + }, + || stop.get(), + ) + .expect_err("stop flag should interrupt the decode stream"); + + assert!(error.contains("stopped")); + assert_eq!(decoded_metadata.len(), 1); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/export.rs b/src-tauri/src/tools/subtitle_ocr/export.rs new file mode 100644 index 00000000..8c17adae --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/export.rs @@ -0,0 +1,409 @@ +use crate::shared::validation::validate_output_path; +use crate::tools::subtitle_ocr::{SubtitleOcrCue, SubtitleOcrPlacement}; + +enum SubtitleOcrExportFormat { + Ass, + Srt, + Vtt, +} + +#[tauri::command] +pub(crate) async fn export_subtitle_ocr_version( + cues: Vec, + output_path: String, 
+ format: String, +) -> Result<(), String> { + validate_output_path(&output_path)?; + let format = parse_export_format(&format)?; + let cues = validated_sorted_nonblank_cues(&cues)?; + + let content = match format { + SubtitleOcrExportFormat::Ass => format_ass(&cues, 1920, 1080), + SubtitleOcrExportFormat::Srt => format_srt(&cues), + SubtitleOcrExportFormat::Vtt => format_vtt(&cues), + }; + + std::fs::write(&output_path, content) + .map_err(|e| format!("Failed to write Subtitle OCR export: {}", e))?; + + Ok(()) +} + +fn parse_export_format(format: &str) -> Result { + match format { + "ass" => Ok(SubtitleOcrExportFormat::Ass), + "srt" => Ok(SubtitleOcrExportFormat::Srt), + "vtt" => Ok(SubtitleOcrExportFormat::Vtt), + _ => Err(format!( + "Unsupported Subtitle OCR export format: {}", + format + )), + } +} + +fn validated_sorted_nonblank_cues(cues: &[SubtitleOcrCue]) -> Result, String> { + for cue in cues { + if cue.end_time_ms <= cue.start_time_ms { + return Err(format!( + "Invalid Subtitle OCR cue time range for cue {}", + cue.id + )); + } + } + + let mut sorted = cues + .iter() + .filter(|cue| !cue.text.trim().is_empty()) + .collect::>(); + + sorted.sort_by(|a, b| { + a.start_time_ms + .cmp(&b.start_time_ms) + .then_with(|| a.end_time_ms.cmp(&b.end_time_ms)) + .then_with(|| a.id.cmp(&b.id)) + }); + + Ok(sorted) +} + +fn format_srt(cues: &[&SubtitleOcrCue]) -> String { + cues.iter() + .enumerate() + .map(|(index, cue)| { + format!( + "{}\n{} --> {}\n{}\n", + index + 1, + format_srt_time(cue.start_time_ms), + format_srt_time(cue.end_time_ms), + cue.text + ) + }) + .collect::>() + .join("\n") +} + +fn format_vtt(cues: &[&SubtitleOcrCue]) -> String { + let mut output = String::from("WEBVTT\n\n"); + + for cue in cues { + output.push_str(&format!( + "{} --> {}\n{}\n\n", + format_vtt_time(cue.start_time_ms), + format_vtt_time(cue.end_time_ms), + cue.text + )); + } + + output +} + +fn format_ass(cues: &[&SubtitleOcrCue], width: u32, height: u32) -> String { + let events = 
cues + .iter() + .map(|cue| { + let text = format_ass_cue_text(cue); + format!( + "Dialogue: 0,{},{},Default,,0,0,0,,{}", + format_ass_time(cue.start_time_ms), + format_ass_time(cue.end_time_ms), + text + ) + }) + .collect::>() + .join("\n"); + + [ + "[Script Info]".to_string(), + "ScriptType: v4.00+".to_string(), + format!("PlayResX: {}", width), + format!("PlayResY: {}", height), + String::new(), + "[V4+ Styles]".to_string(), + "Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding".to_string(), + "Style: Default,Arial,48,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,0,2,20,20,40,1".to_string(), + String::new(), + "[Events]".to_string(), + "Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text".to_string(), + events, + ] + .join("\n") +} + +fn format_ass_cue_text(cue: &SubtitleOcrCue) -> String { + let text = format_ass_text(&cue.text); + if cue.placement == Some(SubtitleOcrPlacement::Top) { + format!("{{\\an8}}{}", text) + } else { + text + } +} + +fn format_ass_text(text: &str) -> String { + text.replace("\r\n", "\n") + .replace('\r', "\n") + .replace('\\', "\\\\") + .replace('{', "\\{") + .replace('}', "\\}") + .replace('\n', "\\N") +} + +fn format_srt_time(ms: u64) -> String { + let hours = ms / 3_600_000; + let minutes = (ms % 3_600_000) / 60_000; + let seconds = (ms % 60_000) / 1000; + let millis = ms % 1000; + + format!("{:02}:{:02}:{:02},{:03}", hours, minutes, seconds, millis) +} + +fn format_vtt_time(ms: u64) -> String { + let hours = ms / 3_600_000; + let minutes = (ms % 3_600_000) / 60_000; + let seconds = (ms % 60_000) / 1000; + let millis = ms % 1000; + + format!("{:02}:{:02}:{:02}.{:03}", hours, minutes, seconds, millis) +} + +fn format_ass_time(ms: u64) -> String { + let hours = ms / 3_600_000; + let minutes = (ms % 
3_600_000) / 60_000; + let seconds = (ms % 60_000) / 1000; + let centiseconds = (ms % 1000) / 10; + + format!( + "{}:{:02}:{:02}.{:02}", + hours, minutes, seconds, centiseconds + ) +} + +#[cfg(test)] +mod tests { + use super::export_subtitle_ocr_version; + use crate::tools::subtitle_ocr::{SubtitleOcrCue, SubtitleOcrPlacement}; + + fn cue(id: &str, start_time_ms: u64, end_time_ms: u64, text: &str) -> SubtitleOcrCue { + SubtitleOcrCue { + id: id.to_string(), + source_cue_ids: vec!["raw-1".to_string()], + start_time_ms, + end_time_ms, + text: text.to_string(), + confidence: 0.9, + placement: Some(SubtitleOcrPlacement::Bottom), + placement_source_count: Some(1), + top_placement_source_count: Some(0), + } + } + + fn top_cue(id: &str, start_time_ms: u64, end_time_ms: u64, text: &str) -> SubtitleOcrCue { + SubtitleOcrCue { + placement: Some(SubtitleOcrPlacement::Top), + placement_source_count: Some(1), + top_placement_source_count: Some(1), + ..cue(id, start_time_ms, end_time_ms, text) + } + } + + fn multiline_cue() -> SubtitleOcrCue { + cue("cue-1", 1_000, 2_500, "- Stop.\n- I cannot.") + } + + #[tokio::test] + async fn srt_preserves_real_line_breaks() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.srt"); + + export_subtitle_ocr_version( + vec![multiline_cue()], + output.to_string_lossy().to_string(), + "srt".to_string(), + ) + .await + .expect("export should succeed"); + + let content = std::fs::read_to_string(output).expect("failed to read export"); + assert!(content.contains("- Stop.\n- I cannot.")); + } + + #[tokio::test] + async fn vtt_preserves_real_line_breaks() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.vtt"); + + export_subtitle_ocr_version( + vec![multiline_cue()], + output.to_string_lossy().to_string(), + "vtt".to_string(), + ) + .await + .expect("export should succeed"); + + let content = 
std::fs::read_to_string(output).expect("failed to read export"); + assert!(content.contains("- Stop.\n- I cannot.")); + } + + #[tokio::test] + async fn ass_serializes_line_breaks_as_ass_line_breaks() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.ass"); + + export_subtitle_ocr_version( + vec![multiline_cue()], + output.to_string_lossy().to_string(), + "ass".to_string(), + ) + .await + .expect("export should succeed"); + + let content = std::fs::read_to_string(output).expect("failed to read export"); + assert!(content.contains("- Stop.\\N- I cannot.")); + } + + #[tokio::test] + async fn ass_prefixes_top_cues_with_alignment_override() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.ass"); + + export_subtitle_ocr_version( + vec![ + top_cue("top", 1_000, 2_000, "Top line"), + cue("bottom", 3_000, 4_000, "Bottom line"), + ], + output.to_string_lossy().to_string(), + "ass".to_string(), + ) + .await + .expect("export should succeed"); + + let content = std::fs::read_to_string(output).expect("failed to read export"); + assert!(content.contains(r"{\an8}Top line")); + assert!(content.contains(",,Bottom line")); + assert!(!content.contains(r"{\an8}Bottom line")); + } + + #[tokio::test] + async fn srt_and_vtt_ignore_top_alignment() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let srt_output = dir.path().join("subtitle-ocr.srt"); + let vtt_output = dir.path().join("subtitle-ocr.vtt"); + + export_subtitle_ocr_version( + vec![top_cue("top", 1_000, 2_000, "Top line")], + srt_output.to_string_lossy().to_string(), + "srt".to_string(), + ) + .await + .expect("srt export should succeed"); + export_subtitle_ocr_version( + vec![top_cue("top", 1_000, 2_000, "Top line")], + vtt_output.to_string_lossy().to_string(), + "vtt".to_string(), + ) + .await + .expect("vtt export should succeed"); + + let srt = 
std::fs::read_to_string(srt_output).expect("failed to read srt export"); + let vtt = std::fs::read_to_string(vtt_output).expect("failed to read vtt export"); + assert!(!srt.contains(r"{\an8}")); + assert!(!vtt.contains(r"{\an8}")); + } + + #[tokio::test] + async fn invalid_time_range_rejects_with_cue_id() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.srt"); + + let error = export_subtitle_ocr_version( + vec![cue("bad-cue", 2_000, 2_000, "Bad range")], + output.to_string_lossy().to_string(), + "srt".to_string(), + ) + .await + .expect_err("invalid timing should fail"); + + assert!(error.contains("bad-cue")); + } + + #[tokio::test] + async fn whitespace_only_cue_is_omitted_and_numbering_remains_correct() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.srt"); + + export_subtitle_ocr_version( + vec![ + cue("blank", 0, 500, " \n\t "), + cue("visible", 1_000, 2_000, "Visible"), + ], + output.to_string_lossy().to_string(), + "srt".to_string(), + ) + .await + .expect("export should succeed"); + + let content = std::fs::read_to_string(output).expect("failed to read export"); + assert!(content.starts_with("1\n00:00:01,000 --> 00:00:02,000\nVisible")); + assert!(!content.contains("2\n")); + } + + #[tokio::test] + async fn unsorted_input_exports_chronologically() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.vtt"); + + export_subtitle_ocr_version( + vec![ + cue("later", 2_000, 3_000, "Later"), + cue("earlier", 500, 1_000, "Earlier"), + ], + output.to_string_lossy().to_string(), + "vtt".to_string(), + ) + .await + .expect("export should succeed"); + + let content = std::fs::read_to_string(output).expect("failed to read export"); + let earlier_index = content.find("Earlier").expect("earlier cue should exist"); + let later_index = content.find("Later").expect("later cue should 
exist"); + assert!(earlier_index < later_index); + } + + #[tokio::test] + async fn unsupported_format_rejects() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.txt"); + + let error = export_subtitle_ocr_version( + vec![multiline_cue()], + output.to_string_lossy().to_string(), + "txt".to_string(), + ) + .await + .expect_err("unsupported format should fail"); + + assert_eq!(error, "Unsupported Subtitle OCR export format: txt"); + } + + #[tokio::test] + async fn ass_escapes_braces_backslashes_and_line_breaks() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let output = dir.path().join("subtitle-ocr.ass"); + + export_subtitle_ocr_version( + vec![cue( + "escaped", + 1_000, + 2_000, + r"Path C:\Temp\{file} +Next", + )], + output.to_string_lossy().to_string(), + "ass".to_string(), + ) + .await + .expect("export should succeed"); + + let content = std::fs::read_to_string(output).expect("failed to read export"); + assert!(content.contains(r"Path C:\\Temp\\\{file\}\NNext")); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/extract.rs b/src-tauri/src/tools/subtitle_ocr/extract.rs new file mode 100644 index 00000000..7471fef8 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/extract.rs @@ -0,0 +1,327 @@ +use std::path::{Path, PathBuf}; +use std::process::Stdio; +use std::time::Duration; + +use tauri::Emitter; + +use crate::shared::process::{terminate_process, tokio_command, wait_with_output_timeout}; +use crate::shared::sleep_inhibit::SleepInhibitGuard; +use crate::shared::store::resolve_ffmpeg_path; +use crate::shared::validation::{validate_media_path, validate_output_path}; +use crate::tools::subtitle_ocr::progress::SubtitleOcrProgressEvent; + +const SUBTITLE_OCR_EXTRACT_TIMEOUT: Duration = Duration::from_secs(300); +const SUBTITLE_OCR_CANCELLED: &str = "Subtitle OCR operation cancelled"; +const VOBSUB_CONTAINER_EXTRACTION_UNSUPPORTED: &str = "Container VobSub extraction is not 
supported by the bundled FFmpeg path. Import the .idx/.sub pair directly."; + +#[tauri::command] +pub(crate) async fn prepare_subtitle_ocr_track( + app: tauri::AppHandle, + input_path: String, + stream_index: u32, + codec: String, + item_id: String, + run_id: String, +) -> Result { + validate_media_path(&input_path)?; + validate_item_id(&item_id)?; + validate_run_id(&run_id)?; + ensure_container_extraction_supported(&codec)?; + let _sleep_guard = SleepInhibitGuard::try_acquire("Subtitle OCR extraction").ok(); + let ffmpeg_path = resolve_ffmpeg_path(&app)?; + let output_path = subtitle_ocr_temp_output_path(&input_path, stream_index, &codec, &item_id)?; + let sidecar_path = vobsub_sidecar_path(&output_path, &codec); + let mut registered_paths = vec![output_path.to_string_lossy().to_string()]; + if let Some(sidecar_path) = sidecar_path.as_ref() { + registered_paths.push(sidecar_path.to_string_lossy().to_string()); + } + + super::state::begin_operation(&item_id, &run_id)?; + let result = async { + if super::state::register_output_paths(&item_id, &run_id, registered_paths)? 
{ + return Err(SUBTITLE_OCR_CANCELLED.to_string()); + } + + run_prepare_subtitle_ocr_ffmpeg( + &app, + &ffmpeg_path, + &input_path, + &output_path, + stream_index, + &codec, + &item_id, + &run_id, + ) + .await?; + + if let Some(sidecar_path) = sidecar_path.as_ref() { + if !sidecar_path.exists() { + return Err(format!( + "Expected VobSub .sub sidecar not found after extraction: {}", + sidecar_path.display() + )); + } + } + + if super::state::is_operation_cancelled(&item_id, &run_id) { + return Err(SUBTITLE_OCR_CANCELLED.to_string()); + } + + Ok(()) + } + .await; + + if result.is_err() { + remove_registered_outputs(&item_id, &run_id); + } + let _ = super::state::clear_registered_operation(&item_id, &run_id); + if result.is_ok() { + let _ = super::state::clear_cancelled(&item_id, &run_id); + } + result?; + Ok(output_path.to_string_lossy().to_string()) +} + +async fn run_prepare_subtitle_ocr_ffmpeg( + app: &tauri::AppHandle, + ffmpeg_path: &str, + input_path: &str, + output_path: &Path, + stream_index: u32, + codec: &str, + item_id: &str, + run_id: &str, +) -> Result<(), String> { + validate_output_path(output_path.to_string_lossy().as_ref())?; + let output_path_string = output_path.to_string_lossy().to_string(); + let args = + build_prepare_subtitle_ocr_args(input_path, &output_path_string, stream_index, codec)?; + + emit_extract_progress(app, item_id, run_id, 0); + + let child = tokio_command(ffmpeg_path) + .args(&args) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .map_err(|e| { + format!( + "Failed to execute ffmpeg: {}. Make sure FFmpeg is installed.", + e + ) + })?; + + if let Some(pid) = child.id() { + if super::state::register_operation_pid(item_id, run_id, pid)? { + if let Some(pid) = super::state::take_operation_pid(item_id, run_id)? 
{ + terminate_process(pid); + } + return Err(SUBTITLE_OCR_CANCELLED.to_string()); + } + } else if super::state::is_operation_cancelled(item_id, run_id) { + return Err(SUBTITLE_OCR_CANCELLED.to_string()); + } + + let output = match wait_with_output_timeout( + child, + "Subtitle OCR FFmpeg extraction", + SUBTITLE_OCR_EXTRACT_TIMEOUT, + ) + .await + { + Ok(output) => output, + Err(error) => { + if let Some(pid) = super::state::take_operation_pid(item_id, run_id)? { + terminate_process(pid); + } + return Err(error); + } + }; + + let _ = super::state::take_operation_pid(item_id, run_id)?; + + if super::state::is_operation_cancelled(item_id, run_id) { + return Err(SUBTITLE_OCR_CANCELLED.to_string()); + } + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + return Err(format!("Subtitle OCR extraction failed: {}", stderr)); + } + + emit_extract_progress(app, item_id, run_id, 1); + Ok(()) +} + +fn remove_registered_outputs(item_id: &str, run_id: &str) { + if let Ok(paths) = super::state::take_output_paths(item_id, run_id) { + for path in paths { + let _ = std::fs::remove_file(path); + } + } +} + +fn emit_extract_progress(app: &tauri::AppHandle, item_id: &str, run_id: &str, current: u32) { + let _ = app.emit( + "subtitle-ocr-progress", + SubtitleOcrProgressEvent::new(item_id, run_id, "extracting", current, 1), + ); +} + +pub(super) fn subtitle_ocr_extension_for_codec(codec: &str) -> Option<&'static str> { + match codec.to_ascii_lowercase().as_str() { + "hdmv_pgs_subtitle" | "pgs" => Some("sup"), + _ => None, + } +} + +pub(super) fn build_prepare_subtitle_ocr_args( + input_path: &str, + output_path: &str, + stream_index: u32, + codec: &str, +) -> Result, String> { + ensure_container_extraction_supported(codec)?; + subtitle_ocr_extension_for_codec(codec) + .ok_or_else(|| format!("Unsupported Subtitle OCR codec: {}", codec))?; + + let mut args = vec![ + "-y".to_string(), + "-i".to_string(), + input_path.to_string(), + "-map".to_string(), + 
format!("0:{}", stream_index), + "-c:s".to_string(), + "copy".to_string(), + ]; + + args.push(output_path.to_string()); + Ok(args) +} + +fn ensure_container_extraction_supported(codec: &str) -> Result<(), String> { + if codec.eq_ignore_ascii_case("dvd_subtitle") { + Err(VOBSUB_CONTAINER_EXTRACTION_UNSUPPORTED.to_string()) + } else { + Ok(()) + } +} + +fn subtitle_ocr_temp_output_path( + input_path: &str, + stream_index: u32, + codec: &str, + item_id: &str, +) -> Result { + let extension = subtitle_ocr_extension_for_codec(codec) + .ok_or_else(|| format!("Unsupported Subtitle OCR codec: {}", codec))?; + let dir = std::env::temp_dir().join("MediaFlow").join("subtitle-ocr"); + std::fs::create_dir_all(&dir) + .map_err(|e| format!("Failed to create Subtitle OCR temp directory: {}", e))?; + let input_hash = crate::shared::hash::stable_hash64(input_path); + let filename = format!( + "{}-{:016x}-stream-{}.{}", + sanitize_file_component(item_id), + input_hash, + stream_index, + extension + ); + Ok(dir.join(filename)) +} + +fn sanitize_file_component(value: &str) -> String { + let sanitized = value + .chars() + .map(|ch| { + if ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_') { + ch + } else { + '_' + } + }) + .collect::(); + + if sanitized.is_empty() { + "item".to_string() + } else { + sanitized + } +} + +fn validate_item_id(item_id: &str) -> Result<(), String> { + if item_id.trim().is_empty() { + Err("Subtitle OCR item id is required".to_string()) + } else { + Ok(()) + } +} + +fn validate_run_id(run_id: &str) -> Result<(), String> { + if run_id.trim().is_empty() { + Err("Subtitle OCR run id is required".to_string()) + } else { + Ok(()) + } +} + +fn vobsub_sidecar_path(output_path: &Path, codec: &str) -> Option { + codec + .eq_ignore_ascii_case("dvd_subtitle") + .then(|| output_path.with_extension("sub")) +} + +#[cfg(test)] +mod tests { + use super::{build_prepare_subtitle_ocr_args, subtitle_ocr_extension_for_codec}; + + #[test] + fn 
subtitle_ocr_extension_for_codec_maps_pgs_only() { + assert_eq!( + subtitle_ocr_extension_for_codec("hdmv_pgs_subtitle"), + Some("sup") + ); + assert_eq!(subtitle_ocr_extension_for_codec("pgs"), Some("sup")); + assert_eq!(subtitle_ocr_extension_for_codec("dvd_subtitle"), None); + } + + #[test] + fn build_prepare_subtitle_ocr_args_extracts_selected_stream() { + let args = build_prepare_subtitle_ocr_args( + "/media/input.mkv", + "/tmp/item-stream-2.sup", + 2, + "hdmv_pgs_subtitle", + ) + .expect("args should build"); + + assert_eq!( + args, + vec![ + "-y", + "-i", + "/media/input.mkv", + "-map", + "0:2", + "-c:s", + "copy", + "/tmp/item-stream-2.sup" + ] + ); + } + + #[test] + fn build_prepare_subtitle_ocr_args_rejects_dvd_subtitle_container_extraction() { + let error = build_prepare_subtitle_ocr_args( + "/media/input.mkv", + "/tmp/item-stream-3.idx", + 3, + "dvd_subtitle", + ) + .expect_err("container VobSub extraction should be unsupported"); + + assert_eq!( + error, + "Container VobSub extraction is not supported by the bundled FFmpeg path. Import the .idx/.sub pair directly." + ); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/import.rs b/src-tauri/src/tools/subtitle_ocr/import.rs new file mode 100644 index 00000000..1b88c4b9 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/import.rs @@ -0,0 +1,375 @@ +use std::path::{Path, PathBuf}; + +use serde_json::Value; + +use crate::shared::store::resolve_ffprobe_path; +use crate::shared::validation::validate_media_path; +use crate::tools::ffprobe::probe::probe_file_with_ffprobe; +use crate::tools::subtitle_ocr::SubtitleOcrTrackInfo; + +const VOBSUB_CONTAINER_EXTRACTION_UNSUPPORTED: &str = "Container VobSub extraction is not supported by the bundled FFmpeg path. 
Import the .idx/.sub pair directly."; + +#[derive(Debug, Clone, serde::Serialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrVobSubPairInfo { + pub(crate) idx_path: String, + pub(crate) sub_path: String, +} + +#[tauri::command] +pub(crate) async fn probe_subtitle_ocr_tracks( + app: tauri::AppHandle, + path: String, +) -> Result, String> { + validate_media_path(&path)?; + let ffprobe_path = resolve_ffprobe_path(&app)?; + let probe_json = probe_file_with_ffprobe(&ffprobe_path, &path).await?; + parse_tracks_from_probe_json(&probe_json) +} + +#[tauri::command] +pub(crate) async fn resolve_subtitle_ocr_vobsub_pair( + path: String, +) -> Result { + resolve_vobsub_pair(&path) +} + +pub(super) fn codec_label(codec: &str) -> Option<&'static str> { + match codec.to_ascii_lowercase().as_str() { + "hdmv_pgs_subtitle" | "pgs" => Some("PGS"), + _ => None, + } +} + +pub(super) fn parse_tracks_from_probe_json( + probe_json: &str, +) -> Result, String> { + let value: Value = serde_json::from_str(probe_json) + .map_err(|e| format!("Failed to parse ffprobe subtitle metadata: {}", e))?; + let streams = value + .get("streams") + .and_then(Value::as_array) + .ok_or_else(|| "FFprobe output did not contain streams".to_string())?; + + let tracks = streams + .iter() + .filter_map(parse_track_from_stream) + .collect::>(); + if tracks.is_empty() && streams.iter().any(stream_is_container_vobsub) { + return Err(VOBSUB_CONTAINER_EXTRACTION_UNSUPPORTED.to_string()); + } + + Ok(tracks) +} + +fn parse_track_from_stream(stream: &Value) -> Option { + if stream.get("codec_type").and_then(Value::as_str) != Some("subtitle") { + return None; + } + + let codec = stream.get("codec_name").and_then(Value::as_str)?; + let codec_label = codec_label(codec)?; + let stream_index = stream + .get("index") + .and_then(Value::as_u64) + .and_then(|value| u32::try_from(value).ok())?; + + Some(SubtitleOcrTrackInfo { + stream_index, + codec: codec.to_string(), + codec_label: 
codec_label.to_string(), + language: tag_value(stream, "language"), + title: tag_value(stream, "title"), + forced: disposition_flag(stream, "forced"), + r#default: disposition_flag(stream, "default"), + }) +} + +fn tag_value(stream: &Value, key: &str) -> Option { + stream + .get("tags") + .and_then(Value::as_object) + .and_then(|tags| { + tags.iter().find_map(|(tag_key, value)| { + tag_key + .eq_ignore_ascii_case(key) + .then(|| value.as_str().map(str::trim)) + .flatten() + .filter(|value| !value.is_empty()) + .map(ToOwned::to_owned) + }) + }) +} + +fn stream_is_container_vobsub(stream: &Value) -> bool { + stream.get("codec_type").and_then(Value::as_str) == Some("subtitle") + && stream + .get("codec_name") + .and_then(Value::as_str) + .is_some_and(|codec| codec.eq_ignore_ascii_case("dvd_subtitle")) +} + +fn disposition_flag(stream: &Value, key: &str) -> bool { + stream + .get("disposition") + .and_then(|disposition| disposition.get(key)) + .is_some_and(ffprobe_flag_is_enabled) +} + +fn ffprobe_flag_is_enabled(value: &Value) -> bool { + value.as_bool().unwrap_or_else(|| { + value + .as_i64() + .map(|flag| flag != 0) + .or_else(|| { + value + .as_str() + .map(|flag| flag == "1" || flag.eq_ignore_ascii_case("true")) + }) + .unwrap_or(false) + }) +} + +fn resolve_vobsub_pair(path: &str) -> Result { + let path = Path::new(path); + let extension = lower_extension(path).ok_or_else(|| { + "Expected .idx or .sub Subtitle OCR source, got path without extension".to_string() + })?; + + match extension.as_str() { + "idx" => { + validate_existing_vobsub_part(path, "idx")?; + let sub_path = sibling_with_extension(path, "sub"); + validate_existing_vobsub_sidecar(&sub_path, "sub")?; + Ok(SubtitleOcrVobSubPairInfo { + idx_path: path.to_string_lossy().to_string(), + sub_path: sub_path.to_string_lossy().to_string(), + }) + } + "sub" => { + validate_existing_vobsub_part(path, "sub")?; + let idx_path = sibling_with_extension(path, "idx"); + validate_existing_vobsub_sidecar(&idx_path, 
"idx")?; + Ok(SubtitleOcrVobSubPairInfo { + idx_path: idx_path.to_string_lossy().to_string(), + sub_path: path.to_string_lossy().to_string(), + }) + } + ext => Err(format!( + "Expected .idx or .sub Subtitle OCR source, got .{}", + ext + )), + } +} + +fn validate_existing_vobsub_part(path: &Path, extension: &str) -> Result<(), String> { + if !path.exists() { + return Err(format!("File not found: {}", path.display())); + } + if !path.is_file() { + return Err(format!("Not a file: {}", path.display())); + } + match lower_extension(path).as_deref() { + Some(ext) if ext == extension => Ok(()), + Some(ext) => Err(format!( + "Expected .{} Subtitle OCR source, got .{}", + extension, ext + )), + None => Err(format!( + "Expected .{} Subtitle OCR source, got path without extension", + extension + )), + } +} + +fn validate_existing_vobsub_sidecar(path: &Path, extension: &str) -> Result<(), String> { + if !path.exists() { + return Err(format!( + "VobSub .{} sidecar not found: {}", + extension, + path.display() + )); + } + validate_existing_vobsub_part(path, extension) +} + +fn lower_extension(path: &Path) -> Option { + path.extension() + .and_then(|extension| extension.to_str()) + .map(|extension| extension.to_ascii_lowercase()) +} + +fn sibling_with_extension(path: &Path, extension: &str) -> PathBuf { + let direct = path.with_extension(extension); + find_sibling_with_extension(path, extension).unwrap_or(direct) +} + +fn find_sibling_with_extension(path: &Path, extension: &str) -> Option { + let parent = path.parent()?; + let stem = path.file_stem()?.to_string_lossy(); + + for entry in std::fs::read_dir(parent).ok()?.flatten() { + let candidate = entry.path(); + let Some(candidate_stem) = candidate.file_stem() else { + continue; + }; + let Some(candidate_extension) = candidate.extension() else { + continue; + }; + let candidate_stem = candidate_stem.to_string_lossy(); + let candidate_extension = candidate_extension.to_string_lossy(); + if 
candidate_stem.eq_ignore_ascii_case(&stem) + && candidate_extension.eq_ignore_ascii_case(extension) + { + return Some(candidate); + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::{codec_label, parse_tracks_from_probe_json, resolve_vobsub_pair}; + + #[test] + fn codec_label_accepts_bitmap_subtitle_codecs() { + assert_eq!(codec_label("hdmv_pgs_subtitle"), Some("PGS")); + assert_eq!(codec_label("pgs"), Some("PGS")); + } + + #[test] + fn codec_label_rejects_unsupported_container_subtitle_codecs() { + assert_eq!(codec_label("subrip"), None); + assert_eq!(codec_label("ass"), None); + assert_eq!(codec_label("dvd_subtitle"), None); + } + + #[test] + fn parse_tracks_from_probe_json_filters_supported_bitmap_subtitle_streams() { + let json = r#"{ + "streams": [ + { "index": 0, "codec_type": "video", "codec_name": "h264" }, + { + "index": 2, + "codec_type": "subtitle", + "codec_name": "hdmv_pgs_subtitle", + "tags": { "language": "eng", "title": "Signs" }, + "disposition": { "forced": 1, "default": 0 } + }, + { + "index": 3, + "codec_type": "subtitle", + "codec_name": "subrip", + "tags": { "language": "eng" } + }, + { + "index": 4, + "codec_type": "subtitle", + "codec_name": "dvd_subtitle", + "tags": { "LANGUAGE": "jpn", "TITLE": "Main" }, + "disposition": { "forced": "false", "default": "true" } + } + ] + }"#; + + let tracks = parse_tracks_from_probe_json(json).expect("tracks should parse"); + + assert_eq!(tracks.len(), 1); + assert_eq!(tracks[0].stream_index, 2); + assert_eq!(tracks[0].codec_label, "PGS"); + assert_eq!(tracks[0].language.as_deref(), Some("eng")); + assert_eq!(tracks[0].title.as_deref(), Some("Signs")); + assert!(tracks[0].forced); + assert!(!tracks[0].r#default); + } + + #[test] + fn parse_tracks_from_probe_json_reports_container_vobsub_as_unsupported() { + let json = r#"{ + "streams": [ + { "index": 0, "codec_type": "video", "codec_name": "h264" }, + { + "index": 3, + "codec_type": "subtitle", + "codec_name": "dvd_subtitle", + "tags": { 
"language": "eng" } + } + ] + }"#; + + let error = parse_tracks_from_probe_json(json) + .expect_err("container VobSub should get an actionable unsupported message"); + + assert!(error.contains("Container VobSub extraction is not supported")); + } + + #[test] + fn resolve_vobsub_pair_accepts_selected_sub_with_sibling_idx() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("Movie.idx"); + let sub = dir.path().join("Movie.sub"); + std::fs::write(&idx, b"# VobSub index file").expect("failed to write idx"); + std::fs::write(&sub, b"sub").expect("failed to write sub"); + + let pair = resolve_vobsub_pair(sub.to_string_lossy().as_ref()) + .expect("selected .sub should resolve sibling .idx"); + + assert_eq!(pair.idx_path, idx.to_string_lossy()); + assert_eq!(pair.sub_path, sub.to_string_lossy()); + } + + #[test] + fn resolve_vobsub_pair_accepts_selected_idx_with_sibling_sub() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("Movie.idx"); + let sub = dir.path().join("Movie.sub"); + std::fs::write(&idx, b"# VobSub index file").expect("failed to write idx"); + std::fs::write(&sub, b"sub").expect("failed to write sub"); + + let pair = resolve_vobsub_pair(idx.to_string_lossy().as_ref()) + .expect("selected .idx should resolve sibling .sub"); + + assert_eq!(pair.idx_path, idx.to_string_lossy()); + assert_eq!(pair.sub_path, sub.to_string_lossy()); + } + + #[test] + fn resolve_vobsub_pair_accepts_uppercase_sidecar_extensions() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let idx = dir.path().join("Movie.IDX"); + let sub = dir.path().join("Movie.SUB"); + std::fs::write(&idx, b"# VobSub index file").expect("failed to write idx"); + std::fs::write(&sub, b"sub").expect("failed to write sub"); + + let pair = resolve_vobsub_pair(idx.to_string_lossy().as_ref()) + .expect("selected uppercase .idx should resolve uppercase .sub"); + + assert_eq!(pair.idx_path, 
idx.to_string_lossy()); + assert_eq!(pair.sub_path, sub.to_string_lossy()); + } + + #[test] + fn resolve_vobsub_pair_rejects_missing_sibling() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let sub = dir.path().join("Movie.sub"); + std::fs::write(&sub, b"sub").expect("failed to write sub"); + + let error = resolve_vobsub_pair(sub.to_string_lossy().as_ref()) + .expect_err("missing idx should fail"); + + assert!(error.contains("VobSub .idx sidecar not found")); + } + + #[test] + fn resolve_vobsub_pair_rejects_non_vobsub_extension() { + let dir = tempfile::tempdir().expect("failed to create tempdir"); + let txt = dir.path().join("Movie.txt"); + std::fs::write(&txt, b"text").expect("failed to write txt"); + + let error = resolve_vobsub_pair(txt.to_string_lossy().as_ref()) + .expect_err("unsupported extension should fail"); + + assert!(error.contains("Expected .idx or .sub Subtitle OCR source")); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/mod.rs b/src-tauri/src/tools/subtitle_ocr/mod.rs new file mode 100644 index 00000000..bfc8fcbd --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/mod.rs @@ -0,0 +1,229 @@ +pub(crate) mod assets; +pub(crate) mod cancel; +pub(crate) mod decode; +pub(crate) mod export; +pub(crate) mod extract; +pub(crate) mod import; +pub(crate) mod ocr; +pub(crate) mod progress; +pub(crate) mod restore; +pub(crate) mod stabilize; +pub(crate) mod state; +pub(crate) mod text; + +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrTrackInfo { + pub(crate) stream_index: u32, + pub(crate) codec: String, + pub(crate) codec_label: String, + pub(crate) language: Option, + pub(crate) title: Option, + pub(crate) forced: bool, + #[serde(rename = "default")] + pub(crate) r#default: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +#[allow(dead_code)] +pub(crate) 
struct SubtitleOcrBox { + pub(crate) text: String, + pub(crate) confidence: f64, + pub(crate) x: f64, + pub(crate) y: f64, + pub(crate) width: f64, + pub(crate) height: f64, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub(crate) enum SubtitleOcrPlacement { + Top, + Bottom, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrCue { + pub(crate) id: String, + pub(crate) source_cue_ids: Vec, + pub(crate) start_time_ms: u64, + pub(crate) end_time_ms: u64, + pub(crate) text: String, + pub(crate) confidence: f64, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) placement: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) placement_source_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) top_placement_source_count: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrDecodedCue { + pub(crate) cue_id: String, + pub(crate) start_time_ms: u64, + pub(crate) end_time_ms: u64, + pub(crate) width: u32, + pub(crate) height: u32, + pub(crate) cache_key: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) preview_path: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrRawCue { + pub(crate) cue_id: String, + pub(crate) start_time_ms: u64, + pub(crate) end_time_ms: u64, + pub(crate) cache_key: String, + pub(crate) boxes: Vec, + pub(crate) text: String, + pub(crate) confidence: f64, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) placement: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) placement_source_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) top_placement_source_count: Option, +} + +#[derive(Debug, 
Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrLiveCueEvent { + pub(crate) item_id: String, + pub(crate) run_id: String, + pub(crate) bitmap: SubtitleOcrDecodedCue, + pub(crate) raw_cue: SubtitleOcrRawCue, + pub(crate) provisional_cue: SubtitleOcrCue, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrPipelineResult { + pub(crate) decoded_cues: Vec, + pub(crate) raw_ocr_cues: Vec, + pub(crate) stabilized_cues: Vec, + pub(crate) final_cues: Vec, + pub(crate) stats: SubtitleOcrPipelineStats, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrPipelineStats { + pub(crate) decoded_bitmap_count: u32, + pub(crate) skipped_empty_bitmap_count: u32, + pub(crate) ocr_processed_bitmap_count: u32, + pub(crate) deduplicated_bitmap_count: u32, +} + +#[cfg(test)] +mod tests { + use super::{ + SubtitleOcrBox, SubtitleOcrCue, SubtitleOcrDecodedCue, SubtitleOcrLiveCueEvent, + SubtitleOcrPlacement, SubtitleOcrRawCue, + }; + + #[test] + fn decoded_cue_serializes_optional_preview_path() { + let cue = SubtitleOcrDecodedCue { + cue_id: "cue-1".to_string(), + start_time_ms: 1_000, + end_time_ms: 2_000, + width: 1920, + height: 1080, + cache_key: "cache-key".to_string(), + preview_path: Some("/tmp/MediaFlow/subtitle-ocr/preview.png".to_string()), + }; + + let value = serde_json::to_value(cue).expect("decoded cue should serialize"); + + assert_eq!( + value.get("previewPath").and_then(serde_json::Value::as_str), + Some("/tmp/MediaFlow/subtitle-ocr/preview.png") + ); + } + + #[test] + fn decoded_cue_omits_missing_preview_path() { + let cue = SubtitleOcrDecodedCue { + cue_id: "cue-1".to_string(), + start_time_ms: 1_000, + end_time_ms: 2_000, + width: 1920, + height: 1080, + cache_key: "cache-key".to_string(), + preview_path: None, + }; + + let value = 
serde_json::to_value(cue).expect("decoded cue should serialize"); + + assert!(value.get("previewPath").is_none()); + } + + #[test] + fn live_cue_event_serializes_frontend_contract() { + let event = SubtitleOcrLiveCueEvent { + item_id: "item-1".to_string(), + run_id: "run-1".to_string(), + bitmap: SubtitleOcrDecodedCue { + cue_id: "cue-1".to_string(), + start_time_ms: 1_000, + end_time_ms: 2_000, + width: 1920, + height: 1080, + cache_key: "cache-key".to_string(), + preview_path: Some("/tmp/preview.png".to_string()), + }, + raw_cue: SubtitleOcrRawCue { + cue_id: "cue-1".to_string(), + start_time_ms: 1_000, + end_time_ms: 2_000, + cache_key: "cache-key".to_string(), + boxes: vec![SubtitleOcrBox { + text: "Hello".to_string(), + confidence: 0.9, + x: 0.1, + y: 0.2, + width: 0.3, + height: 0.4, + }], + text: "Hello".to_string(), + confidence: 0.9, + placement: Some(SubtitleOcrPlacement::Top), + placement_source_count: Some(1), + top_placement_source_count: Some(1), + }, + provisional_cue: SubtitleOcrCue { + id: "cue-1".to_string(), + source_cue_ids: vec!["cue-1".to_string()], + start_time_ms: 1_000, + end_time_ms: 2_000, + text: "Hello".to_string(), + confidence: 0.9, + placement: Some(SubtitleOcrPlacement::Top), + placement_source_count: Some(1), + top_placement_source_count: Some(1), + }, + }; + + let value = serde_json::to_value(event).expect("live cue event should serialize"); + + assert_eq!(value["itemId"], "item-1"); + assert_eq!(value["runId"], "run-1"); + assert_eq!(value["bitmap"]["previewPath"], "/tmp/preview.png"); + assert_eq!(value["rawCue"]["boxes"][0]["text"], "Hello"); + assert_eq!(value["rawCue"]["placement"], "top"); + assert_eq!(value["rawCue"]["placementSourceCount"], 1); + assert_eq!(value["rawCue"]["topPlacementSourceCount"], 1); + assert_eq!(value["provisionalCue"]["placement"], "top"); + assert_eq!(value["provisionalCue"]["placementSourceCount"], 1); + assert_eq!(value["provisionalCue"]["topPlacementSourceCount"], 1); + 
assert_eq!(value["provisionalCue"]["sourceCueIds"][0], "cue-1"); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/ocr.rs b/src-tauri/src/tools/subtitle_ocr/ocr.rs new file mode 100644 index 00000000..6bd7087d --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/ocr.rs @@ -0,0 +1,1170 @@ +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::{ + Arc, + atomic::{AtomicBool, AtomicU32, Ordering}, +}; +use std::thread::{self, JoinHandle}; + +use image::{DynamicImage, RgbaImage}; +use tauri::Emitter; + +use crate::shared::sleep_inhibit::SleepInhibitGuard; +use crate::tools::ocr::{create_ocr_engine, get_ocr_models_dir, resolve_ocr_engine_threads}; +use crate::tools::subtitle_ocr::assets::write_decoded_bitmap_assets; +use crate::tools::subtitle_ocr::decode::{ + BitmapSubtitleSource, DecodedBitmapCue, count_bitmap_subtitle_source_with_stop, + decode_bitmap_subtitle_source_with_handler, is_empty_subtitle_bitmap_rgba, + validate_bitmap_subtitle_source, +}; +use crate::tools::subtitle_ocr::progress::{ProgressTotal, SubtitleOcrProgressEmitter}; +use crate::tools::subtitle_ocr::stabilize::stabilize_cues; +use crate::tools::subtitle_ocr::text::reconstruct_text_from_boxes; +use crate::tools::subtitle_ocr::{ + SubtitleOcrBox, SubtitleOcrCue, SubtitleOcrDecodedCue, SubtitleOcrLiveCueEvent, + SubtitleOcrPipelineResult, SubtitleOcrPipelineStats, SubtitleOcrPlacement, SubtitleOcrRawCue, +}; + +#[derive(Clone)] +struct PipelineProgress { + ocr: SubtitleOcrProgressEmitter, + ai_cleaning: SubtitleOcrProgressEmitter, +} + +#[derive(Debug, Clone)] +struct CachedSubtitleOcrBitmap { + width: u32, + height: u32, + preview_path: String, + boxes: Vec, + text: String, + confidence: f64, + placement: Option, + placement_source_count: Option, + top_placement_source_count: Option, +} + +impl CachedSubtitleOcrBitmap { + fn from_raw( + preview_path: String, + metadata: &SubtitleOcrDecodedCue, + raw_cue: &SubtitleOcrRawCue, + ) -> Self { + Self { + width: metadata.width, + 
height: metadata.height, + preview_path, + boxes: raw_cue.boxes.clone(), + text: raw_cue.text.clone(), + confidence: raw_cue.confidence, + placement: raw_cue.placement, + placement_source_count: raw_cue.placement_source_count, + top_placement_source_count: raw_cue.top_placement_source_count, + } + } + + fn matches_dimensions(&self, metadata: &SubtitleOcrDecodedCue) -> bool { + self.width == metadata.width && self.height == metadata.height + } + + fn raw_cue_for_metadata(&self, metadata: &SubtitleOcrDecodedCue) -> SubtitleOcrRawCue { + SubtitleOcrRawCue { + cue_id: metadata.cue_id.clone(), + start_time_ms: metadata.start_time_ms, + end_time_ms: metadata.end_time_ms, + cache_key: metadata.cache_key.clone(), + boxes: self.boxes.clone(), + text: self.text.clone(), + confidence: self.confidence, + placement: self.placement, + placement_source_count: self.placement_source_count, + top_placement_source_count: self.top_placement_source_count, + } + } +} + +#[tauri::command] +pub(crate) async fn run_subtitle_ocr_pipeline( + app: tauri::AppHandle, + item_id: String, + run_id: String, + source_path: String, + idx_path: Option, + sub_path: Option, + language: String, + use_gpu: bool, + expected_bitmap_count: Option, +) -> Result { + if item_id.trim().is_empty() { + return Err("Subtitle OCR item id is required".to_string()); + } + if run_id.trim().is_empty() { + return Err("Subtitle OCR run id is required".to_string()); + } + + let _sleep_guard = SleepInhibitGuard::try_acquire("Running Subtitle OCR pipeline").ok(); + let source = + validate_bitmap_subtitle_source(&source_path, idx_path.as_deref(), sub_path.as_deref())?; + let models_dir = get_ocr_models_dir(&app)?; + super::state::begin_operation(&item_id, &run_id)?; + + let item_id_for_task = item_id.clone(); + let run_id_for_task = run_id.clone(); + let task = tokio::task::spawn_blocking(move || { + run_subtitle_ocr_pipeline_blocking( + &item_id_for_task, + &run_id_for_task, + &source, + app, + models_dir, + &language, + 
use_gpu, + expected_bitmap_count, + ) + }); + + let join_result = task.await; + let _ = super::state::clear_registered_operation(&item_id, &run_id); + let result = join_result.map_err(|e| format!("Subtitle OCR pipeline task failed: {}", e))?; + if result.is_ok() { + let _ = super::state::clear_cancelled(&item_id, &run_id); + } + + result +} + +fn run_subtitle_ocr_pipeline_blocking( + item_id: &str, + run_id: &str, + source: &super::decode::BitmapSubtitleSource, + app: tauri::AppHandle, + models_dir: PathBuf, + language: &str, + use_gpu: bool, + expected_bitmap_count: Option, +) -> Result { + ensure_not_cancelled(item_id, run_id)?; + let live_event_app = app.clone(); + let progress = PipelineProgress { + ocr: SubtitleOcrProgressEmitter::new( + app.clone(), + item_id.to_string(), + run_id.to_string(), + "ocr", + initial_bitmap_total(expected_bitmap_count), + ), + ai_cleaning: SubtitleOcrProgressEmitter::new( + app, + item_id.to_string(), + run_id.to_string(), + "ai_cleaning", + 1, + ), + }; + let processed_count = Arc::new(AtomicU32::new(0)); + let mut background_count = start_background_bitmap_count( + should_start_background_count(expected_bitmap_count), + source, + item_id, + run_id, + progress.ocr.clone(), + Arc::clone(&processed_count), + ); + + let engine_threads = resolve_ocr_engine_threads(1); + let engine = create_ocr_engine(&models_dir, language, use_gpu, engine_threads, true)?; + progress.ocr.emit_force(0); + + let mut decoded_metadata = Vec::new(); + let mut raw_ocr_cues = Vec::new(); + let mut final_candidates = Vec::new(); + let mut stats = SubtitleOcrPipelineStats::default(); + let mut ocr_cache: HashMap> = HashMap::new(); + decode_bitmap_subtitle_source_with_handler(source, item_id, run_id, |mut decoded| { + ensure_not_cancelled(item_id, run_id)?; + stats.decoded_bitmap_count = stats.decoded_bitmap_count.saturating_add(1); + processed_count.store(stats.decoded_bitmap_count, Ordering::Relaxed); + + if is_empty_subtitle_bitmap_rgba(&decoded.rgba) { + 
stats.skipped_empty_bitmap_count = stats.skipped_empty_bitmap_count.saturating_add(1); + progress.ocr.emit(stats.decoded_bitmap_count); + return Ok(()); + } + + if let Some(cached) = ocr_cache + .get(&decoded.content_hash) + .and_then(|cached_entries| { + cached_entries + .iter() + .find(|entry| entry.matches_dimensions(&decoded.metadata)) + }) + { + stats.deduplicated_bitmap_count = stats.deduplicated_bitmap_count.saturating_add(1); + decoded.metadata.preview_path = Some(cached.preview_path.clone()); + let metadata = decoded.metadata.clone(); + let raw_cue = cached.raw_cue_for_metadata(&metadata); + append_ocr_result( + &live_event_app, + item_id, + run_id, + metadata, + raw_cue, + &mut decoded_metadata, + &mut raw_ocr_cues, + &mut final_candidates, + ); + progress.ocr.emit(stats.decoded_bitmap_count); + return Ok(()); + } + + let content_hash = decoded.content_hash; + let bitmap_assets = + write_decoded_bitmap_assets(item_id, run_id, &decoded.metadata, &decoded.rgba)?; + let preview_path = bitmap_assets.preview_path; + decoded.metadata.preview_path = Some(preview_path.clone()); + let metadata = decoded.metadata.clone(); + let raw_cue = ocr_decoded_bitmap( + &engine, + decoded, + matches!(source, BitmapSubtitleSource::Pgs { .. 
}), + )?; + stats.ocr_processed_bitmap_count = stats.ocr_processed_bitmap_count.saturating_add(1); + ocr_cache + .entry(content_hash) + .or_default() + .push(CachedSubtitleOcrBitmap::from_raw( + preview_path, + &metadata, + &raw_cue, + )); + append_ocr_result( + &live_event_app, + item_id, + run_id, + metadata, + raw_cue, + &mut decoded_metadata, + &mut raw_ocr_cues, + &mut final_candidates, + ); + progress.ocr.emit(stats.decoded_bitmap_count); + Ok(()) + })?; + + stop_background_bitmap_count(&mut background_count); + progress + .ocr + .emit_force_with_total(stats.decoded_bitmap_count, stats.decoded_bitmap_count); + + ensure_not_cancelled(item_id, run_id)?; + if stats.decoded_bitmap_count == 0 { + progress.ai_cleaning.emit_force(1); + return Ok(empty_subtitle_ocr_pipeline_result()); + } + + if raw_ocr_cues.is_empty() { + progress.ai_cleaning.emit_force(1); + return Ok(SubtitleOcrPipelineResult { + decoded_cues: decoded_metadata, + raw_ocr_cues, + final_cues: Vec::new(), + stabilized_cues: Vec::new(), + stats, + }); + } + + progress.ai_cleaning.emit_force(0); + let stabilized_cues = stabilize_cues(&final_candidates); + let final_cues = build_final_subtitle_ocr_cues(&raw_ocr_cues, stabilized_cues.clone()); + progress.ai_cleaning.emit_force(1); + + Ok(SubtitleOcrPipelineResult { + decoded_cues: decoded_metadata, + raw_ocr_cues, + final_cues, + stabilized_cues, + stats, + }) +} + +struct BackgroundBitmapCountTask { + stop: Arc, + handle: Option>, +} + +impl Drop for BackgroundBitmapCountTask { + fn drop(&mut self) { + self.stop(); + } +} + +impl BackgroundBitmapCountTask { + fn stop(&mut self) { + self.stop.store(true, Ordering::Relaxed); + if let Some(handle) = self.handle.take() { + let _ = handle.join(); + } + } +} + +fn stop_background_bitmap_count(background_count: &mut Option) { + if let Some(mut task) = background_count.take() { + task.stop(); + } +} + +fn start_background_bitmap_count( + enabled: bool, + source: &BitmapSubtitleSource, + item_id: &str, + run_id: 
&str, + progress: SubtitleOcrProgressEmitter, + processed_count: Arc, +) -> Option { + if !enabled { + return None; + } + + let stop = Arc::new(AtomicBool::new(false)); + let thread_stop = Arc::clone(&stop); + let source = source.clone(); + let item_id = item_id.to_string(); + let run_id = run_id.to_string(); + let handle = thread::spawn(move || { + let count = count_bitmap_subtitle_source_with_stop(&source, &item_id, &run_id, || { + thread_stop.load(Ordering::Relaxed) + }); + if thread_stop.load(Ordering::Relaxed) + || super::state::is_operation_cancelled(&item_id, &run_id) + { + return; + } + + if let Ok(total) = count { + let current = processed_count.load(Ordering::Relaxed); + if thread_stop.load(Ordering::Relaxed) + || super::state::is_operation_cancelled(&item_id, &run_id) + { + return; + } + + progress.emit_force_with_total(current, total); + } + }); + + Some(BackgroundBitmapCountTask { + stop, + handle: Some(handle), + }) +} + +fn empty_subtitle_ocr_pipeline_result() -> SubtitleOcrPipelineResult { + SubtitleOcrPipelineResult { + decoded_cues: Vec::new(), + raw_ocr_cues: Vec::new(), + final_cues: Vec::new(), + stabilized_cues: Vec::new(), + stats: SubtitleOcrPipelineStats::default(), + } +} + +fn initial_bitmap_total(expected_bitmap_count: Option) -> ProgressTotal { + expected_bitmap_count + .filter(|count| *count > 0) + .map(ProgressTotal::Known) + .unwrap_or(ProgressTotal::Unknown) +} + +fn should_start_background_count(expected_bitmap_count: Option) -> bool { + expected_bitmap_count.unwrap_or(0) == 0 +} + +fn build_final_subtitle_ocr_cues( + raw_ocr_cues: &[SubtitleOcrRawCue], + stabilized_cues: Vec, +) -> Vec { + if raw_ocr_cues.is_empty() { + return stabilized_cues; + } + + let mut stabilized_index_by_source = HashMap::new(); + for (index, cue) in stabilized_cues.iter().enumerate() { + for source_cue_id in &cue.source_cue_ids { + stabilized_index_by_source.insert(source_cue_id.as_str(), index); + } + } + + let mut emitted_stabilized_cues = vec![false; 
stabilized_cues.len()]; + let mut final_cues = Vec::with_capacity(raw_ocr_cues.len().max(stabilized_cues.len())); + for raw_cue in raw_ocr_cues { + if let Some(index) = stabilized_index_by_source.get(raw_cue.cue_id.as_str()) { + if !emitted_stabilized_cues[*index] { + final_cues.push(stabilized_cues[*index].clone()); + emitted_stabilized_cues[*index] = true; + } + continue; + } + + final_cues.push(blank_final_cue_from_raw(raw_cue)); + } + + for (index, cue) in stabilized_cues.into_iter().enumerate() { + if !emitted_stabilized_cues[index] { + final_cues.push(cue); + } + } + + final_cues +} + +fn append_ocr_result( + app: &tauri::AppHandle, + item_id: &str, + run_id: &str, + metadata: SubtitleOcrDecodedCue, + raw_cue: SubtitleOcrRawCue, + decoded_metadata: &mut Vec, + raw_ocr_cues: &mut Vec, + final_candidates: &mut Vec, +) { + let provisional_cue = provisional_cue_from_raw(&raw_cue); + emit_live_cue_event(app, item_id, run_id, &metadata, &raw_cue, &provisional_cue); + if !raw_cue.text.trim().is_empty() { + final_candidates.push(provisional_cue); + } + decoded_metadata.push(metadata); + raw_ocr_cues.push(raw_cue); +} + +fn blank_final_cue_from_raw(raw_cue: &SubtitleOcrRawCue) -> SubtitleOcrCue { + SubtitleOcrCue { + id: raw_cue.cue_id.clone(), + source_cue_ids: vec![raw_cue.cue_id.clone()], + start_time_ms: raw_cue.start_time_ms, + end_time_ms: raw_cue.end_time_ms, + text: String::new(), + confidence: raw_cue.confidence, + placement: raw_cue.placement, + placement_source_count: raw_cue.placement_source_count, + top_placement_source_count: raw_cue.top_placement_source_count, + } +} + +fn ensure_not_cancelled(item_id: &str, run_id: &str) -> Result<(), String> { + if super::state::is_operation_cancelled(item_id, run_id) { + Err("Subtitle OCR operation cancelled".to_string()) + } else { + Ok(()) + } +} + +fn provisional_cue_from_raw(raw_cue: &SubtitleOcrRawCue) -> SubtitleOcrCue { + SubtitleOcrCue { + id: raw_cue.cue_id.clone(), + source_cue_ids: 
vec![raw_cue.cue_id.clone()], + start_time_ms: raw_cue.start_time_ms, + end_time_ms: raw_cue.end_time_ms, + text: raw_cue.text.clone(), + confidence: raw_cue.confidence, + placement: raw_cue.placement, + placement_source_count: raw_cue.placement_source_count, + top_placement_source_count: raw_cue.top_placement_source_count, + } +} + +fn emit_live_cue_event( + app: &tauri::AppHandle, + item_id: &str, + run_id: &str, + bitmap: &super::SubtitleOcrDecodedCue, + raw_cue: &SubtitleOcrRawCue, + provisional_cue: &SubtitleOcrCue, +) { + let _ = app.emit( + "subtitle-ocr-live-cue", + SubtitleOcrLiveCueEvent { + item_id: item_id.to_string(), + run_id: run_id.to_string(), + bitmap: bitmap.clone(), + raw_cue: raw_cue.clone(), + provisional_cue: provisional_cue.clone(), + }, + ); +} + +fn ocr_decoded_bitmap( + engine: &ocr_rs::OcrEngine, + decoded: DecodedBitmapCue, + detect_placement: bool, +) -> Result { + let DecodedBitmapCue { metadata, rgba, .. } = decoded; + let image = RgbaImage::from_raw(metadata.width, metadata.height, rgba).ok_or_else(|| { + "Decoded Subtitle OCR bitmap dimensions did not match RGBA data".to_string() + })?; + let image = DynamicImage::ImageRgba8(image); + recognize_subtitle_ocr_image(engine, &metadata, &image, detect_placement) +} + +fn recognize_subtitle_ocr_image( + engine: &ocr_rs::OcrEngine, + metadata: &crate::tools::subtitle_ocr::SubtitleOcrDecodedCue, + image: &DynamicImage, + detect_placement: bool, +) -> Result { + let ocr_results = engine + .recognize(image) + .map_err(|e| format!("Subtitle OCR recognition failed: {}", e))?; + let boxes = ocr_results + .iter() + .map(|result| SubtitleOcrBox { + text: result.text.clone(), + confidence: result.confidence as f64, + x: result.bbox.rect.left().max(0) as f64, + y: result.bbox.rect.top().max(0) as f64, + width: result.bbox.rect.width() as f64, + height: result.bbox.rect.height() as f64, + }) + .collect::>(); + let text = reconstruct_text_from_boxes(&boxes); + let confidence = 
average_confidence(&boxes); + let placement = detect_placement.then(|| placement_from_boxes(&boxes, metadata.height)); + let placement_source_count = placement.map(|_| 1); + let top_placement_source_count = + placement.map(|value| u32::from(value == SubtitleOcrPlacement::Top)); + + Ok(SubtitleOcrRawCue { + cue_id: metadata.cue_id.clone(), + start_time_ms: metadata.start_time_ms, + end_time_ms: metadata.end_time_ms, + cache_key: metadata.cache_key.clone(), + boxes, + text, + confidence, + placement, + placement_source_count, + top_placement_source_count, + }) +} + +fn placement_from_boxes(boxes: &[SubtitleOcrBox], bitmap_height: u32) -> SubtitleOcrPlacement { + if bitmap_height == 0 { + return SubtitleOcrPlacement::Bottom; + } + + let mut min_y = f64::INFINITY; + let mut max_y = f64::NEG_INFINITY; + for ocr_box in boxes { + if !ocr_box.y.is_finite() || !ocr_box.height.is_finite() || ocr_box.height <= 0.0 { + continue; + } + + min_y = min_y.min(ocr_box.y.max(0.0)); + max_y = max_y.max((ocr_box.y + ocr_box.height).max(0.0)); + } + + if !min_y.is_finite() || !max_y.is_finite() || max_y <= min_y { + return SubtitleOcrPlacement::Bottom; + } + + let center_y = min_y + ((max_y - min_y) / 2.0); + if center_y < f64::from(bitmap_height) / 2.0 { + SubtitleOcrPlacement::Top + } else { + SubtitleOcrPlacement::Bottom + } +} + +fn average_confidence(boxes: &[SubtitleOcrBox]) -> f64 { + if boxes.is_empty() { + 0.0 + } else { + boxes.iter().map(|ocr_box| ocr_box.confidence).sum::() / boxes.len() as f64 + } +} + +#[cfg(test)] +mod tests { + use std::cell::Cell; + use std::path::{Path, PathBuf}; + use std::sync::{ + Arc, + atomic::{AtomicBool, Ordering}, + }; + use std::thread; + use std::time::Duration; + + use image::{DynamicImage, Rgba, RgbaImage}; + + use super::{ + BackgroundBitmapCountTask, CachedSubtitleOcrBitmap, build_final_subtitle_ocr_cues, + empty_subtitle_ocr_pipeline_result, initial_bitmap_total, ocr_decoded_bitmap, + placement_from_boxes, should_start_background_count, 
+ }; + use crate::tools::ocr::{create_ocr_engine, resolve_ocr_engine_threads}; + use crate::tools::subtitle_ocr::decode::{ + BitmapSubtitleSource, decode_bitmap_subtitle_source_with_handler_and_stop, + }; + use crate::tools::subtitle_ocr::progress::ProgressTotal; + use crate::tools::subtitle_ocr::{ + SubtitleOcrBox, SubtitleOcrCue, SubtitleOcrDecodedCue, SubtitleOcrPipelineResult, + SubtitleOcrPipelineStats, SubtitleOcrPlacement, SubtitleOcrRawCue, + }; + + #[test] + fn empty_subtitle_ocr_pipeline_result_has_no_artifacts_or_cues() { + let result = empty_subtitle_ocr_pipeline_result(); + + assert!(result.decoded_cues.is_empty()); + assert!(result.raw_ocr_cues.is_empty()); + assert!(result.stabilized_cues.is_empty()); + assert!(result.final_cues.is_empty()); + assert_eq!(result.stats, SubtitleOcrPipelineStats::default()); + } + + #[test] + fn initial_bitmap_total_uses_expected_count_when_available() { + assert_eq!(initial_bitmap_total(Some(373)), ProgressTotal::Known(373)); + assert_eq!(initial_bitmap_total(Some(0)), ProgressTotal::Unknown); + assert_eq!(initial_bitmap_total(None), ProgressTotal::Unknown); + } + + #[test] + fn background_count_starts_only_without_expected_count() { + assert!(!should_start_background_count(Some(373))); + assert!(should_start_background_count(Some(0))); + assert!(should_start_background_count(None)); + } + + #[test] + fn background_count_stop_joins_running_thread() { + let stop = Arc::new(AtomicBool::new(false)); + let thread_stop = Arc::clone(&stop); + let finished = Arc::new(AtomicBool::new(false)); + let thread_finished = Arc::clone(&finished); + let handle = thread::spawn(move || { + while !thread_stop.load(Ordering::Relaxed) { + thread::sleep(Duration::from_millis(1)); + } + thread_finished.store(true, Ordering::Relaxed); + }); + + let mut task = BackgroundBitmapCountTask { + stop, + handle: Some(handle), + }; + task.stop(); + + assert!(finished.load(Ordering::Relaxed)); + assert!(task.handle.is_none()); + } + + fn raw_cue(cue_id: 
&str, text: &str) -> SubtitleOcrRawCue { + raw_cue_at(cue_id, 1_000, 2_500, text) + } + + fn raw_cue_at( + cue_id: &str, + start_time_ms: u64, + end_time_ms: u64, + text: &str, + ) -> SubtitleOcrRawCue { + SubtitleOcrRawCue { + cue_id: cue_id.to_string(), + start_time_ms, + end_time_ms, + cache_key: format!("cache-{cue_id}"), + boxes: Vec::new(), + text: text.to_string(), + confidence: if text.is_empty() { 0.0 } else { 0.8 }, + placement: Some(SubtitleOcrPlacement::Bottom), + placement_source_count: Some(1), + top_placement_source_count: Some(0), + } + } + + fn final_cue(cue_id: &str, text: &str) -> SubtitleOcrCue { + SubtitleOcrCue { + id: cue_id.to_string(), + source_cue_ids: vec![cue_id.to_string()], + start_time_ms: 1_000, + end_time_ms: 2_500, + text: text.to_string(), + confidence: 0.8, + placement: Some(SubtitleOcrPlacement::Bottom), + placement_source_count: Some(1), + top_placement_source_count: Some(0), + } + } + + #[test] + fn placement_from_boxes_detects_top_half() { + let boxes = vec![SubtitleOcrBox { + text: "Top".to_string(), + confidence: 0.9, + x: 100.0, + y: 80.0, + width: 400.0, + height: 60.0, + }]; + + assert_eq!( + placement_from_boxes(&boxes, 1080), + SubtitleOcrPlacement::Top + ); + } + + #[test] + fn placement_from_boxes_defaults_bottom_for_lower_or_missing_boxes() { + let boxes = vec![SubtitleOcrBox { + text: "Bottom".to_string(), + confidence: 0.9, + x: 100.0, + y: 880.0, + width: 400.0, + height: 60.0, + }]; + + assert_eq!( + placement_from_boxes(&boxes, 1080), + SubtitleOcrPlacement::Bottom + ); + assert_eq!( + placement_from_boxes(&[], 1080), + SubtitleOcrPlacement::Bottom + ); + } + + #[test] + fn cached_ocr_bitmap_creates_distinct_raw_cues_for_duplicate_timings() { + let mut original = raw_cue_at("cue-a", 1_000, 2_000, "OK"); + original.cache_key = "cache-a".to_string(); + original.boxes = vec![SubtitleOcrBox { + text: "OK".to_string(), + confidence: 0.95, + x: 100.0, + y: 900.0, + width: 90.0, + height: 40.0, + }]; + let 
duplicate_metadata = SubtitleOcrDecodedCue { + cue_id: "cue-b".to_string(), + start_time_ms: 60_000, + end_time_ms: 61_000, + width: 1920, + height: 1080, + cache_key: "cache-b".to_string(), + preview_path: None, + }; + let cached = CachedSubtitleOcrBitmap::from_raw( + "/tmp/shared-preview.png".to_string(), + &duplicate_metadata, + &original, + ); + + let duplicate = cached.raw_cue_for_metadata(&duplicate_metadata); + + assert_eq!(duplicate.cue_id, "cue-b"); + assert_eq!(duplicate.start_time_ms, 60_000); + assert_eq!(duplicate.end_time_ms, 61_000); + assert_eq!(duplicate.cache_key, "cache-b"); + assert_eq!(duplicate.text, "OK"); + assert_eq!(duplicate.boxes, original.boxes); + assert_eq!(duplicate.placement_source_count, Some(1)); + assert_eq!(duplicate.top_placement_source_count, Some(0)); + } + + #[test] + fn cached_ocr_bitmap_requires_matching_dimensions() { + let raw = raw_cue_at("cue-a", 1_000, 2_000, "OK"); + let original_metadata = SubtitleOcrDecodedCue { + cue_id: "cue-a".to_string(), + start_time_ms: 1_000, + end_time_ms: 2_000, + width: 1920, + height: 1080, + cache_key: "cache-a".to_string(), + preview_path: Some("/tmp/shared-preview.png".to_string()), + }; + let cached = CachedSubtitleOcrBitmap::from_raw( + "/tmp/shared-preview.png".to_string(), + &original_metadata, + &raw, + ); + let same_dimensions = SubtitleOcrDecodedCue { + cue_id: "cue-b".to_string(), + start_time_ms: 60_000, + end_time_ms: 61_000, + width: 1920, + height: 1080, + cache_key: "cache-b".to_string(), + preview_path: None, + }; + let different_dimensions = SubtitleOcrDecodedCue { + width: 960, + height: 540, + ..same_dimensions.clone() + }; + + assert!(cached.matches_dimensions(&same_dimensions)); + assert!(!cached.matches_dimensions(&different_dimensions)); + } + + #[test] + #[ignore] + fn diagnose_required_vobsub_pipeline() { + let Ok(idx_path) = + std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_IDX").map(PathBuf::from) + else { + eprintln!("Set 
MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_IDX to run this diagnostic"); + return; + }; + let Ok(sub_path) = + std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_SUB").map(PathBuf::from) + else { + eprintln!("Set MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_SUB to run this diagnostic"); + return; + }; + let models_dir = std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_MODELS") + .map(PathBuf::from) + .unwrap_or_else(|_| PathBuf::from("ocr-models")); + let output_dir = std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_OUT") + .map(PathBuf::from) + .unwrap_or_else(|_| { + PathBuf::from("target") + .join("subtitle-ocr-diagnostics") + .join("env-vobsub-current") + }); + let limit = std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_LIMIT") + .ok() + .and_then(|value| value.parse::().ok()) + .unwrap_or(0); + let language = std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_LANGUAGE") + .unwrap_or_else(|_| "multi".to_string()); + let use_gpu = std::env::var("MEDIAFLOW_SUBTITLE_OCR_DIAGNOSTIC_GPU") + .map(|value| value != "0") + .unwrap_or(true); + + std::fs::create_dir_all(output_dir.join("decoded")) + .expect("failed to create decoded output directory"); + std::fs::create_dir_all(output_dir.join("ocr-input")) + .expect("failed to create ocr input output directory"); + std::fs::create_dir_all(output_dir.join("overlays")) + .expect("failed to create overlay output directory"); + std::fs::create_dir_all(output_dir.join("line-overlays")) + .expect("failed to create line overlay output directory"); + + let source = BitmapSubtitleSource::VobSub { idx_path, sub_path }; + let engine_threads = resolve_ocr_engine_threads(1); + let engine = create_ocr_engine(&models_dir, &language, use_gpu, engine_threads, true) + .expect("failed to create OCR engine"); + let mut decoded_cues = Vec::new(); + let mut raw_ocr_cues = Vec::new(); + let mut final_candidates = Vec::new(); + let decoded_count = Cell::new(0usize); + + let decode_result = decode_bitmap_subtitle_source_with_handler_and_stop( + &source, + 
"diagnostic-vobsub", + "diagnostic-run", + |decoded| { + decoded_count.set(decoded_count.get().saturating_add(1)); + let sequence = decoded_count.get(); + let decoded_path = output_dir + .join("decoded") + .join(format!("{sequence:04}-decoded.png")); + let ocr_input_path = output_dir + .join("ocr-input") + .join(format!("{sequence:04}-ocr-input.png")); + write_rgba_png( + &decoded_path, + decoded.metadata.width, + decoded.metadata.height, + &decoded.rgba, + ); + let decoded_image = RgbaImage::from_raw( + decoded.metadata.width, + decoded.metadata.height, + decoded.rgba.clone(), + ) + .map(DynamicImage::ImageRgba8) + .expect("decoded RGBA dimensions should match data"); + decoded_image + .save(&ocr_input_path) + .expect("failed to write diagnostic OCR input PNG"); + + let metadata = decoded.metadata.clone(); + let raw_cue = ocr_decoded_bitmap(&engine, decoded, false) + .expect("failed to OCR decoded VobSub bitmap"); + write_detection_overlay( + &output_dir + .join("overlays") + .join(format!("{sequence:04}-boxes.png")), + &ocr_input_path, + &raw_cue.boxes, + Rgba([255, 0, 0, 255]), + ); + write_line_overlay( + &output_dir + .join("line-overlays") + .join(format!("{sequence:04}-lines.png")), + &ocr_input_path, + &raw_cue.boxes, + ); + + if !raw_cue.text.trim().is_empty() { + final_candidates.push(SubtitleOcrCue { + id: raw_cue.cue_id.clone(), + source_cue_ids: vec![raw_cue.cue_id.clone()], + start_time_ms: raw_cue.start_time_ms, + end_time_ms: raw_cue.end_time_ms, + text: raw_cue.text.clone(), + confidence: raw_cue.confidence, + placement: raw_cue.placement, + placement_source_count: raw_cue.placement_source_count, + top_placement_source_count: raw_cue.top_placement_source_count, + }); + } + + decoded_cues.push(metadata); + raw_ocr_cues.push(raw_cue); + Ok(()) + }, + || limit > 0 && decoded_count.get() >= limit, + ); + if limit > 0 { + let error = decode_result.expect_err("diagnostic decode limit should stop the stream"); + assert!( + error.contains("stopped"), + 
"unexpected diagnostic decode error: {error}" + ); + } else { + decode_result.expect("failed to decode and OCR VobSub diagnostic source"); + } + + let stabilized_cues = + crate::tools::subtitle_ocr::stabilize::stabilize_cues(&final_candidates); + let final_cues = build_final_subtitle_ocr_cues(&raw_ocr_cues, stabilized_cues.clone()); + let result = SubtitleOcrPipelineResult { + decoded_cues, + raw_ocr_cues, + stabilized_cues, + final_cues, + stats: SubtitleOcrPipelineStats::default(), + }; + + std::fs::write( + output_dir.join("pipeline-result.json"), + serde_json::to_string_pretty(&result).expect("failed to serialize diagnostic result"), + ) + .expect("failed to write diagnostic JSON"); + std::fs::write( + output_dir.join("final.srt"), + format_diagnostic_srt(&result.final_cues), + ) + .expect("failed to write diagnostic SRT"); + + println!( + "SUBTITLE_OCR_DIAGNOSTIC output={} decoded={} raw={} final={}", + output_dir.display(), + result.decoded_cues.len(), + result.raw_ocr_cues.len(), + result.final_cues.len() + ); + } + + fn write_rgba_png(path: &Path, width: u32, height: u32, rgba: &[u8]) { + let image = RgbaImage::from_raw(width, height, rgba.to_vec()) + .expect("decoded RGBA dimensions should match data"); + image.save(path).expect("failed to write RGBA PNG"); + } + + fn write_detection_overlay( + output_path: &Path, + source_path: &Path, + boxes: &[SubtitleOcrBox], + color: Rgba, + ) { + let mut image = image::open(source_path) + .expect("failed to open source image for overlay") + .to_rgba8(); + for ocr_box in boxes { + draw_rect(&mut image, ocr_box, color); + } + image + .save(output_path) + .expect("failed to write detection overlay"); + } + + fn write_line_overlay(output_path: &Path, source_path: &Path, boxes: &[SubtitleOcrBox]) { + let mut image = image::open(source_path) + .expect("failed to open source image for line overlay") + .to_rgba8(); + for (line_index, line) in diagnostic_lines(boxes).iter().enumerate() { + let color = if line_index % 2 == 0 { 
/// Decides whether a box belongs to an existing diagnostic text line.
///
/// The vertical distance between the two centers must not exceed a tolerance
/// of 45% of the taller of the two heights, but never less than 8 pixels.
/// When neither height is positive, a near-exact match (0.01) is required.
fn diagnostic_is_same_line(
    line_center_y: f64,
    line_height: f64,
    box_center_y: f64,
    box_height: f64,
) -> bool {
    let center_gap = (line_center_y - box_center_y).abs();
    let tallest = line_height.max(box_height);

    if tallest > 0.0 {
        center_gap <= (tallest * 0.45).max(8.0)
    } else {
        center_gap <= 0.01
    }
}
/// Formats a millisecond timestamp as an SRT time code (`HH:MM:SS,mmm`).
///
/// Hours are not wrapped, so values of 100 hours or more simply widen the
/// hour field.
fn format_diagnostic_srt_time(ms: u64) -> String {
    let (total_seconds, millis) = (ms / 1_000, ms % 1_000);
    let (total_minutes, seconds) = (total_seconds / 60, total_seconds % 60);
    let (hours, minutes) = (total_minutes / 60, total_minutes % 60);

    format!("{hours:02}:{minutes:02}:{seconds:02},{millis:03}")
}
build_final_subtitle_ocr_cues_emits_merged_stabilized_cues_once() { + let raw = vec![ + raw_cue_at("cue-1", 1_000, 2_000, "Hello"), + raw_cue_at("cue-2", 2_100, 2_500, "Hello"), + raw_cue_at("cue-3", 2_600, 3_000, ""), + ]; + let mut stabilized = final_cue("cue-1", "Hello"); + stabilized.end_time_ms = 2_500; + stabilized.source_cue_ids = vec!["cue-1".to_string(), "cue-2".to_string()]; + + let final_cues = build_final_subtitle_ocr_cues(&raw, vec![stabilized.clone()]); + + assert_eq!(final_cues.len(), 2); + assert_eq!(final_cues[0], stabilized); + assert_eq!(final_cues[1].id, "cue-3"); + assert!(final_cues[1].text.is_empty()); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/progress.rs b/src-tauri/src/tools/subtitle_ocr/progress.rs new file mode 100644 index 00000000..f8e4e504 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/progress.rs @@ -0,0 +1,255 @@ +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use serde::Serialize; +use tauri::Emitter; + +const PROGRESS_MIN_INTERVAL: Duration = Duration::from_millis(150); +const PROGRESS_MIN_PERCENT_STEP: u32 = 5; + +#[derive(Debug)] +struct ProgressState { + last_percentage: u32, + last_emitted_at: Option, + total: ProgressTotal, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(super) enum ProgressTotal { + Known(u32), + Unknown, +} + +impl From for ProgressTotal { + fn from(value: u32) -> Self { + Self::Known(value) + } +} + +impl ProgressTotal { + fn event_total(self) -> u32 { + match self { + Self::Known(total) => total, + Self::Unknown => 0, + } + } + + fn is_known(self) -> bool { + matches!(self, Self::Known(_)) + } +} + +#[derive(Debug, Clone, Serialize)] +#[serde(rename_all = "camelCase")] +pub(super) struct SubtitleOcrProgressEvent { + item_id: String, + run_id: String, + phase: &'static str, + current: u32, + total: u32, + total_known: bool, + percentage: u32, +} + +impl SubtitleOcrProgressEvent { + pub(super) fn new( + item_id: impl Into, + run_id: impl Into, + phase: &'static 
str, + current: u32, + total: impl Into, + ) -> Self { + let total = total.into(); + Self { + item_id: item_id.into(), + run_id: run_id.into(), + phase, + current, + total: total.event_total(), + total_known: total.is_known(), + percentage: progress_percentage(current, total), + } + } +} + +#[derive(Clone)] +pub(super) struct SubtitleOcrProgressEmitter { + app: tauri::AppHandle, + item_id: String, + run_id: String, + phase: &'static str, + state: Arc>, +} + +impl SubtitleOcrProgressEmitter { + pub(super) fn new( + app: tauri::AppHandle, + item_id: impl Into, + run_id: impl Into, + phase: &'static str, + total: impl Into, + ) -> Self { + Self { + app, + item_id: item_id.into(), + run_id: run_id.into(), + phase, + state: Arc::new(Mutex::new(ProgressState { + last_percentage: 0, + last_emitted_at: None, + total: total.into(), + })), + } + } + + pub(super) fn emit(&self, current: u32) { + self.emit_internal(current, None, false); + } + + pub(super) fn emit_force(&self, current: u32) { + self.emit_internal(current, None, true); + } + + pub(super) fn emit_force_with_total(&self, current: u32, total: u32) { + self.emit_internal(current, Some(total), true); + } + + fn emit_internal(&self, current: u32, total_update: Option, force: bool) { + let mut state = match self.state.lock() { + Ok(state) => state, + Err(_) => return, + }; + let now = Instant::now(); + if let Some(total) = total_update { + state.total = ProgressTotal::Known(total); + } + let percentage = progress_percentage(current, state.total); + + if !should_emit_progress(&state, percentage, now, force) { + return; + } + + let _ = self.app.emit( + "subtitle-ocr-progress", + SubtitleOcrProgressEvent::new( + self.item_id.clone(), + self.run_id.clone(), + self.phase, + current, + state.total, + ), + ); + + state.last_percentage = percentage; + state.last_emitted_at = Some(now); + } +} + +fn should_emit_progress(state: &ProgressState, percentage: u32, now: Instant, force: bool) -> bool { + if force { + return true; + } 
+ + let Some(last_emitted_at) = state.last_emitted_at else { + return true; + }; + + if percentage.saturating_sub(state.last_percentage) >= PROGRESS_MIN_PERCENT_STEP { + return true; + } + + now.duration_since(last_emitted_at) >= PROGRESS_MIN_INTERVAL +} + +fn progress_percentage(current: u32, total: ProgressTotal) -> u32 { + match total { + ProgressTotal::Known(0) => 100, + ProgressTotal::Known(total) => { + ((u64::from(current.min(total)) * 100) / u64::from(total)) as u32 + } + ProgressTotal::Unknown => 0, + } +} + +#[cfg(test)] +mod tests { + use std::time::{Duration, Instant}; + + use super::{ + ProgressState, ProgressTotal, SubtitleOcrProgressEvent, progress_percentage, + should_emit_progress, + }; + + #[test] + fn progress_event_serializes_percentage() { + let event = SubtitleOcrProgressEvent::new("item-1", "run-1", "ocr", 5, 10); + let value = serde_json::to_value(event).expect("event should serialize"); + + assert_eq!(value["itemId"], "item-1"); + assert_eq!(value["runId"], "run-1"); + assert_eq!(value["phase"], "ocr"); + assert_eq!(value["current"], 5); + assert_eq!(value["total"], 10); + assert_eq!(value["totalKnown"], true); + assert_eq!(value["percentage"], 50); + assert!(value.get("message").is_none()); + } + + #[test] + fn progress_percentage_reports_partial_bitmap_ocr_progress() { + assert_eq!(progress_percentage(27, ProgressTotal::Known(373)), 7); + } + + #[test] + fn progress_percentage_treats_empty_work_as_complete() { + assert_eq!(progress_percentage(0, ProgressTotal::Known(0)), 100); + } + + #[test] + fn progress_event_serializes_unknown_total_without_progress() { + let event = + SubtitleOcrProgressEvent::new("item-1", "run-1", "ocr", 27, ProgressTotal::Unknown); + let value = serde_json::to_value(event).expect("event should serialize"); + + assert_eq!(value["total"], 0); + assert_eq!(value["totalKnown"], false); + assert_eq!(value["percentage"], 0); + } + + #[test] + fn progress_throttle_skips_small_updates_inside_interval() { + let now = 
Instant::now(); + let mut state = ProgressState { + last_percentage: 10, + last_emitted_at: Some(now), + total: ProgressTotal::Known(100), + }; + + assert!(!should_emit_progress( + &state, + 11, + now + Duration::from_millis(25), + false + )); + + state.last_percentage = 10; + assert!(should_emit_progress( + &state, + 15, + now + Duration::from_millis(25), + false + )); + assert!(should_emit_progress( + &state, + 11, + now + Duration::from_millis(250), + false + )); + assert!(should_emit_progress( + &state, + 11, + now + Duration::from_millis(25), + true + )); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/restore.rs b/src-tauri/src/tools/subtitle_ocr/restore.rs new file mode 100644 index 00000000..64f61ca2 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/restore.rs @@ -0,0 +1,536 @@ +use std::{ + cell::Cell, + path::{Component, Path, PathBuf}, +}; + +use crate::shared::sleep_inhibit::SleepInhibitGuard; +use crate::tools::subtitle_ocr::SubtitleOcrDecodedCue; +use crate::tools::subtitle_ocr::assets::write_decoded_bitmap_assets; +use crate::tools::subtitle_ocr::decode::{ + DecodedBitmapCue, decode_bitmap_subtitle_source_with_handler_and_stop, + validate_bitmap_subtitle_source, +}; +use crate::tools::subtitle_ocr::progress::SubtitleOcrProgressEmitter; +use tauri::Emitter; + +const BITMAP_DECODE_STOPPED_ERROR: &str = "Subtitle OCR bitmap decode stopped"; + +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct SubtitleOcrRestoreBitmap { + pub(crate) cue_id: String, + pub(crate) start_time_ms: u64, + pub(crate) end_time_ms: u64, + pub(crate) width: u32, + pub(crate) height: u32, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) cache_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(crate) preview_path: Option, +} + +#[derive(Debug, Clone, serde::Serialize)] +#[serde(rename_all = "camelCase")] +struct SubtitleOcrRestoredBitmapEvent { + item_id: 
String, + run_id: String, + bitmap: SubtitleOcrRestoreBitmap, +} + +#[tauri::command] +pub(crate) async fn restore_subtitle_ocr_bitmap_assets( + app: tauri::AppHandle, + item_id: String, + run_id: String, + source_path: String, + idx_path: Option, + sub_path: Option, + bitmaps: Vec, +) -> Result, String> { + if item_id.trim().is_empty() { + return Err("Subtitle OCR item id is required".to_string()); + } + if run_id.trim().is_empty() { + return Err("Subtitle OCR run id is required".to_string()); + } + + let _sleep_guard = SleepInhibitGuard::try_acquire("Restoring Subtitle OCR previews").ok(); + let source = + validate_bitmap_subtitle_source(&source_path, idx_path.as_deref(), sub_path.as_deref())?; + super::state::begin_operation(&item_id, &run_id)?; + + let item_id_for_task = item_id.clone(); + let run_id_for_task = run_id.clone(); + let join_result = tokio::task::spawn_blocking(move || { + restore_subtitle_ocr_bitmap_assets_blocking( + app, + &item_id_for_task, + &run_id_for_task, + &source, + bitmaps, + ) + }) + .await; + + let _ = super::state::clear_registered_operation(&item_id, &run_id); + let result = join_result.map_err(|e| format!("Subtitle OCR restore task failed: {}", e))?; + if result.is_ok() { + let _ = super::state::clear_cancelled(&item_id, &run_id); + } + + result +} + +#[tauri::command] +pub(crate) async fn collect_missing_subtitle_ocr_bitmap_assets( + bitmaps: Vec, +) -> Result, String> { + tokio::task::spawn_blocking(move || collect_missing_bitmap_assets(bitmaps)) + .await + .map_err(|e| format!("Subtitle OCR bitmap asset scan failed: {}", e)) +} + +fn collect_missing_bitmap_assets( + bitmaps: Vec, +) -> Vec { + let mut missing = Vec::new(); + let mut seen_keys = std::collections::HashSet::new(); + + for bitmap in bitmaps { + if bitmap_asset_is_missing(&bitmap) { + let key = bitmap_restore_key(&bitmap); + if !seen_keys.insert(key) { + continue; + } + missing.push(bitmap); + } + } + + missing +} + +fn bitmap_restore_key(bitmap: 
&SubtitleOcrRestoreBitmap) -> String { + if let Some(cache_key) = bitmap + .cache_key + .as_deref() + .filter(|value| !value.is_empty()) + { + return format!("cache:{cache_key}"); + } + + if !bitmap.cue_id.is_empty() { + return format!("cue:{}", bitmap.cue_id); + } + + format!( + "time:{}:{}:{}:{}", + bitmap.start_time_ms, bitmap.end_time_ms, bitmap.width, bitmap.height + ) +} + +fn bitmap_asset_is_missing(bitmap: &SubtitleOcrRestoreBitmap) -> bool { + let Some(preview_path) = bitmap.preview_path.as_deref() else { + return true; + }; + + !subtitle_ocr_asset_path_exists(preview_path) +} + +fn subtitle_ocr_asset_path_exists(path: &str) -> bool { + let path = Path::new(path); + if !path_is_in_subtitle_ocr_temp_root(path) { + return false; + } + + path.exists() +} + +fn path_is_in_subtitle_ocr_temp_root(path: &Path) -> bool { + let root = subtitle_ocr_temp_asset_root(); + path.is_absolute() + && path.starts_with(&root) + && path + .components() + .all(|component| !matches!(component, Component::ParentDir | Component::CurDir)) +} + +fn subtitle_ocr_temp_asset_root() -> PathBuf { + std::env::temp_dir().join("MediaFlow").join("subtitle-ocr") +} + +fn restore_subtitle_ocr_bitmap_assets_blocking( + app: tauri::AppHandle, + item_id: &str, + run_id: &str, + source: &super::decode::BitmapSubtitleSource, + bitmaps: Vec, +) -> Result, String> { + ensure_not_cancelled(item_id, run_id)?; + let total = u32::try_from(bitmaps.len()).unwrap_or(u32::MAX); + let progress = SubtitleOcrProgressEmitter::new( + app.clone(), + item_id.to_string(), + run_id.to_string(), + "decoding", + total, + ); + progress.emit_force(0); + + if bitmaps.is_empty() { + return Ok(Vec::new()); + } + + let mut matcher = RestoreBitmapMatcher::new(bitmaps); + let restore_complete = Cell::new(false); + let mut restored = Vec::new(); + let mut restored_count = 0u32; + + let decode_result = decode_bitmap_subtitle_source_with_handler_and_stop( + source, + item_id, + run_id, + |decoded| { + 
ensure_not_cancelled(item_id, run_id)?; + if let Some(target) = matcher.take_match(&decoded.metadata) { + let restored_bitmap = restore_bitmap_paths(item_id, run_id, target, &decoded)?; + emit_restored_bitmap_event(&app, item_id, run_id, &restored_bitmap); + restored.push(restored_bitmap); + restored_count = restored_count.saturating_add(1); + progress.emit(restored_count); + restore_complete.set(matcher.is_complete()); + } + Ok(()) + }, + || restore_complete.get(), + ); + normalize_restore_decode_result(decode_result, restore_complete.get())?; + + progress.emit_force_with_total(restored_count, total); + Ok(restored) +} + +fn normalize_restore_decode_result( + result: Result<(), String>, + restore_complete: bool, +) -> Result<(), String> { + match result { + Err(error) if restore_complete && error == BITMAP_DECODE_STOPPED_ERROR => Ok(()), + other => other, + } +} + +fn emit_restored_bitmap_event( + app: &tauri::AppHandle, + item_id: &str, + run_id: &str, + bitmap: &SubtitleOcrRestoreBitmap, +) { + let _ = app.emit( + "subtitle-ocr-restored-bitmap", + SubtitleOcrRestoredBitmapEvent { + item_id: item_id.to_string(), + run_id: run_id.to_string(), + bitmap: bitmap.clone(), + }, + ); +} + +fn restore_bitmap_paths( + item_id: &str, + run_id: &str, + mut target: SubtitleOcrRestoreBitmap, + decoded: &DecodedBitmapCue, +) -> Result { + let assets = write_decoded_bitmap_assets(item_id, run_id, &decoded.metadata, &decoded.rgba)?; + target.preview_path = Some(assets.preview_path); + if target.cache_key.is_none() { + target.cache_key = Some(decoded.metadata.cache_key.clone()); + } + Ok(target) +} + +fn ensure_not_cancelled(item_id: &str, run_id: &str) -> Result<(), String> { + if super::state::is_operation_cancelled(item_id, run_id) { + Err("Subtitle OCR operation cancelled".to_string()) + } else { + Ok(()) + } +} + +struct RestoreBitmapMatcher { + targets: Vec, + matched: Vec, +} + +impl RestoreBitmapMatcher { + fn new(targets: Vec) -> Self { + let matched = vec![false; 
targets.len()]; + Self { targets, matched } + } + + fn take_match(&mut self, decoded: &SubtitleOcrDecodedCue) -> Option { + let index = self + .find_by_cache_key(decoded) + .or_else(|| self.find_by_cue_id(decoded)) + .or_else(|| self.find_by_timing_and_dimensions(decoded))?; + self.matched[index] = true; + Some(self.targets[index].clone()) + } + + fn is_complete(&self) -> bool { + self.matched.iter().all(|matched| *matched) + } + + fn find_by_cache_key(&self, decoded: &SubtitleOcrDecodedCue) -> Option { + self.targets.iter().enumerate().position(|(index, target)| { + !self.matched[index] + && target + .cache_key + .as_deref() + .is_some_and(|cache_key| cache_key == decoded.cache_key) + }) + } + + fn find_by_cue_id(&self, decoded: &SubtitleOcrDecodedCue) -> Option { + self.targets + .iter() + .enumerate() + .position(|(index, target)| !self.matched[index] && target.cue_id == decoded.cue_id) + } + + fn find_by_timing_and_dimensions(&self, decoded: &SubtitleOcrDecodedCue) -> Option { + self.targets.iter().enumerate().position(|(index, target)| { + !self.matched[index] + && target.start_time_ms == decoded.start_time_ms + && target.end_time_ms == decoded.end_time_ms + && target.width == decoded.width + && target.height == decoded.height + }) + } +} + +#[cfg(test)] +mod tests { + use super::{ + BITMAP_DECODE_STOPPED_ERROR, RestoreBitmapMatcher, SubtitleOcrRestoreBitmap, + SubtitleOcrRestoredBitmapEvent, collect_missing_bitmap_assets, + normalize_restore_decode_result, restore_bitmap_paths, subtitle_ocr_temp_asset_root, + }; + use crate::tools::subtitle_ocr::SubtitleOcrDecodedCue; + use crate::tools::subtitle_ocr::decode::{DecodedBitmapCue, bitmap_content_hash}; + + fn bitmap( + cue_id: &str, + cache_key: Option<&str>, + start_time_ms: u64, + ) -> SubtitleOcrRestoreBitmap { + SubtitleOcrRestoreBitmap { + cue_id: cue_id.to_string(), + start_time_ms, + end_time_ms: start_time_ms + 1_000, + width: 720, + height: 360, + cache_key: cache_key.map(ToOwned::to_owned), + 
preview_path: Some(format!("/tmp/{cue_id}-old-preview.png")), + } + } + + fn decoded(cue_id: &str, cache_key: &str, start_time_ms: u64) -> SubtitleOcrDecodedCue { + SubtitleOcrDecodedCue { + cue_id: cue_id.to_string(), + start_time_ms, + end_time_ms: start_time_ms + 1_000, + width: 720, + height: 360, + cache_key: cache_key.to_string(), + preview_path: None, + } + } + + #[test] + fn restore_matcher_prefers_cache_key_before_cue_id() { + let mut matcher = RestoreBitmapMatcher::new(vec![ + bitmap("same-cue", Some("cache-miss"), 1_000), + bitmap("other-cue", Some("cache-hit"), 2_000), + ]); + + let matched = matcher + .take_match(&decoded("same-cue", "cache-hit", 1_000)) + .expect("cache key match should win"); + + assert_eq!(matched.cue_id, "other-cue"); + } + + #[test] + fn restore_matcher_uses_cue_id_before_timing() { + let mut matcher = RestoreBitmapMatcher::new(vec![ + bitmap("timing-match", None, 1_000), + bitmap("cue-match", None, 2_000), + ]); + + let matched = matcher + .take_match(&decoded("cue-match", "cache-new", 1_000)) + .expect("cue id match should win"); + + assert_eq!(matched.cue_id, "cue-match"); + } + + #[test] + fn restore_matcher_falls_back_to_timing_and_dimensions() { + let mut matcher = RestoreBitmapMatcher::new(vec![bitmap("target", None, 1_000)]); + + let matched = matcher + .take_match(&decoded("decoded", "cache-new", 1_000)) + .expect("timing and dimensions should match"); + + assert_eq!(matched.cue_id, "target"); + } + + #[test] + fn restored_bitmap_event_serializes_frontend_contract() { + let event = SubtitleOcrRestoredBitmapEvent { + item_id: "item-1".to_string(), + run_id: "run-1".to_string(), + bitmap: bitmap("cue-1", Some("cache-1"), 1_000), + }; + + let value = serde_json::to_value(event).expect("event should serialize"); + + assert_eq!(value["itemId"], "item-1"); + assert_eq!(value["runId"], "run-1"); + assert_eq!(value["bitmap"]["cueId"], "cue-1"); + assert_eq!(value["bitmap"]["cacheKey"], "cache-1"); + } + + #[test] + fn 
collect_missing_bitmap_assets_deduplicates_and_checks_paths() { + let asset_root = subtitle_ocr_temp_asset_root(); + std::fs::create_dir_all(&asset_root).expect("asset root should be created"); + let temp_dir = tempfile::Builder::new() + .prefix("restore-test") + .tempdir_in(&asset_root) + .expect("temp dir should be created"); + let preview_path = temp_dir.path().join("existing-preview.png"); + std::fs::write(&preview_path, b"preview").expect("preview should be written"); + + let mut existing = bitmap("existing", Some("existing-cache"), 1_000); + existing.preview_path = Some(preview_path.to_string_lossy().to_string()); + + let mut missing = bitmap("missing", Some("missing-cache"), 2_000); + missing.preview_path = Some( + temp_dir + .path() + .join("missing-preview.png") + .to_string_lossy() + .to_string(), + ); + + let found = collect_missing_bitmap_assets(vec![ + existing, + missing.clone(), + SubtitleOcrRestoreBitmap { + cue_id: "duplicate".to_string(), + ..missing.clone() + }, + ]); + + assert_eq!(found, vec![missing]); + } + + #[test] + fn collect_missing_bitmap_assets_detects_later_missing_duplicate() { + let asset_root = subtitle_ocr_temp_asset_root(); + std::fs::create_dir_all(&asset_root).expect("asset root should be created"); + let temp_dir = tempfile::Builder::new() + .prefix("restore-duplicate-test") + .tempdir_in(&asset_root) + .expect("temp dir should be created"); + let preview_path = temp_dir.path().join("existing-preview.png"); + std::fs::write(&preview_path, b"preview").expect("preview should be written"); + + let mut existing = bitmap("existing", Some("shared-cache"), 1_000); + existing.preview_path = Some(preview_path.to_string_lossy().to_string()); + + let mut missing = bitmap("missing", Some("shared-cache"), 2_000); + missing.preview_path = Some( + temp_dir + .path() + .join("missing-preview.png") + .to_string_lossy() + .to_string(), + ); + + let found = collect_missing_bitmap_assets(vec![existing, missing.clone()]); + + assert_eq!(found, 
vec![missing]); + } + + #[test] + fn collect_missing_bitmap_assets_treats_out_of_scope_paths_as_missing_without_probing() { + let temp_dir = tempfile::tempdir().expect("temp dir should be created"); + let preview_path = temp_dir.path().join("existing-preview.png"); + std::fs::write(&preview_path, b"preview").expect("preview should be written"); + + let mut target = bitmap("target", Some("target-cache"), 1_000); + target.preview_path = Some(preview_path.to_string_lossy().to_string()); + + assert_eq!( + collect_missing_bitmap_assets(vec![target.clone()]), + vec![target] + ); + } + + #[test] + fn collect_missing_bitmap_assets_treats_absent_paths_as_missing() { + let target = SubtitleOcrRestoreBitmap { + preview_path: None, + ..bitmap("target", Some("target-cache"), 1_000) + }; + + assert_eq!( + collect_missing_bitmap_assets(vec![target.clone()]), + vec![target] + ); + } + + #[test] + fn restore_bitmap_paths_returns_requested_bitmap_with_new_paths() { + let target = bitmap("target", None, 1_000); + let metadata = decoded("decoded", "cache-new", 1_000); + let rgba = vec![255; (metadata.width * metadata.height * 4) as usize]; + let decoded = DecodedBitmapCue { + content_hash: bitmap_content_hash(&rgba), + rgba, + metadata, + }; + + let restored = restore_bitmap_paths("item", "restore-run", target, &decoded) + .expect("bitmap paths should restore"); + + assert_eq!(restored.cue_id, "target"); + assert_eq!(restored.cache_key.as_deref(), Some("cache-new")); + assert!(restored.preview_path.as_deref().is_some_and(|path| { + path.contains("MediaFlow") && std::path::Path::new(path).is_file() + })); + + if let Some(path) = restored.preview_path { + let _ = std::fs::remove_file(path); + } + } + + #[test] + fn completed_restore_stop_error_is_success() { + let result = + normalize_restore_decode_result(Err(BITMAP_DECODE_STOPPED_ERROR.to_string()), true); + + assert_eq!(result, Ok(())); + } + + #[test] + fn incomplete_restore_stop_error_stays_error() { + let result = + 
normalize_restore_decode_result(Err(BITMAP_DECODE_STOPPED_ERROR.to_string()), false); + + assert_eq!(result, Err(BITMAP_DECODE_STOPPED_ERROR.to_string())); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/stabilize.rs b/src-tauri/src/tools/subtitle_ocr/stabilize.rs new file mode 100644 index 00000000..58502b03 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/stabilize.rs @@ -0,0 +1,289 @@ +#![allow(dead_code)] + +use std::borrow::Cow; + +use crate::tools::subtitle_ocr::{SubtitleOcrCue, SubtitleOcrPlacement}; + +pub(crate) fn stabilize_cues(cues: &[SubtitleOcrCue]) -> Vec { + let mut stabilized: Vec = Vec::new(); + let mut placement_states: Vec = Vec::new(); + + for cue in cues { + if cue.text.trim().is_empty() { + continue; + } + + if let Some(previous_index) = stabilized.len().checked_sub(1) { + let previous = &stabilized[previous_index]; + let is_adjacent = cue.start_time_ms <= previous.end_time_ms.saturating_add(250); + + if normalize_line_endings(&previous.text) == normalize_line_endings(&cue.text) + && is_adjacent + { + let previous = &mut stabilized[previous_index]; + let placement_state = &mut placement_states[previous_index]; + placement_state.add(cue); + previous.end_time_ms = previous.end_time_ms.max(cue.end_time_ms); + previous.confidence = previous.confidence.max(cue.confidence); + placement_state.apply_to_cue(previous); + previous + .source_cue_ids + .extend(cue.source_cue_ids.iter().cloned()); + continue; + } + } + + stabilized.push(cue.clone()); + placement_states.push(PlacementMergeState::from_cue(cue)); + } + + stabilized +} + +#[derive(Debug, Clone, Copy, Default)] +struct PlacementMergeState { + placed_weight: usize, + top_weight: usize, +} + +impl PlacementMergeState { + fn from_cue(cue: &SubtitleOcrCue) -> Self { + let mut state = Self::default(); + state.add(cue); + state + } + + fn add(&mut self, cue: &SubtitleOcrCue) { + if let Some(placed_weight) = cue.placement_source_count.filter(|count| *count > 0) { + self.placed_weight = 
self.placed_weight.saturating_add(placed_weight as usize); + self.top_weight = self.top_weight.saturating_add( + cue.top_placement_source_count + .unwrap_or(0) + .min(placed_weight) as usize, + ); + return; + } + + let weight = cue.source_cue_ids.len().max(1); + match cue.placement { + Some(SubtitleOcrPlacement::Top) => { + self.placed_weight = self.placed_weight.saturating_add(weight); + self.top_weight = self.top_weight.saturating_add(weight); + } + Some(SubtitleOcrPlacement::Bottom) => { + self.placed_weight = self.placed_weight.saturating_add(weight); + } + None => {} + } + } + + fn apply_to_cue(&self, cue: &mut SubtitleOcrCue) { + cue.placement = self.placement(); + if self.placed_weight == 0 { + cue.placement_source_count = None; + cue.top_placement_source_count = None; + return; + } + + cue.placement_source_count = Some(saturating_u32(self.placed_weight)); + cue.top_placement_source_count = Some(saturating_u32(self.top_weight)); + } + + fn placement(&self) -> Option { + if self.placed_weight == 0 { + return None; + } + + if self.top_weight > self.placed_weight / 2 { + Some(SubtitleOcrPlacement::Top) + } else { + Some(SubtitleOcrPlacement::Bottom) + } + } +} + +fn saturating_u32(value: usize) -> u32 { + u32::try_from(value).unwrap_or(u32::MAX) +} + +fn normalize_line_endings(text: &str) -> Cow<'_, str> { + if text.contains('\r') { + Cow::Owned(text.replace("\r\n", "\n").replace('\r', "\n")) + } else { + Cow::Borrowed(text) + } +} + +#[cfg(test)] +mod tests { + use super::stabilize_cues; + use crate::tools::subtitle_ocr::{SubtitleOcrCue, SubtitleOcrPlacement}; + + fn cue(id: &str, start_time_ms: u64, end_time_ms: u64, text: &str) -> SubtitleOcrCue { + SubtitleOcrCue { + id: id.to_string(), + source_cue_ids: vec![id.to_string()], + start_time_ms, + end_time_ms, + text: text.to_string(), + confidence: 0.8, + placement: Some(SubtitleOcrPlacement::Bottom), + placement_source_count: Some(1), + top_placement_source_count: Some(0), + } + } + + fn placed_cue( + id: 
&str, + start_time_ms: u64, + end_time_ms: u64, + text: &str, + placement: SubtitleOcrPlacement, + ) -> SubtitleOcrCue { + let top_placement_source_count = match placement { + SubtitleOcrPlacement::Top => 1, + SubtitleOcrPlacement::Bottom => 0, + }; + + SubtitleOcrCue { + placement: Some(placement), + top_placement_source_count: Some(top_placement_source_count), + ..cue(id, start_time_ms, end_time_ms, text) + } + } + + #[test] + fn adjacent_identical_text_merges() { + let cues = vec![ + cue("a", 0, 1_000, "Hello world"), + cue("b", 1_200, 2_000, "Hello world"), + ]; + + let stabilized = stabilize_cues(&cues); + + assert_eq!(stabilized.len(), 1); + assert_eq!(stabilized[0].end_time_ms, 2_000); + assert_eq!(stabilized[0].source_cue_ids, vec!["a", "b"]); + } + + #[test] + fn non_adjacent_identical_text_does_not_merge() { + let cues = vec![cue("a", 0, 1_000, "OK"), cue("b", 60_000, 61_000, "OK")]; + + let stabilized = stabilize_cues(&cues); + + assert_eq!(stabilized.len(), 2); + assert_eq!(stabilized[0].source_cue_ids, vec!["a"]); + assert_eq!(stabilized[1].source_cue_ids, vec!["b"]); + } + + #[test] + fn case_and_whitespace_differences_do_not_merge() { + let cues = vec![ + cue("a", 0, 1_000, "Hello world"), + cue("b", 1_100, 2_000, "Hello world"), + cue("c", 2_100, 3_000, "hello world"), + ]; + + let stabilized = stabilize_cues(&cues); + + assert_eq!(stabilized.len(), 3); + } + + #[test] + fn different_text_does_not_merge() { + let cues = vec![cue("a", 0, 1_000, "Hello"), cue("b", 1_100, 2_000, "World")]; + + let stabilized = stabilize_cues(&cues); + + assert_eq!(stabilized.len(), 2); + } + + #[test] + fn empty_text_cues_are_dropped() { + let cues = vec![ + cue("a", 0, 1_000, " \n\t "), + cue("b", 1_100, 2_000, "World"), + ]; + + let stabilized = stabilize_cues(&cues); + + assert_eq!(stabilized, vec![cue("b", 1_100, 2_000, "World")]); + } + + #[test] + fn merged_cue_keeps_top_only_for_majority_top_sources() { + let mut previous_top = placed_cue("a", 0, 1_000, "Hello", 
SubtitleOcrPlacement::Top); + previous_top.source_cue_ids = vec!["a1".to_string(), "a2".to_string()]; + previous_top.placement_source_count = Some(2); + previous_top.top_placement_source_count = Some(2); + let bottom = placed_cue("b", 1_100, 2_000, "Hello", SubtitleOcrPlacement::Bottom); + + let stabilized = stabilize_cues(&[previous_top, bottom]); + + assert_eq!(stabilized.len(), 1); + assert_eq!(stabilized[0].placement, Some(SubtitleOcrPlacement::Top)); + } + + #[test] + fn merged_cue_defaults_bottom_when_top_is_not_majority() { + let top = placed_cue("a", 0, 1_000, "Hello", SubtitleOcrPlacement::Top); + let bottom = placed_cue("b", 1_100, 2_000, "Hello", SubtitleOcrPlacement::Bottom); + + let stabilized = stabilize_cues(&[top, bottom]); + + assert_eq!(stabilized.len(), 1); + assert_eq!(stabilized[0].placement, Some(SubtitleOcrPlacement::Bottom)); + } + + #[test] + fn merged_cue_tracks_top_majority_across_three_sources() { + let cues = vec![ + placed_cue("a", 0, 1_000, "Hello", SubtitleOcrPlacement::Top), + placed_cue("b", 1_100, 2_000, "Hello", SubtitleOcrPlacement::Bottom), + placed_cue("c", 2_100, 3_000, "Hello", SubtitleOcrPlacement::Top), + ]; + + let stabilized = stabilize_cues(&cues); + + assert_eq!(stabilized.len(), 1); + assert_eq!(stabilized[0].placement, Some(SubtitleOcrPlacement::Top)); + assert_eq!(stabilized[0].placement_source_count, Some(3)); + assert_eq!(stabilized[0].top_placement_source_count, Some(2)); + } + + #[test] + fn merged_cue_preserves_missing_placement_when_sources_have_none() { + let mut first = cue("a", 0, 1_000, "Hello"); + first.placement = None; + first.placement_source_count = None; + first.top_placement_source_count = None; + let mut second = cue("b", 1_100, 2_000, "Hello"); + second.placement = None; + second.placement_source_count = None; + second.top_placement_source_count = None; + + let stabilized = stabilize_cues(&[first, second]); + + assert_eq!(stabilized.len(), 1); + assert_eq!(stabilized[0].placement, None); + 
assert_eq!(stabilized[0].placement_source_count, None); + assert_eq!(stabilized[0].top_placement_source_count, None); + } + + #[test] + fn merged_cue_uses_existing_placement_source_counts() { + let mut mixed = placed_cue("a", 0, 2_000, "Hello", SubtitleOcrPlacement::Bottom); + mixed.source_cue_ids = vec!["a1".to_string(), "a2".to_string()]; + mixed.placement_source_count = Some(2); + mixed.top_placement_source_count = Some(1); + let top = placed_cue("b", 2_100, 3_000, "Hello", SubtitleOcrPlacement::Top); + + let stabilized = stabilize_cues(&[mixed, top]); + + assert_eq!(stabilized.len(), 1); + assert_eq!(stabilized[0].placement, Some(SubtitleOcrPlacement::Top)); + assert_eq!(stabilized[0].placement_source_count, Some(3)); + assert_eq!(stabilized[0].top_placement_source_count, Some(2)); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/state.rs b/src-tauri/src/tools/subtitle_ocr/state.rs new file mode 100644 index 00000000..df7a99a6 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/state.rs @@ -0,0 +1,444 @@ +use std::collections::{HashMap, HashSet}; +use std::mem; +use std::sync::{LazyLock, Mutex, MutexGuard}; + +#[derive(Debug, Default)] +struct SubtitleOcrState { + operations: HashMap, + cancelled_runs: HashSet, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct OperationKey { + item_id: String, + run_id: String, +} + +impl OperationKey { + fn new(item_id: &str, run_id: &str) -> Self { + Self { + item_id: item_id.to_string(), + run_id: run_id.to_string(), + } + } +} + +#[derive(Debug)] +struct OperationRecord { + run_id: String, + pid: Option, + output_paths: Vec, +} + +impl OperationRecord { + fn new(run_id: &str) -> Self { + Self { + run_id: run_id.to_string(), + pid: None, + output_paths: Vec::new(), + } + } +} + +static SUBTITLE_OCR_STATE: LazyLock> = + LazyLock::new(|| Mutex::new(SubtitleOcrState::default())); + +pub(super) fn begin_operation(item_id: &str, run_id: &str) -> Result<(), String> { + let mut state = lock_state()?; + if 
state.operations.contains_key(item_id) { + return Err(format!( + "Subtitle OCR operation already active for item: {}", + item_id + )); + } + + state + .cancelled_runs + .retain(|key| key.item_id != item_id || key.run_id == run_id); + let key = OperationKey::new(item_id, run_id); + if state.cancelled_runs.remove(&key) { + return Err("Subtitle OCR operation cancelled".to_string()); + } + + state + .operations + .insert(item_id.to_string(), OperationRecord::new(run_id)); + Ok(()) +} + +pub(super) fn register_operation_pid( + item_id: &str, + run_id: &str, + pid: u32, +) -> Result { + let mut state = lock_state()?; + let is_cancelled = state + .cancelled_runs + .contains(&OperationKey::new(item_id, run_id)); + let operation = active_operation_mut(&mut state, item_id, run_id)?; + operation.pid = Some(pid); + Ok(is_cancelled) +} + +pub(super) fn register_output_paths( + item_id: &str, + run_id: &str, + paths: Vec, +) -> Result { + let mut state = lock_state()?; + let is_cancelled = state + .cancelled_runs + .contains(&OperationKey::new(item_id, run_id)); + let operation = active_operation_mut(&mut state, item_id, run_id)?; + operation.output_paths = paths; + Ok(is_cancelled) +} + +pub(super) fn take_operation_pid(item_id: &str, run_id: &str) -> Result, String> { + Ok(lock_state()? + .operations + .get_mut(item_id) + .filter(|operation| operation.run_id == run_id) + .and_then(|operation| operation.pid.take())) +} + +pub(super) fn take_output_paths(item_id: &str, run_id: &str) -> Result, String> { + Ok(lock_state()? 
+ .operations + .get_mut(item_id) + .filter(|operation| operation.run_id == run_id) + .map(|operation| mem::take(&mut operation.output_paths)) + .unwrap_or_default()) +} + +pub(super) fn clear_registered_operation(item_id: &str, run_id: &str) -> Result<(), String> { + let mut state = lock_state()?; + if state + .operations + .get(item_id) + .is_some_and(|operation| operation.run_id == run_id) + { + state.operations.remove(item_id); + state + .cancelled_runs + .remove(&OperationKey::new(item_id, run_id)); + } + Ok(()) +} + +pub(super) fn mark_cancelled(item_id: &str, run_id: &str) -> Result, String> { + let mut state = lock_state()?; + let key = OperationKey::new(item_id, run_id); + + if let Some(operation) = state.operations.get_mut(item_id) { + if operation.run_id != run_id { + return Ok(None); + } + + let pid = operation.pid.take(); + state.cancelled_runs.insert(key); + return Ok(pid); + } + + state.cancelled_runs.insert(key); + Ok(None) +} + +pub(super) fn clear_cancelled(item_id: &str, run_id: &str) -> Result<(), String> { + lock_state()? 
+ .cancelled_runs + .remove(&OperationKey::new(item_id, run_id)); + Ok(()) +} + +pub(super) fn is_operation_cancelled(item_id: &str, run_id: &str) -> bool { + SUBTITLE_OCR_STATE + .lock() + .map(|state| { + state + .cancelled_runs + .contains(&OperationKey::new(item_id, run_id)) + }) + .unwrap_or(true) +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(super) fn has_registered_operation(item_id: &str) -> bool { + SUBTITLE_OCR_STATE + .lock() + .map(|state| state.operations.contains_key(item_id)) + .unwrap_or(false) +} + +fn active_operation_mut<'state>( + state: &'state mut SubtitleOcrState, + item_id: &str, + run_id: &str, +) -> Result<&'state mut OperationRecord, String> { + let operation = state + .operations + .get_mut(item_id) + .ok_or_else(|| format!("No active Subtitle OCR operation for item: {}", item_id))?; + + if operation.run_id != run_id { + return Err(format!( + "No active Subtitle OCR operation for item/run: {}/{}", + item_id, run_id + )); + } + + Ok(operation) +} + +fn lock_state() -> Result, String> { + SUBTITLE_OCR_STATE + .lock() + .map_err(|_| "Failed to acquire Subtitle OCR state lock".to_string()) +} + +#[cfg(test)] +mod tests { + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::sync::{Arc, Barrier}; + use std::thread; + + use serial_test::serial; + + #[test] + #[serial] + fn begin_operation_rejects_duplicate_active_operation() { + let item_id = "subtitle-ocr-duplicate-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + super::begin_operation(item_id, run_id).expect("first operation should start"); + let error = super::begin_operation(item_id, "run-2") + .expect_err("second active operation for the same item should fail"); + + assert!(error.contains("already active")); + + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + } + + #[test] + #[serial] + fn 
begin_operation_waits_for_cancelled_operation_to_clear_before_reuse() { + let item_id = "subtitle-ocr-cancel-then-new-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + super::begin_operation(item_id, run_id).expect("operation should start"); + super::mark_cancelled(item_id, run_id).expect("operation should cancel"); + assert!(super::is_operation_cancelled(item_id, run_id)); + + let error = super::begin_operation(item_id, "run-2") + .expect_err("new operation should wait for cancelled operation cleanup"); + assert!(error.contains("already active")); + + super::clear_registered_operation(item_id, run_id) + .expect("cancelled operation cleanup should clear active state"); + super::begin_operation(item_id, run_id) + .expect("new operation after cancellation should start"); + assert!(!super::is_operation_cancelled(item_id, run_id)); + assert!(super::has_registered_operation(item_id)); + + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + } + + #[test] + #[serial] + fn cancel_before_begin_causes_next_begin_to_fail_once() { + let item_id = "subtitle-ocr-sticky-cancel-before-begin-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + assert_eq!( + super::mark_cancelled(item_id, run_id).expect("pre-begin cancel should be recorded"), + None + ); + assert!(super::is_operation_cancelled(item_id, run_id)); + + let error = super::begin_operation(item_id, run_id) + .expect_err("next begin should consume sticky cancellation"); + assert!(error.contains("cancelled")); + assert!(!super::is_operation_cancelled(item_id, run_id)); + assert!(!super::has_registered_operation(item_id)); + + super::begin_operation(item_id, run_id).expect("later begin should start normally"); + assert!(super::has_registered_operation(item_id)); + + let _ = 
super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + } + + #[test] + #[serial] + fn late_cancel_for_finished_run_does_not_cancel_next_run() { + let item_id = "subtitle-ocr-late-cancel-next-run-test"; + let run_a = "run-a"; + let run_b = "run-b"; + let _ = super::clear_registered_operation(item_id, run_a); + let _ = super::clear_registered_operation(item_id, run_b); + let _ = super::clear_cancelled(item_id, run_a); + let _ = super::clear_cancelled(item_id, run_b); + + super::begin_operation(item_id, run_a).expect("run A should start"); + super::clear_registered_operation(item_id, run_a).expect("run A should finish"); + assert_eq!( + super::mark_cancelled(item_id, run_a).expect("late cancel should be recorded"), + None + ); + + super::begin_operation(item_id, run_b) + .expect("run B should ignore stale cancellation for run A"); + + assert!(!super::is_operation_cancelled(item_id, run_a)); + assert!(!super::is_operation_cancelled(item_id, run_b)); + + let _ = super::clear_registered_operation(item_id, run_b); + let _ = super::clear_cancelled(item_id, run_a); + let _ = super::clear_cancelled(item_id, run_b); + } + + #[test] + #[serial] + fn active_cancel_stays_active_until_owner_cleanup_then_allows_reuse() { + let item_id = "subtitle-ocr-active-cancel-owner-cleanup-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + super::begin_operation(item_id, run_id).expect("operation should start"); + super::mark_cancelled(item_id, run_id).expect("active operation should cancel"); + assert!(super::is_operation_cancelled(item_id, run_id)); + assert!(super::has_registered_operation(item_id)); + + let error = super::begin_operation(item_id, "run-2") + .expect_err("cancelled active operation should remain active until owner cleanup"); + assert!(error.contains("already active")); + + super::clear_registered_operation(item_id, run_id) + 
.expect("owner cleanup should clear operation"); + assert!(!super::is_operation_cancelled(item_id, run_id)); + + super::begin_operation(item_id, run_id) + .expect("operation should be reusable after owner cleanup"); + assert!(super::has_registered_operation(item_id)); + + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + } + + #[test] + #[serial] + fn mark_cancelled_ignores_active_operation_with_different_run() { + let item_id = "subtitle-ocr-stale-cancel-active-run-test"; + let active_run_id = "active-run"; + let stale_run_id = "stale-run"; + let _ = super::clear_registered_operation(item_id, active_run_id); + let _ = super::clear_cancelled(item_id, active_run_id); + let _ = super::clear_cancelled(item_id, stale_run_id); + + super::begin_operation(item_id, active_run_id).expect("active run should start"); + assert_eq!( + super::mark_cancelled(item_id, stale_run_id).expect("stale cancel should not fail"), + None + ); + + assert!(!super::is_operation_cancelled(item_id, active_run_id)); + assert!(!super::is_operation_cancelled(item_id, stale_run_id)); + assert!(super::has_registered_operation(item_id)); + + let _ = super::clear_registered_operation(item_id, active_run_id); + let _ = super::clear_cancelled(item_id, active_run_id); + let _ = super::clear_cancelled(item_id, stale_run_id); + } + + #[test] + #[serial] + fn register_output_paths_reports_prior_cancellation() { + let item_id = "subtitle-ocr-cancel-before-output-registration-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + super::begin_operation(item_id, run_id).expect("operation should start"); + super::mark_cancelled(item_id, run_id).expect("operation should cancel"); + + let is_cancelled = super::register_output_paths( + item_id, + run_id, + vec!["/tmp/subtitle-ocr-cancelled-output.sup".to_string()], + ) + .expect("output registration should 
work"); + + assert!(is_cancelled); + + let _ = super::take_output_paths(item_id, run_id); + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + } + + #[test] + #[serial] + fn register_operation_pid_reports_prior_cancellation() { + let item_id = "subtitle-ocr-cancel-before-pid-registration-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + super::begin_operation(item_id, run_id).expect("operation should start"); + super::mark_cancelled(item_id, run_id).expect("operation should cancel"); + + let is_cancelled = super::register_operation_pid(item_id, run_id, 42) + .expect("pid registration should work"); + + assert!(is_cancelled); + assert_eq!( + super::take_operation_pid(item_id, run_id).expect("pid should be readable by owner"), + Some(42) + ); + + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + } + + #[test] + #[serial] + fn begin_operation_allows_exactly_one_concurrent_start_for_same_item() { + let item_id = "subtitle-ocr-concurrent-begin-test"; + let run_id = "run-1"; + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, run_id); + + let thread_count = 64; + let barrier = Arc::new(Barrier::new(thread_count)); + let successes = Arc::new(AtomicUsize::new(0)); + let handles = (0..thread_count) + .map(|_| { + let barrier = Arc::clone(&barrier); + let successes = Arc::clone(&successes); + thread::spawn(move || { + barrier.wait(); + if super::begin_operation(item_id, run_id).is_ok() { + successes.fetch_add(1, Ordering::SeqCst); + } + }) + }) + .collect::>(); + + for handle in handles { + handle.join().expect("worker thread should not panic"); + } + + assert_eq!(successes.load(Ordering::SeqCst), 1); + + let _ = super::clear_registered_operation(item_id, run_id); + let _ = super::clear_cancelled(item_id, 
run_id); + } +} diff --git a/src-tauri/src/tools/subtitle_ocr/text.rs b/src-tauri/src/tools/subtitle_ocr/text.rs new file mode 100644 index 00000000..95bdf0d9 --- /dev/null +++ b/src-tauri/src/tools/subtitle_ocr/text.rs @@ -0,0 +1,251 @@ +#![allow(dead_code)] + +use crate::tools::subtitle_ocr::SubtitleOcrBox; + +#[derive(Debug)] +struct TextBox { + text: String, + x: f64, + y: f64, + height: f64, +} + +#[derive(Debug)] +struct TextLine { + y: f64, + center_y: f64, + max_height: f64, + boxes: Vec, +} + +pub(crate) fn reconstruct_text_from_boxes(boxes: &[SubtitleOcrBox]) -> String { + let mut text_boxes = boxes + .iter() + .filter_map(|ocr_box| { + let text = collapse_whitespace(&ocr_box.text); + (!text.is_empty()).then_some(TextBox { + text, + x: ocr_box.x, + y: ocr_box.y, + height: ocr_box.height, + }) + }) + .collect::>(); + + text_boxes.sort_by(|a, b| a.y.total_cmp(&b.y).then_with(|| a.x.total_cmp(&b.x))); + + let mut lines: Vec = Vec::new(); + for text_box in text_boxes { + let text_box_center_y = box_center_y(text_box.y, text_box.height); + if let Some(line) = lines.last_mut().filter(|line| { + is_same_line( + line.center_y, + line.max_height, + text_box_center_y, + text_box.height, + ) + }) { + let box_count = line.boxes.len() as f64; + line.y = ((line.y * box_count) + text_box.y) / (box_count + 1.0); + line.center_y = ((line.center_y * box_count) + text_box_center_y) / (box_count + 1.0); + line.max_height = line.max_height.max(text_box.height); + line.boxes.push(text_box); + } else { + lines.push(TextLine { + y: text_box.y, + center_y: text_box_center_y, + max_height: text_box.height, + boxes: vec![text_box], + }); + } + } + + let text = lines + .iter_mut() + .map(|line| { + line.boxes.sort_by(|a, b| a.x.total_cmp(&b.x)); + line.boxes + .iter() + .map(|text_box| text_box.text.as_str()) + .collect::>() + .join(" ") + }) + .collect::>() + .join("\n"); + + split_dialogue_dash_fallback(&text) +} + +pub(crate) fn split_dialogue_dash_fallback(text: &str) -> String 
{ + if text.contains('\n') || !starts_with_dialogue_dash(text) { + return text.to_string(); + } + + for (index, ch) in text.char_indices().skip(1) { + if !is_dialogue_dash(ch) { + continue; + } + + let previous_is_whitespace = text[..index] + .chars() + .next_back() + .is_some_and(char::is_whitespace); + let next_is_whitespace = text[index + ch.len_utf8()..] + .chars() + .next() + .is_some_and(char::is_whitespace); + + if previous_is_whitespace && next_is_whitespace { + let first = text[..index].trim_end(); + let second = text[index..].trim_start(); + + if !first.trim().is_empty() + && !second.trim().is_empty() + && ends_with_sentence_terminal(first) + { + return format!("{}\n{}", first, second); + } + } + } + + text.to_string() +} + +fn collapse_whitespace(text: &str) -> String { + let mut output = String::with_capacity(text.len()); + let mut last_was_whitespace = false; + + for ch in text.chars() { + if ch.is_whitespace() { + if !last_was_whitespace && !output.is_empty() { + output.push(' '); + } + last_was_whitespace = true; + continue; + } + + last_was_whitespace = false; + output.push(ch); + } + + output.trim().to_string() +} + +fn box_center_y(y: f64, height: f64) -> f64 { + y + (height / 2.0) +} + +fn is_same_line(line_center_y: f64, line_height: f64, box_center_y: f64, box_height: f64) -> bool { + let height = line_height.max(box_height); + let threshold = if height > 0.0 { + (height * 0.45).max(8.0) + } else { + 0.01 + }; + + (line_center_y - box_center_y).abs() <= threshold +} + +fn starts_with_dialogue_dash(text: &str) -> bool { + text.chars().next().is_some_and(is_dialogue_dash) +} + +fn is_dialogue_dash(ch: char) -> bool { + matches!(ch, '-' | '–' | '—') +} + +fn ends_with_sentence_terminal(text: &str) -> bool { + matches!( + text.chars().next_back(), + Some('.' | '!' | '?' | '…' | '。' | '!' 
| '?') + ) +} + +#[cfg(test)] +mod tests { + use super::{reconstruct_text_from_boxes, split_dialogue_dash_fallback}; + use crate::tools::subtitle_ocr::SubtitleOcrBox; + + fn ocr_box(text: &str, x: f64, y: f64) -> SubtitleOcrBox { + SubtitleOcrBox { + text: text.to_string(), + confidence: 0.9, + x, + y, + width: 20.0, + height: 10.0, + } + } + + #[test] + fn reconstruct_text_groups_boxes_by_line() { + let boxes = vec![ + ocr_box("I cannot.", 24.0, 42.0), + ocr_box("Stop.", 24.0, 10.0), + ocr_box(" - ", 10.0, 10.0), + ocr_box("- ", 10.0, 42.0), + ]; + + let text = reconstruct_text_from_boxes(&boxes); + + assert_eq!(text, "- Stop.\n- I cannot."); + } + + #[test] + fn reconstruct_text_keeps_close_subtitle_lines_separate() { + let boxes = vec![ + SubtitleOcrBox { + text: "Of course, I'd expect nothing less".to_string(), + confidence: 0.9, + x: 159.0, + y: 343.0, + width: 401.0, + height: 45.0, + }, + SubtitleOcrBox { + text: "from the master of the Kimeragi Clan.".to_string(), + confidence: 0.9, + x: 135.0, + y: 374.0, + width: 445.0, + height: 49.0, + }, + ]; + + let text = reconstruct_text_from_boxes(&boxes); + + assert_eq!( + text, + "Of course, I'd expect nothing less\nfrom the master of the Kimeragi Clan." + ); + } + + #[test] + fn dash_fallback_splits_obvious_dialogue() { + let text = split_dialogue_dash_fallback("- Stop. - I cannot."); + + assert_eq!(text, "- Stop.\n- I cannot."); + } + + #[test] + fn dash_fallback_splits_en_dash_and_em_dash_dialogue() { + assert_eq!( + split_dialogue_dash_fallback("– Stop. — I cannot."), + "– Stop.\n— I cannot." 
+ ); + } + + #[test] + fn dash_fallback_keeps_leading_dash_sentence_with_internal_dash() { + let text = split_dialogue_dash_fallback("- A well-known phrase - not dialogue."); + + assert_eq!(text, "- A well-known phrase - not dialogue."); + } + + #[test] + fn dash_fallback_keeps_normal_hyphen_text() { + let text = split_dialogue_dash_fallback("This is a well-known phrase - not dialogue."); + + assert_eq!(text, "This is a well-known phrase - not dialogue."); + } +} diff --git a/src/lib/components/AppSidebar.svelte b/src/lib/components/AppSidebar.svelte index 8ffce7d4..eb5afc76 100644 --- a/src/lib/components/AppSidebar.svelte +++ b/src/lib/components/AppSidebar.svelte @@ -9,6 +9,7 @@ PenLine, AudioLines, ScanText, + Captions, } from '@lucide/svelte'; import type { ComponentProps } from 'svelte'; @@ -50,6 +51,11 @@ title: 'Video OCR', icon: ScanText, }, + { + id: 'subtitle-ocr', + title: 'Subtitle OCR', + icon: Captions, + }, { id: 'translate', title: 'AI Translation', diff --git a/src/lib/components/subtitle-ocr/SubtitleOcrBasket.svelte b/src/lib/components/subtitle-ocr/SubtitleOcrBasket.svelte new file mode 100644 index 00000000..2af097f4 --- /dev/null +++ b/src/lib/components/subtitle-ocr/SubtitleOcrBasket.svelte @@ -0,0 +1,120 @@ + + +
+
+
+ + {#if cue} + + + {formatTime(cue.startTimeMs)} - {formatTime(cue.endTimeMs)} + + Confidence {confidencePercent}% + {cue.sourceCueIds.length} source cue{cue.sourceCueIds.length === 1 ? '' : 's'} + + + + + + + Recognized text +