diff --git a/src/practicelens/api/contracts.py b/src/practicelens/api/contracts.py index 6fdf622..3d92835 100644 --- a/src/practicelens/api/contracts.py +++ b/src/practicelens/api/contracts.py @@ -102,6 +102,12 @@ class InputSuitabilitySummaryPayload(TypedDict): duration_ratio: float duration_diagnostic: str duration_diagnostic_message: str | None + reference_activity_start_s: float | None + take_activity_start_s: float | None + start_offset_s: float | None + leading_noise_duration_s: float + start_diagnostic: str + start_diagnostic_message: str | None alignment_coverage: float voiced_frame_coverage: float reference_voiced_frame_coverage: float diff --git a/src/practicelens/diagnostics/input_suitability.py b/src/practicelens/diagnostics/input_suitability.py index 150d482..e9ab973 100644 --- a/src/practicelens/diagnostics/input_suitability.py +++ b/src/practicelens/diagnostics/input_suitability.py @@ -14,11 +14,22 @@ _VOICED_WARNING_MIN = 0.35 _VOICED_LOW_MIN = 0.15 _ONSET_PRESENT_MIN = 2 +_START_OFFSET_WARNING_S = 0.35 +_LEADING_NOISE_WARNING_S = 0.25 +_ENERGY_ACTIVITY_RELATIVE_THRESHOLD = 0.20 _SCORE_DIGITS = 6 _DURATION_WARNING_MESSAGE = ( "Take duration differs substantially from the reference. Possible causes include extra silence, " "a restart, a missing section, or unrelated material." ) +_START_DELAY_MESSAGE = ( + "The take start may be delayed relative to the reference. This may indicate a weak or missing first note, " + "late playing, or leading silence before the musical activity." +) +_LEADING_NOISE_MESSAGE = ( + "The take may contain leading noise before the first clear musical activity. Possible causes include handling noise, " + "breath noise, pickup noise, or room noise before the performance starts." +) def summarize_input_suitability( @@ -33,6 +44,12 @@ def summarize_input_suitability( duration_ratio = _duration_ratio(reference_duration_s, take_duration_s) duration_diagnostic = _duration_diagnostic(reference_duration_s, take_duration_s, duration_ratio) duration_diagnostic_message = _duration_diagnostic_message(duration_diagnostic) + reference_activity_start_s = _activity_start_s(reference) + take_activity_start_s = _activity_start_s(take) + start_offset_s = _start_offset_s(reference_activity_start_s, take_activity_start_s) + leading_noise_duration_s = _leading_noise_duration_s(take, take_activity_start_s) + start_diagnostic = _start_diagnostic(start_offset_s, leading_noise_duration_s) + start_diagnostic_message = _start_diagnostic_message(start_diagnostic) alignment_coverage = _round_ratio(alignment.coverage_ratio) reference_voiced_coverage = _round_ratio(_voiced_ratio(reference)) take_voiced_coverage = _round_ratio(_voiced_ratio(take)) @@ -56,6 +73,14 @@ def summarize_input_suitability( if duration_ratio < _DURATION_RATIO_LOW_MIN or duration_ratio > _DURATION_RATIO_LOW_MAX: low_confidence = True + if start_diagnostic == "start_region_unavailable": + reasons.append("Start-region activity evidence is unavailable.") + elif start_diagnostic == "start_region_ok": + reasons.append("Start-region activity appears aligned enough for review.") + else: + reasons.append(start_diagnostic_message or "Start-region activity may not align cleanly.") + risk_points += 1 + if alignment_coverage >= _ALIGNMENT_WARNING_MIN: reasons.append("Alignment coverage is broad.") else: @@ -89,6 +114,12 @@ def summarize_input_suitability( duration_ratio=duration_ratio, duration_diagnostic=duration_diagnostic, duration_diagnostic_message=duration_diagnostic_message, + reference_activity_start_s=_seconds_or_none(reference_activity_start_s), + take_activity_start_s=_seconds_or_none(take_activity_start_s), + start_offset_s=_seconds_or_none(start_offset_s), + leading_noise_duration_s=Seconds(_round_ratio(leading_noise_duration_s)), + start_diagnostic=start_diagnostic, + start_diagnostic_message=start_diagnostic_message, alignment_coverage=alignment_coverage, voiced_frame_coverage=voiced_frame_coverage, reference_voiced_frame_coverage=reference_voiced_coverage, @@ -130,6 +161,111 @@ def _duration_diagnostic_message(duration_diagnostic: str) -> str | None: return None +def _activity_start_s(bundle: FeatureBundle) -> float | None: + musical_starts = [_first_voiced_time_s(bundle), _first_onset_time_s(bundle)] + available_musical_starts = [start for start in musical_starts if start is not None] + if available_musical_starts: + return min(available_musical_starts) + return _first_energy_activity_time_s(bundle) + + +def _first_voiced_time_s(bundle: FeatureBundle) -> float | None: + for time_s, voiced in zip(bundle.time_axis_s, bundle.voiced_mask, strict=False): + if voiced: + return float(time_s) + return None + + +def _first_onset_time_s(bundle: FeatureBundle) -> float | None: + if not bundle.onset_times_s: + return None + return float(bundle.onset_times_s[0]) + + +def _first_energy_activity_time_s(bundle: FeatureBundle) -> float | None: + if not bundle.time_axis_s or not bundle.energy_curve: + return None + peak_energy = max(bundle.energy_curve) + if peak_energy <= 0.0: + return None + threshold = peak_energy * _ENERGY_ACTIVITY_RELATIVE_THRESHOLD + for time_s, energy in zip(bundle.time_axis_s, bundle.energy_curve, strict=False): + if energy >= threshold: + return float(time_s) + return None + + +def _start_offset_s(reference_activity_start_s: float | None, take_activity_start_s: float | None) -> float | None: + if reference_activity_start_s is None or take_activity_start_s is None: + return None + return _round_ratio(take_activity_start_s - reference_activity_start_s) + + +def _leading_noise_duration_s(bundle: FeatureBundle, take_activity_start_s: float | None) -> float: + if take_activity_start_s is None or take_activity_start_s <= 0.0: + return 0.0 + if not bundle.time_axis_s or not bundle.energy_curve: + return 0.0 + + activity_index = _first_index_at_or_after(bundle.time_axis_s, take_activity_start_s) + if activity_index <= 0: + return 0.0 + + pre_activity_energy = bundle.energy_curve[:activity_index] + post_activity_energy = bundle.energy_curve[activity_index:] + if not pre_activity_energy or not post_activity_energy: + return 0.0 + + max_post_activity_energy = max(post_activity_energy) + if max_post_activity_energy <= 0.0: + return 0.0 + + noise_threshold = max_post_activity_energy * _ENERGY_ACTIVITY_RELATIVE_THRESHOLD + noisy_times = [ + time_s + for time_s, energy in zip( + bundle.time_axis_s[:activity_index], + pre_activity_energy, + strict=False, + ) + if energy >= noise_threshold + ] + if not noisy_times: + return 0.0 + return _round_ratio(max(0.0, take_activity_start_s - noisy_times[0])) + + +def _first_index_at_or_after(time_axis_s: tuple[float, ...], start_s: float) -> int: + for index, time_s in enumerate(time_axis_s): + if time_s >= start_s: + return index + return len(time_axis_s) + + +def _start_diagnostic(start_offset_s: float | None, leading_noise_duration_s: float) -> str: + if start_offset_s is None: + return "start_region_unavailable" + if leading_noise_duration_s >= _LEADING_NOISE_WARNING_S: + return "take_leading_noise_before_activity" + if start_offset_s >= _START_OFFSET_WARNING_S: + return "take_activity_starts_late" + return "start_region_ok" + + +def _start_diagnostic_message(start_diagnostic: str) -> str | None: + if start_diagnostic == "take_activity_starts_late": + return _START_DELAY_MESSAGE + if start_diagnostic == "take_leading_noise_before_activity": + return _LEADING_NOISE_MESSAGE + return None + + +def _seconds_or_none(value: float | None) -> Seconds | None: + if value is None: + return None + return Seconds(_round_ratio(value)) + + def _voiced_ratio(bundle: FeatureBundle) -> float: if not bundle.voiced_mask: return 0.0 diff --git a/src/practicelens/domain/models.py b/src/practicelens/domain/models.py index 787b9d1..ccf1502 100644 --- a/src/practicelens/domain/models.py +++ b/src/practicelens/domain/models.py @@ -146,6 +146,12 @@ class InputSuitabilitySummary: duration_ratio: float = 0.0 duration_diagnostic: str = "duration_ratio_unavailable" duration_diagnostic_message: str | None = None + reference_activity_start_s: Seconds | None = None + take_activity_start_s: Seconds | None = None + start_offset_s: Seconds | None = None + leading_noise_duration_s: Seconds = Seconds(0.0) + start_diagnostic: str = "start_region_unavailable" + start_diagnostic_message: str | None = None alignment_coverage: float = 0.0 voiced_frame_coverage: float = 0.0 reference_voiced_frame_coverage: float = 0.0 diff --git a/src/practicelens/reporting/input_suitability_payload.py b/src/practicelens/reporting/input_suitability_payload.py index 3586573..a681b1d 100644 --- a/src/practicelens/reporting/input_suitability_payload.py +++ b/src/practicelens/reporting/input_suitability_payload.py @@ -12,6 +12,12 @@ def input_suitability_to_payload(summary: InputSuitabilitySummary) -> dict[str, "duration_ratio": summary.duration_ratio, "duration_diagnostic": summary.duration_diagnostic, "duration_diagnostic_message": summary.duration_diagnostic_message, + "reference_activity_start_s": summary.reference_activity_start_s, + "take_activity_start_s": summary.take_activity_start_s, + "start_offset_s": summary.start_offset_s, + "leading_noise_duration_s": summary.leading_noise_duration_s, + "start_diagnostic": summary.start_diagnostic, + "start_diagnostic_message": summary.start_diagnostic_message, "alignment_coverage": summary.alignment_coverage, "voiced_frame_coverage": summary.voiced_frame_coverage, "reference_voiced_frame_coverage": summary.reference_voiced_frame_coverage, diff --git a/tests/unit/test_input_suitability.py b/tests/unit/test_input_suitability.py index cac231d..a3677d0 100644 --- a/tests/unit/test_input_suitability.py +++ b/tests/unit/test_input_suitability.py @@ -5,16 +5,8 @@ def test_input_suitability_summary_reports_ok_when_evidence_is_strong() -> None: summary = summarize_input_suitability( - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.0), - voiced_mask=(True, True, True, True), - onset_times_s=(0.5, 1.5), - ), - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.0), - voiced_mask=(True, True, True, True), - onset_times_s=(0.5, 1.5), - ), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.0), voiced_mask=(True, True, True, True), onset_times_s=(0.5, 1.5)), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.0), voiced_mask=(True, True, True, True), onset_times_s=(0.5, 1.5)), AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), ) @@ -24,6 +16,8 @@ def test_input_suitability_summary_reports_ok_when_evidence_is_strong() -> None: assert summary.duration_ratio == 1.0 assert summary.duration_diagnostic == "duration_ratio_ok" assert summary.duration_diagnostic_message is None + assert summary.start_diagnostic == "start_region_ok" + assert summary.start_diagnostic_message is None assert summary.alignment_coverage == 0.9 assert summary.voiced_frame_coverage == 1.0 assert summary.onset_evidence == "present" @@ -31,16 +25,8 @@ def test_input_suitability_summary_reports_ok_when_evidence_is_strong() -> None: def test_input_suitability_summary_reports_warning_when_duration_differs() -> None: summary = summarize_input_suitability( - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.0), - voiced_mask=(True, True, True, True), - onset_times_s=(0.5, 1.5), - ), - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0), - voiced_mask=(True, True, True), - onset_times_s=(0.5, 1.5), - ), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.0), voiced_mask=(True, True, True, True), onset_times_s=(0.5, 1.5)), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0), voiced_mask=(True, True, True), onset_times_s=(0.5, 1.5)), AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), ) @@ -57,16 +43,8 @@ def test_input_suitability_summary_reports_warning_when_duration_differs() -> No def test_input_suitability_summary_reports_low_confidence_when_evidence_is_thin() -> None: summary = summarize_input_suitability( - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.0), - voiced_mask=(True, False, False, False), - onset_times_s=(), - ), - _feature_bundle( - time_axis_s=(0.0, 0.5), - voiced_mask=(False, False), - onset_times_s=(), - ), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.0), voiced_mask=(True, False, False, False), onset_times_s=()), + _feature_bundle(time_axis_s=(0.0, 0.5), voiced_mask=(False, False), onset_times_s=()), AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.4), ) @@ -79,16 +57,8 @@ def test_input_suitability_summary_reports_low_confidence_when_evidence_is_thin( def test_input_suitability_duration_diagnostic_reports_much_longer_take() -> None: summary = summarize_input_suitability( - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0), - voiced_mask=(True, True, True), - onset_times_s=(0.5, 1.5), - ), - _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.0, 4.0), - voiced_mask=(True, True, True, True, True), - onset_times_s=(0.5, 1.5), - ), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0), voiced_mask=(True, True, True), onset_times_s=(0.5, 1.5)), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.0, 4.0), voiced_mask=(True, True, True, True, True), onset_times_s=(0.5, 1.5)), AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), ) @@ -100,24 +70,83 @@ def test_input_suitability_duration_diagnostic_reports_much_longer_take() -> Non def test_input_suitability_duration_diagnostic_reports_acceptable_duration() -> None: + summary = summarize_input_suitability( + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.0), voiced_mask=(True, True, True, True), onset_times_s=(0.5, 1.5)), + _feature_bundle(time_axis_s=(0.0, 1.0, 2.0, 3.2), voiced_mask=(True, True, True, True), onset_times_s=(0.5, 1.5)), + AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), + ) + + assert summary.status == "ok" + assert summary.duration_ratio == 1.066667 + assert summary.duration_diagnostic == "duration_ratio_ok" + assert summary.duration_diagnostic_message is None + + +def test_input_suitability_start_diagnostic_reports_delayed_take_start() -> None: summary = summarize_input_suitability( _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.0), - voiced_mask=(True, True, True, True), - onset_times_s=(0.5, 1.5), + time_axis_s=(0.0, 0.25, 0.5, 0.75, 1.0), + voiced_mask=(True, True, True, True, True), + onset_times_s=(0.1, 0.6), + energy_curve=(1.0, 1.0, 1.0, 1.0, 1.0), ), _feature_bundle( - time_axis_s=(0.0, 1.0, 2.0, 3.2), - voiced_mask=(True, True, True, True), - onset_times_s=(0.5, 1.5), + time_axis_s=(0.0, 0.25, 0.5, 0.75, 1.0), + voiced_mask=(False, False, True, True, True), + onset_times_s=(0.55, 0.8), + energy_curve=(0.0, 0.0, 1.0, 1.0, 1.0), ), AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), ) + assert summary.status == "warning" + assert summary.reference_activity_start_s == 0.0 + assert summary.take_activity_start_s == 0.5 + assert summary.start_offset_s == 0.5 + assert summary.start_diagnostic == "take_activity_starts_late" + assert summary.start_diagnostic_message is not None + assert "may indicate" in summary.start_diagnostic_message + assert summary.start_diagnostic_message in summary.reasons + + +def test_input_suitability_start_diagnostic_reports_noisy_leading_start() -> None: + summary = summarize_input_suitability( + _feature_bundle( + time_axis_s=(0.0, 0.25, 0.5, 0.75, 1.0), + voiced_mask=(True, True, True, True, True), + onset_times_s=(0.1, 0.6), + energy_curve=(1.0, 1.0, 1.0, 1.0, 1.0), + ), + _feature_bundle( + time_axis_s=(0.0, 0.25, 0.5, 0.75, 1.0), + voiced_mask=(False, False, True, True, True), + onset_times_s=(0.55, 0.8), + energy_curve=(0.3, 0.3, 1.0, 1.0, 1.0), + ), + AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), + ) + + assert summary.status == "warning" + assert summary.leading_noise_duration_s == 0.5 + assert summary.start_diagnostic == "take_leading_noise_before_activity" + assert summary.start_diagnostic_message is not None + assert "may contain leading noise" in summary.start_diagnostic_message + assert "Possible causes" in summary.start_diagnostic_message + assert summary.start_diagnostic_message in summary.reasons + + +def test_input_suitability_start_diagnostic_reports_normal_start() -> None: + summary = summarize_input_suitability( + _feature_bundle(time_axis_s=(0.0, 0.25, 0.5, 0.75, 1.0), voiced_mask=(True, True, True, True, True), onset_times_s=(0.1, 0.6)), + _feature_bundle(time_axis_s=(0.0, 0.25, 0.5, 0.75, 1.0), voiced_mask=(True, True, True, True, True), onset_times_s=(0.1, 0.6)), + AlignmentPath(pairs=(), total_cost=0.0, coverage_ratio=0.9), + ) + assert summary.status == "ok" - assert summary.duration_ratio == 1.066667 - assert summary.duration_diagnostic == "duration_ratio_ok" - assert summary.duration_diagnostic_message is None + assert summary.start_offset_s == 0.0 + assert summary.leading_noise_duration_s == 0.0 + assert summary.start_diagnostic == "start_region_ok" + assert summary.start_diagnostic_message is None def _feature_bundle( @@ -125,11 +154,12 @@ def _feature_bundle( time_axis_s: tuple[float, ...], voiced_mask: tuple[bool, ...], onset_times_s: tuple[float, ...], + energy_curve: tuple[float, ...] | None = None, ) -> FeatureBundle: frame_count = len(time_axis_s) return FeatureBundle( time_axis_s=time_axis_s, - energy_curve=(1.0,) * frame_count, + energy_curve=energy_curve or (1.0,) * frame_count, zero_crossing_rate=(0.1,) * frame_count, pitch_contour_hz=tuple(220.0 if voiced else 0.0 for voiced in voiced_mask), voiced_mask=voiced_mask, diff --git a/tests/unit/test_json_report.py b/tests/unit/test_json_report.py index 4621901..91d0cd5 100644 --- a/tests/unit/test_json_report.py +++ b/tests/unit/test_json_report.py @@ -47,6 +47,12 @@ def _sample_report() -> AnalysisReport: duration_ratio=1.0, duration_diagnostic="duration_ratio_ok", duration_diagnostic_message=None, + reference_activity_start_s=0.0, + take_activity_start_s=0.0, + start_offset_s=0.0, + leading_noise_duration_s=0.0, + start_diagnostic="start_region_ok", + start_diagnostic_message=None, alignment_coverage=0.95, voiced_frame_coverage=0.8, reference_voiced_frame_coverage=0.85, @@ -117,6 +123,12 @@ def test_report_json_payload_has_stable_top_level_contract() -> None: "duration_ratio": 1.0, "duration_diagnostic": "duration_ratio_ok", "duration_diagnostic_message": None, + "reference_activity_start_s": 0.0, + "take_activity_start_s": 0.0, + "start_offset_s": 0.0, + "leading_noise_duration_s": 0.0, + "start_diagnostic": "start_region_ok", + "start_diagnostic_message": None, "alignment_coverage": 0.95, "voiced_frame_coverage": 0.8, "reference_voiced_frame_coverage": 0.85, diff --git a/tests/unit/test_reporting.py b/tests/unit/test_reporting.py index 8a28105..46e4e1e 100644 --- a/tests/unit/test_reporting.py +++ b/tests/unit/test_reporting.py @@ -62,6 +62,12 @@ def _sample_report() -> AnalysisReport: duration_ratio=1.0, duration_diagnostic="duration_ratio_ok", duration_diagnostic_message=None, + reference_activity_start_s=0.0, + take_activity_start_s=0.0, + start_offset_s=0.0, + leading_noise_duration_s=0.0, + start_diagnostic="start_region_ok", + start_diagnostic_message=None, alignment_coverage=0.95, voiced_frame_coverage=0.8, reference_voiced_frame_coverage=0.85, @@ -97,6 +103,7 @@ def test_report_to_json_payload_is_serializable() -> None: assert payload["scores"][0]["name"] == "pitch_fidelity" assert payload["input_suitability"]["status"] == "ok" assert payload["input_suitability"]["duration_diagnostic"] == "duration_ratio_ok" + assert payload["input_suitability"]["start_diagnostic"] == "start_region_ok" assert payload["artifacts"][1]["kind"] == "csv_report" @@ -155,6 +162,12 @@ def test_report_to_debug_payload_is_serializable() -> None: "duration_ratio": 1.0, "duration_diagnostic": "duration_ratio_ok", "duration_diagnostic_message": None, + "reference_activity_start_s": 0.0, + "take_activity_start_s": 0.0, + "start_offset_s": 0.0, + "leading_noise_duration_s": 0.0, + "start_diagnostic": "start_region_ok", + "start_diagnostic_message": None, "alignment_coverage": 0.95, "voiced_frame_coverage": 0.8, "reference_voiced_frame_coverage": 0.85,