Add no-speech detection

darkautism · darkautism · commit aad73b6609f4 · 2025-12-03T16:13:29.000+08:00
diff --git a/src/lib.rs b/src/lib.rs
@@ -21,7 +21,7 @@ use rknn_rs::prelude::{Rknn, RknnInput, RknnTensorFormat, RknnTensorType};
 use sentencepiece::SentencePieceProcessor;
 
 use config::SenseVoiceConfig;
-use silero_vad::{VadConfig, VadProcessor, CHUNK_SIZE};
+use silero_vad::{VadConfig, VadOutput, VadProcessor, CHUNK_SIZE};
 use wavfrontend::{WavFrontend, WavFrontendConfig};
 
 #[cfg(feature = "stream")]
@@ -510,6 +510,13 @@ impl SenseVoiceSmall {
         })
     }
 
+    /// Sets the VAD silence-notification threshold by forwarding `ms` to the VAD processor's `set_notify_silence_after_ms`.
+    /// `Some(ms)`: one `NoSpeech` result is emitted per waiting period after `ms` milliseconds without speech; `None` disables it.
+    #[cfg(feature = "stream")]
+    pub fn set_vad_silence_notification(&mut self, ms: Option<u32>) {
+        self.silero_vad.set_notify_silence_after_ms(ms); // `self.silero_vad` is the processor driven by the streaming loop
+    }
+
     /// Performs speech recognition on a vector of audio samples.
     pub fn infer_vec(
         &self,
@@ -530,15 +537,44 @@ impl SenseVoiceSmall {
 
         for chunk in padded_content.chunks_exact(chunk_size) {
             let chunk_arr: &[i16; CHUNK_SIZE] = chunk.try_into()?;
-            if let Some(segment) = vad.process_chunk(chunk_arr) {
-                let vt = self.recognition(&segment)?;
-                ret.push(vt);
+            if let Some(output) = vad.process_chunk(chunk_arr) {
+                match output {
+                    VadOutput::Segment(segment) => {
+                        let vt = self.recognition(&segment)?;
+                        ret.push(vt);
+                    },
+                    VadOutput::SilenceNotification => {
+                        // For batch infer, usually we don't need intermediate notifications,
+                        // but if configured in vad_config, we respect it.
+                        ret.push(VoiceText {
+                            language: SenseVoiceLanguage::NoSpeech,
+                            emotion: SenseVoiceEmo::Unknown,
+                            event: SenseVoiceEvent::Unknown,
+                            punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
+                            content: String::new(),
+                        });
+                    }
+                }
             }
         }
 
-        if let Some(segment) = vad.finish() {
-            let vt = self.recognition(&segment)?;
-            ret.push(vt);
+        if let Some(output) = vad.finish() {
+            match output {
+                VadOutput::Segment(segment) => {
+                    let vt = self.recognition(&segment)?;
+                    ret.push(vt);
+                },
+                VadOutput::SilenceNotification => {
+                    // `finish()` currently returns only a `Segment` or `None`; this arm just keeps the match exhaustive.
+                    ret.push(VoiceText {
+                        language: SenseVoiceLanguage::NoSpeech,
+                        emotion: SenseVoiceEmo::Unknown,
+                        event: SenseVoiceEvent::Unknown,
+                        punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
+                        content: String::new(),
+                    });
+                }
+            }
         }
 
         Ok(ret)
@@ -648,18 +684,41 @@ impl SenseVoiceSmall {
             // For now, assuming the stream provides correct chunks or we try to convert.
             // process_chunk expects &[i16; 512].
             if let Ok(chunk_arr) = chunk.as_slice().try_into() {
-                if let Some(segment) = self.silero_vad.process_chunk(chunk_arr) {
-                    yield self.recognition(&segment);
+                if let Some(output) = self.silero_vad.process_chunk(chunk_arr) {
+                    match output {
+                        VadOutput::Segment(segment) => {
+                            yield self.recognition(&segment);
+                        },
+                        VadOutput::SilenceNotification => {
+                            yield Ok(VoiceText {
+                                language: SenseVoiceLanguage::NoSpeech,
+                                emotion: SenseVoiceEmo::Unknown,
+                                event: SenseVoiceEvent::Unknown,
+                                punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
+                                content: String::new(),
+                            });
+                        }
+                    }
                 }
             } else {
                  // Handle mismatch size? For now ignore or log?
-                 // Or maybe we should allow partial chunks if the logic allows, but process_chunk seems strict.
-                 // Ideally we should buffer. But let's stick to simple fix first: expect 512.
-                 // If chunk is not 512, silero_vad might panic if we force it? No, try_into returns error.
             }
         }
-        if let Some(segment) = self.silero_vad.finish() {
-        yield self.recognition(&segment);
+        if let Some(output) = self.silero_vad.finish() {
+            match output {
+                VadOutput::Segment(segment) => {
+                    yield self.recognition(&segment);
+                },
+                VadOutput::SilenceNotification => {
+                     yield Ok(VoiceText {
+                        language: SenseVoiceLanguage::NoSpeech,
+                        emotion: SenseVoiceEmo::Unknown,
+                        event: SenseVoiceEvent::Unknown,
+                        punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
+                        content: String::new(),
+                    });
+                }
+            }
         }
         }
     }
diff --git a/src/silero_vad.rs b/src/silero_vad.rs
@@ -4,11 +4,13 @@ pub const CHUNK_SIZE: usize = 512;
 
 #[derive(Debug, Clone, Copy)]
 pub struct VadConfig {
-    sample_rate: u32,            // 採樣率，例如 16000 Hz
-    speech_threshold: f32,       // 語音概率閾值，例如 0.5
-    silence_duration_ms: u32,    // 靜音持續時間（毫秒），例如 500 ms
-    max_speech_duration_ms: u32, // 最大語音段長（毫秒），例如 10000 ms
-    rollback_duration_ms: u32,   // 剪斷後回退時間（毫秒），例如 200 ms
+    pub sample_rate: u32,            // Sample rate, e.g. 16000 Hz
+    pub speech_threshold: f32,       // Speech probability threshold, e.g. 0.5
+    pub silence_duration_ms: u32,    // Trailing silence (ms) that ends a segment, e.g. 500 ms
+    pub max_speech_duration_ms: u32, // Maximum speech segment length (ms), e.g. 10000 ms
+    pub rollback_duration_ms: u32,   // Audio (ms) kept from before speech onset as leading context, e.g. 200 ms
+    pub min_speech_duration_ms: u32, // Minimum segment length (ms); shorter segments are discarded as noise, e.g. 250 ms
+    pub notify_silence_after_ms: Option<u32>, // Emit a silence notification once the waiting state has lasted at least this long (ms); None disables it
 }
 
 impl Default for VadConfig {
@@ -19,18 +21,36 @@ impl Default for VadConfig {
             silence_duration_ms: 500,     // 500 ms 靜音算結束
             max_speech_duration_ms: 9000, // 9 秒最大語音段
             rollback_duration_ms: 200,    // 回退 200 ms
+            min_speech_duration_ms: 250,  // 最小 250 ms
+            notify_silence_after_ms: None,
         }
     }
 }
 
+#[derive(Debug)]
+enum VadState { // Internal state driving `VadProcessor::process_chunk`.
+    Waiting, // No speech in progress; incoming chunks only feed the rollback history buffer.
+    Recording, // Speech in progress; incoming chunks are appended to the current segment.
+}
+
+/// Result of feeding audio to `VadProcessor`: either a finished speech segment or a silence notification.
+#[derive(Debug)]
+pub enum VadOutput {
+    Segment(Vec<i16>), // Samples of one finalized speech segment.
+    SilenceNotification, // The waiting state reached `notify_silence_after_ms` without speech; sent once per waiting period.
+}
+
 #[derive(Debug)]
 pub struct VadProcessor {
     vad: VoiceActivityDetector,
     config: VadConfig,
-    current_segment: Vec<i16>,      // 當前語音段的樣本
-    pending_samples: VecDeque<i16>, // 未完成的樣本，等待下次處理
-    silence_chunks: u32,            // 連續靜音塊數
-    speech_chunks: u32,             // 當前語音段的塊數
+    state: VadState, // Whether we are waiting for speech or recording a segment
+    current_segment: Vec<i16>, // Samples of the segment currently being recorded
+    history_buffer: VecDeque<i16>, // Keeps the audio context from just before speech starts
+    silence_chunks: u32,           // Consecutive silent chunks (while in the Recording state)
+    speech_chunks: u32,            // Chunks appended to the current segment while Recording
+    waiting_dropped_chunks: u32,   // Chunks dropped while Waiting (counted only while a notification threshold is set)
+    notified_silence: bool,        // Whether the silence notification has already been sent (one-shot)
 }
 
 impl VadProcessor {
@@ -42,14 +62,27 @@ impl VadProcessor {
         Ok(Self {
             vad,
             config,
+            state: VadState::Waiting,
             current_segment: Vec::new(),
-            pending_samples: VecDeque::new(),
+            history_buffer: VecDeque::new(),
             silence_chunks: 0,
             speech_chunks: 0,
+            waiting_dropped_chunks: 0,
+            notified_silence: false,
         })
     }
 
-    pub fn process_chunk(&mut self, chunk: &[i16; CHUNK_SIZE]) -> Option<Vec<i16>> {
+    /// Updates the silence-notification threshold; `None` disables the notification and re-arms the one-shot flag.
+    pub fn set_notify_silence_after_ms(&mut self, ms: Option<u32>) {
+        self.config.notify_silence_after_ms = ms;
+        // When notifications are switched off, clear the one-shot flag so a later re-enable can notify again.
+        if ms.is_none() {
+            self.notified_silence = false;
+        }
+        // With Some(ms), the next non-speech chunk in Waiting notifies if the counted waiting time already reaches ms. NOTE(review): waiting chunks are counted only while a threshold is set and the counter is not reset here — confirm intended.
+    }
+
+    pub fn process_chunk(&mut self, chunk: &[i16; CHUNK_SIZE]) -> Option<VadOutput> { // Feeds one chunk through the Waiting/Recording state machine; returns a finished segment, a silence notification, or None.
         let chunk_duration_ms = (CHUNK_SIZE as f32 / self.config.sample_rate as f32) * 1000.0;
         let probability = chunk
             .iter()
@@ -59,70 +92,131 @@ impl VadProcessor {
             .unwrap()
             .1;
 
-        // 將 chunk 添加到待處理樣本
-        self.pending_samples.extend(chunk.iter().copied());
+        match self.state {
+            VadState::Waiting => {
+                // Append the chunk to the history buffer
+                self.history_buffer.extend(chunk.iter().copied());
+
+                // Cap the buffer at rollback_duration_ms worth of samples, dropping the oldest first. NOTE(review): a window shorter than one chunk also truncates the speech-onset chunk — confirm intended.
+                let rollback_samples = ((self.config.rollback_duration_ms as f32 / 1000.0)
+                    * self.config.sample_rate as f32) as usize;
+                while self.history_buffer.len() > rollback_samples {
+                    self.history_buffer.pop_front();
+                }
 
-        if probability > self.config.speech_threshold {
-            // 語音檢測到，重置靜音計數，增加語音塊數
-            self.silence_chunks = 0;
-            self.speech_chunks += 1;
-            self.current_segment.extend(chunk);
+                if probability > self.config.speech_threshold {
+                    // Speech detected: switch to the Recording state
+                    self.state = VadState::Recording;
+                    // Move the history buffer into the current segment (keeps the context around the speech onset)
+                    self.current_segment.extend(self.history_buffer.iter());
+                    self.history_buffer.clear(); // Empty the buffer
+                    self.silence_chunks = 0;
+                    self.speech_chunks = 0; // NOTE(review): the onset chunk reaches the segment via history_buffer but is not counted here — confirm intended
 
-            // 檢查是否超過最大語音長度
-            let speech_duration_ms = self.speech_chunks as f32 * chunk_duration_ms;
-            if speech_duration_ms >= self.config.max_speech_duration_ms as f32 {
-                return self.finalize_segment();
+                    // Reset the Waiting-state counters
+                    self.waiting_dropped_chunks = 0;
+                    self.notified_silence = false;
+                } else {
+                    // Still waiting: check whether a silence notification is due
+                    if let Some(limit_ms) = self.config.notify_silence_after_ms {
+                        self.waiting_dropped_chunks += 1;
+                        let dropped_duration = self.waiting_dropped_chunks as f32 * chunk_duration_ms;
+                        if dropped_duration >= limit_ms as f32 && !self.notified_silence {
+                            self.notified_silence = true;
+                            return Some(VadOutput::SilenceNotification);
+                        }
+                    }
+                }
+                None
             }
-        } else {
-            // 靜音檢測到，增加靜音計數
-            self.silence_chunks += 1;
-            let silence_duration_ms = self.silence_chunks as f32 * chunk_duration_ms;
-            if !self.current_segment.is_empty()
-                && silence_duration_ms >= self.config.silence_duration_ms as f32
-            {
-                return self.finalize_segment();
+            VadState::Recording => {
+                self.current_segment.extend(chunk);
+                self.speech_chunks += 1;
+
+                if probability > self.config.speech_threshold {
+                    self.silence_chunks = 0;
+                    // Check whether the maximum speech length has been reached
+                    let speech_duration_ms = self.speech_chunks as f32 * chunk_duration_ms;
+                    if speech_duration_ms >= self.config.max_speech_duration_ms as f32 {
+                        // Force a cut
+                        return self.finalize_segment(false);
+                    }
+                } else {
+                    self.silence_chunks += 1;
+                    let silence_duration_ms = self.silence_chunks as f32 * chunk_duration_ms;
+                    if silence_duration_ms >= self.config.silence_duration_ms as f32 {
+                        // Silence has lasted long enough: end the current segment
+                        // and trim the trailing silence
+                        return self.finalize_segment(true);
+                    }
+                }
+                None
             }
-            self.current_segment.extend(chunk); // 靜音部分也先保留，直到確認段結束
         }
-
-        None // 未結束，返回 None
     }
 
-    pub fn finalize_segment(&mut self) -> Option<Vec<i16>> {
+    // Finalizes the current segment and resets to Waiting. trim_tail: whether to trim the trailing silence. Returns None when the result is empty or shorter than min_speech_duration_ms.
+    fn finalize_segment(&mut self, trim_tail: bool) -> Option<VadOutput> {
         if self.current_segment.is_empty() {
+            self.reset();
             return None;
         }
 
-        let chunk_duration_ms = (CHUNK_SIZE as f32 / self.config.sample_rate as f32) * 1000.0;
-        let rollback_chunks =
-            (self.config.rollback_duration_ms as f32 / chunk_duration_ms).ceil() as usize;
-        let rollback_samples = rollback_chunks * CHUNK_SIZE;
+        let mut segment = if trim_tail {
+            // Number of samples to trim: every trailing silent chunk
+            let chunk_len = CHUNK_SIZE;
+            let silence_len = (self.silence_chunks as usize) * chunk_len;
+            let valid_len = self.current_segment.len().saturating_sub(silence_len);
+            if valid_len == 0 {
+                Vec::new() // Nothing but silence?
+            } else {
+                self.current_segment[..valid_len].to_vec()
+            }
+        } else {
+            self.current_segment.clone()
+        };
+
+        // Minimum-length check
+        let duration_ms =
+            (segment.len() as f32 / self.config.sample_rate as f32) * 1000.0;
+        if duration_ms < self.config.min_speech_duration_ms as f32 {
+            // Too short: discard it as noise
+            segment.clear(); // Clear it so that None is returned below
+        }
+
+        self.reset();
 
-        // 計算回退樣本數並分割
-        let segment_len = self.current_segment.len();
-        let rollback_start = segment_len.saturating_sub(rollback_samples);
-        let segment = self.current_segment[..rollback_start].to_vec();
-        let rollback = self.current_segment[rollback_start..].to_vec();
+        if segment.is_empty() {
+            None
+        } else {
+            Some(VadOutput::Segment(segment))
+        }
+    }
 
-        // 重置當前段，將回退部分加入待處理
+    fn reset(&mut self) { // Clears all buffers and counters and puts the processor back into the Waiting state.
         self.current_segment.clear();
-        self.pending_samples.clear();
-        self.pending_samples.extend(rollback);
+        self.history_buffer.clear();
         self.silence_chunks = 0;
         self.speech_chunks = 0;
-
-        Some(segment)
+        self.state = VadState::Waiting;
+        // Reset the Waiting-state counters
+        self.waiting_dropped_chunks = 0;
+        self.notified_silence = false;
     }
 
-    pub fn finish(&mut self) -> Option<Vec<i16>> {
-        // 處理剩餘樣本作為最終段
+    pub fn finish(&mut self) -> Option<VadOutput> {
+        // 如果還在 Recording 狀態，返回剩餘內容
         if !self.current_segment.is_empty() {
+             // 對於最後一段，我們也要做最小長度檢查
+             let duration_ms = (self.current_segment.len() as f32 / self.config.sample_rate as f32) * 1000.0;
+             if duration_ms < self.config.min_speech_duration_ms as f32 {
+                 self.reset();
+                 return None;
+             }
+
             let segment = self.current_segment.clone();
-            self.current_segment.clear();
-            self.pending_samples.clear();
-            self.silence_chunks = 0;
-            self.speech_chunks = 0;
-            Some(segment)
+            self.reset();
+            Some(VadOutput::Segment(segment))
         } else {
             None
         }