Skip to content

Commit aad73b6

Browse files
committed
Add nospeech detect
1 parent fcdbc96 commit aad73b6

File tree

2 files changed

+222
-69
lines changed

2 files changed

+222
-69
lines changed

src/lib.rs

Lines changed: 73 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ use rknn_rs::prelude::{Rknn, RknnInput, RknnTensorFormat, RknnTensorType};
2121
use sentencepiece::SentencePieceProcessor;
2222

2323
use config::SenseVoiceConfig;
24-
use silero_vad::{VadConfig, VadProcessor, CHUNK_SIZE};
24+
use silero_vad::{VadConfig, VadOutput, VadProcessor, CHUNK_SIZE};
2525
use wavfrontend::{WavFrontend, WavFrontendConfig};
2626

2727
#[cfg(feature = "stream")]
@@ -510,6 +510,13 @@ impl SenseVoiceSmall {
510510
})
511511
}
512512

513+
/// Sets or clears the silence-notification threshold on the VAD processor stored in `self.silero_vad`.
514+
/// With `Some(ms)`, the VAD emits a silence notification (surfaced to stream consumers as a `NoSpeech` result with empty content) once per waiting period, after about `ms` milliseconds of non-speech chunks; `None` disables the notification.
515+
#[cfg(feature = "stream")]
516+
pub fn set_vad_silence_notification(&mut self, ms: Option<u32>) {
517+
// Pure pass-through: the value is forwarded unchanged to the VAD processor.
self.silero_vad.set_notify_silence_after_ms(ms);
518+
}
519+
513520
/// Performs speech recognition on a vector of audio samples.
514521
pub fn infer_vec(
515522
&self,
@@ -530,15 +537,44 @@ impl SenseVoiceSmall {
530537

531538
for chunk in padded_content.chunks_exact(chunk_size) {
532539
let chunk_arr: &[i16; CHUNK_SIZE] = chunk.try_into()?;
533-
if let Some(segment) = vad.process_chunk(chunk_arr) {
534-
let vt = self.recognition(&segment)?;
535-
ret.push(vt);
540+
if let Some(output) = vad.process_chunk(chunk_arr) {
541+
match output {
542+
VadOutput::Segment(segment) => {
543+
let vt = self.recognition(&segment)?;
544+
ret.push(vt);
545+
},
546+
VadOutput::SilenceNotification => {
547+
// For batch infer, usually we don't need intermediate notifications,
548+
// but if configured in vad_config, we respect it.
549+
ret.push(VoiceText {
550+
language: SenseVoiceLanguage::NoSpeech,
551+
emotion: SenseVoiceEmo::Unknown,
552+
event: SenseVoiceEvent::Unknown,
553+
punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
554+
content: String::new(),
555+
});
556+
}
557+
}
536558
}
537559
}
538560

539-
if let Some(segment) = vad.finish() {
540-
let vt = self.recognition(&segment)?;
541-
ret.push(vt);
561+
if let Some(output) = vad.finish() {
562+
match output {
563+
VadOutput::Segment(segment) => {
564+
let vt = self.recognition(&segment)?;
565+
ret.push(vt);
566+
},
567+
VadOutput::SilenceNotification => {
568+
// Should not happen in finish usually, but handle it
569+
ret.push(VoiceText {
570+
language: SenseVoiceLanguage::NoSpeech,
571+
emotion: SenseVoiceEmo::Unknown,
572+
event: SenseVoiceEvent::Unknown,
573+
punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
574+
content: String::new(),
575+
});
576+
}
577+
}
542578
}
543579

544580
Ok(ret)
@@ -648,18 +684,41 @@ impl SenseVoiceSmall {
648684
// For now, assuming the stream provides correct chunks or we try to convert.
649685
// process_chunk expects &[i16; 512].
650686
if let Ok(chunk_arr) = chunk.as_slice().try_into() {
651-
if let Some(segment) = self.silero_vad.process_chunk(chunk_arr) {
652-
yield self.recognition(&segment);
687+
if let Some(output) = self.silero_vad.process_chunk(chunk_arr) {
688+
match output {
689+
VadOutput::Segment(segment) => {
690+
yield self.recognition(&segment);
691+
},
692+
VadOutput::SilenceNotification => {
693+
yield Ok(VoiceText {
694+
language: SenseVoiceLanguage::NoSpeech,
695+
emotion: SenseVoiceEmo::Unknown,
696+
event: SenseVoiceEvent::Unknown,
697+
punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
698+
content: String::new(),
699+
});
700+
}
701+
}
653702
}
654703
} else {
655704
// Handle mismatch size? For now ignore or log?
656-
// Or maybe we should allow partial chunks if the logic allows, but process_chunk seems strict.
657-
// Ideally we should buffer. But let's stick to simple fix first: expect 512.
658-
// If chunk is not 512, silero_vad might panic if we force it? No, try_into returns error.
659705
}
660706
}
661-
if let Some(segment) = self.silero_vad.finish() {
662-
yield self.recognition(&segment);
707+
if let Some(output) = self.silero_vad.finish() {
708+
match output {
709+
VadOutput::Segment(segment) => {
710+
yield self.recognition(&segment);
711+
},
712+
VadOutput::SilenceNotification => {
713+
yield Ok(VoiceText {
714+
language: SenseVoiceLanguage::NoSpeech,
715+
emotion: SenseVoiceEmo::Unknown,
716+
event: SenseVoiceEvent::Unknown,
717+
punctuation_normalization: SenseVoicePunctuationNormalization::Woitn,
718+
content: String::new(),
719+
});
720+
}
721+
}
663722
}
664723
}
665724
}

src/silero_vad.rs

Lines changed: 149 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,13 @@ pub const CHUNK_SIZE: usize = 512;
44

55
/// Tuning parameters consumed by `VadProcessor` (all durations are in milliseconds).
#[derive(Debug, Clone, Copy)]
66
pub struct VadConfig {
7-
sample_rate: u32, // Sample rate, e.g. 16000 Hz
8-
speech_threshold: f32, // Speech probability threshold, e.g. 0.5
9-
silence_duration_ms: u32, // Silence duration (ms), e.g. 500 ms
10-
max_speech_duration_ms: u32, // Maximum speech segment length (ms), e.g. 10000 ms
11-
rollback_duration_ms: u32, // Rollback time after a cut (ms), e.g. 200 ms
7+
pub sample_rate: u32, // Sample rate, e.g. 16000 Hz
8+
pub speech_threshold: f32, // Speech probability threshold, e.g. 0.5; a chunk counts as speech when its probability is above this
9+
pub silence_duration_ms: u32, // Silence duration (ms) that ends a segment, e.g. 500 ms
10+
pub max_speech_duration_ms: u32, // Maximum speech segment length (ms) before a forced cut, e.g. 10000 ms
11+
pub rollback_duration_ms: u32, // Rollback time (ms), e.g. 200 ms; bounds the pre-speech history kept as leading context
12+
pub min_speech_duration_ms: u32, // Minimum speech segment length (ms); shorter segments are treated as noise, e.g. 250 ms
13+
pub notify_silence_after_ms: Option<u32>, // If the waiting state lasts longer than this, emit a silence notification; `None` disables it
1214
}
1315

1416
impl Default for VadConfig {
@@ -19,18 +21,36 @@ impl Default for VadConfig {
1921
silence_duration_ms: 500, // 500 ms 靜音算結束
2022
max_speech_duration_ms: 9000, // 9 秒最大語音段
2123
rollback_duration_ms: 200, // 回退 200 ms
24+
min_speech_duration_ms: 250, // 最小 250 ms
25+
notify_silence_after_ms: None,
2226
}
2327
}
2428
}
2529

30+
/// Internal two-state machine driving `VadProcessor::process_chunk`.
#[derive(Debug)]
31+
enum VadState {
32+
Waiting, // No segment in progress: chunks only feed the bounded history buffer
33+
Recording, // A segment is in progress: every chunk is appended to the current segment
34+
}
35+
36+
/// Result produced by the VAD processor: either a finished speech segment or a one-shot silence notification.
37+
#[derive(Debug)]
38+
pub enum VadOutput {
39+
Segment(Vec<i16>), // Samples of a completed speech segment
40+
SilenceNotification, // Emitted while waiting for speech once the configured silence limit is reached
41+
}
42+
2643
/// Stateful voice-activity segmenter that turns fixed-size chunks into `VadOutput` values.
#[derive(Debug)]
2744
pub struct VadProcessor {
2845
vad: VoiceActivityDetector,
2946
config: VadConfig,
30-
current_segment: Vec<i16>, // Samples of the current speech segment
31-
pending_samples: VecDeque<i16>, // Unfinished samples waiting for the next processing pass
32-
silence_chunks: u32, // Number of consecutive silent chunks
33-
speech_chunks: u32, // Number of chunks in the current speech segment
47+
state: VadState, // Current state of the Waiting/Recording state machine
48+
current_segment: Vec<i16>, // Samples collected for the segment being recorded
49+
history_buffer: VecDeque<i16>, // Keeps context from just before speech starts
50+
silence_chunks: u32, // Number of consecutive silent chunks (while in the Recording state)
51+
speech_chunks: u32, // Number of chunks in the current speech segment
52+
waiting_dropped_chunks: u32, // Number of chunks dropped while in the Waiting state
53+
notified_silence: bool, // Whether the silence notification has already been emitted (one-shot)
3454
}
3555

3656
impl VadProcessor {
@@ -42,14 +62,27 @@ impl VadProcessor {
4262
Ok(Self {
4363
vad,
4464
config,
65+
state: VadState::Waiting,
4566
current_segment: Vec::new(),
46-
pending_samples: VecDeque::new(),
67+
history_buffer: VecDeque::new(),
4768
silence_chunks: 0,
4869
speech_chunks: 0,
70+
waiting_dropped_chunks: 0,
71+
notified_silence: false,
4972
})
5073
}
5174

52-
pub fn process_chunk(&mut self, chunk: &[i16; CHUNK_SIZE]) -> Option<Vec<i16>> {
75+
/// Updates the silence-notification setting; `None` disables the notification.
76+
pub fn set_notify_silence_after_ms(&mut self, ms: Option<u32>) {
77+
self.config.notify_silence_after_ms = ms;
78+
// If notifications are being turned off, clear the one-shot flag just in case
79+
if ms.is_none() {
80+
self.notified_silence = false;
81+
}
82+
// If notifications are being turned on and the accumulated duration already exceeds the limit, the next process_chunk call will trigger one.
// NOTE(review): waiting_dropped_chunks is only advanced in process_chunk while a limit is configured, and it is not cleared here, so the "accumulated" duration covers only chunks seen while notifications were enabled — confirm this is intended.
83+
}
84+
85+
pub fn process_chunk(&mut self, chunk: &[i16; CHUNK_SIZE]) -> Option<VadOutput> {
5386
let chunk_duration_ms = (CHUNK_SIZE as f32 / self.config.sample_rate as f32) * 1000.0;
5487
let probability = chunk
5588
.iter()
@@ -59,70 +92,131 @@ impl VadProcessor {
5992
.unwrap()
6093
.1;
6194

62-
// 將 chunk 添加到待處理樣本
63-
self.pending_samples.extend(chunk.iter().copied());
95+
match self.state {
96+
VadState::Waiting => {
97+
// 將塊加入歷史緩衝區
98+
self.history_buffer.extend(chunk.iter().copied());
99+
100+
// 維護緩衝區大小 (rollback_duration)
101+
let rollback_samples = ((self.config.rollback_duration_ms as f32 / 1000.0)
102+
* self.config.sample_rate as f32) as usize;
103+
while self.history_buffer.len() > rollback_samples {
104+
self.history_buffer.pop_front();
105+
}
64106

65-
if probability > self.config.speech_threshold {
66-
// 語音檢測到,重置靜音計數,增加語音塊數
67-
self.silence_chunks = 0;
68-
self.speech_chunks += 1;
69-
self.current_segment.extend(chunk);
107+
if probability > self.config.speech_threshold {
108+
// 檢測到語音,切換到 Recording 狀態
109+
self.state = VadState::Recording;
110+
// 將歷史緩衝區的內容移動到當前段(保留語音開頭的上下文)
111+
self.current_segment.extend(self.history_buffer.iter());
112+
self.history_buffer.clear(); // 清空緩衝區
113+
self.silence_chunks = 0;
114+
self.speech_chunks = 0;
70115

71-
// 檢查是否超過最大語音長度
72-
let speech_duration_ms = self.speech_chunks as f32 * chunk_duration_ms;
73-
if speech_duration_ms >= self.config.max_speech_duration_ms as f32 {
74-
return self.finalize_segment();
116+
// 重置 Waiting 相關計數
117+
self.waiting_dropped_chunks = 0;
118+
self.notified_silence = false;
119+
} else {
120+
// 仍在等待,檢查是否需要發出靜音通知
121+
if let Some(limit_ms) = self.config.notify_silence_after_ms {
122+
self.waiting_dropped_chunks += 1;
123+
let dropped_duration = self.waiting_dropped_chunks as f32 * chunk_duration_ms;
124+
if dropped_duration >= limit_ms as f32 && !self.notified_silence {
125+
self.notified_silence = true;
126+
return Some(VadOutput::SilenceNotification);
127+
}
128+
}
129+
}
130+
None
75131
}
76-
} else {
77-
// 靜音檢測到,增加靜音計數
78-
self.silence_chunks += 1;
79-
let silence_duration_ms = self.silence_chunks as f32 * chunk_duration_ms;
80-
if !self.current_segment.is_empty()
81-
&& silence_duration_ms >= self.config.silence_duration_ms as f32
82-
{
83-
return self.finalize_segment();
132+
VadState::Recording => {
133+
self.current_segment.extend(chunk);
134+
self.speech_chunks += 1;
135+
136+
if probability > self.config.speech_threshold {
137+
self.silence_chunks = 0;
138+
// 檢查是否超過最大語音長度
139+
let speech_duration_ms = self.speech_chunks as f32 * chunk_duration_ms;
140+
if speech_duration_ms >= self.config.max_speech_duration_ms as f32 {
141+
// 強制切斷
142+
return self.finalize_segment(false);
143+
}
144+
} else {
145+
self.silence_chunks += 1;
146+
let silence_duration_ms = self.silence_chunks as f32 * chunk_duration_ms;
147+
if silence_duration_ms >= self.config.silence_duration_ms as f32 {
148+
// 靜音時間過長,結束當前段
149+
// 並修剪掉尾部的靜音
150+
return self.finalize_segment(true);
151+
}
152+
}
153+
None
84154
}
85-
self.current_segment.extend(chunk); // 靜音部分也先保留,直到確認段結束
86155
}
87-
88-
None // 未結束,返回 None
89156
}
90157

91-
pub fn finalize_segment(&mut self) -> Option<Vec<i16>> {
158+
// Closes the segment being recorded and puts the processor back into the Waiting state.
// trim_tail: whether to trim the trailing silence (the last `silence_chunks` chunks)
// Returns Some(VadOutput::Segment) when the remaining audio is non-empty and at least
// min_speech_duration_ms long; otherwise returns None. State is reset in every case.
159+
fn finalize_segment(&mut self, trim_tail: bool) -> Option<VadOutput> {
92160
if self.current_segment.is_empty() {
161+
self.reset();
93162
return None;
94163
}
95164

96-
let chunk_duration_ms = (CHUNK_SIZE as f32 / self.config.sample_rate as f32) * 1000.0;
97-
let rollback_chunks =
98-
(self.config.rollback_duration_ms as f32 / chunk_duration_ms).ceil() as usize;
99-
let rollback_samples = rollback_chunks * CHUNK_SIZE;
165+
let mut segment = if trim_tail {
166+
// Compute how many samples have to be trimmed
167+
let chunk_len = CHUNK_SIZE;
168+
let silence_len = (self.silence_chunks as usize) * chunk_len;
169+
let valid_len = self.current_segment.len().saturating_sub(silence_len);
170+
if valid_len == 0 {
171+
Vec::new() // Nothing but silence?
172+
} else {
173+
self.current_segment[..valid_len].to_vec()
174+
}
175+
} else {
176+
self.current_segment.clone()
177+
};
178+
179+
// Minimum length check
180+
let duration_ms =
181+
(segment.len() as f32 / self.config.sample_rate as f32) * 1000.0;
182+
if duration_ms < self.config.min_speech_duration_ms as f32 {
183+
// Speech is too short; discard it as noise
184+
segment.clear(); // Clear so that None is returned below
185+
}
186+
187+
self.reset();
100188

101-
// Compute the number of rollback samples and split
102-
let segment_len = self.current_segment.len();
103-
let rollback_start = segment_len.saturating_sub(rollback_samples);
104-
let segment = self.current_segment[..rollback_start].to_vec();
105-
let rollback = self.current_segment[rollback_start..].to_vec();
189+
if segment.is_empty() {
190+
None
191+
} else {
192+
Some(VadOutput::Segment(segment))
193+
}
194+
}
106195

107-
// Reset the current segment and queue the rolled-back part for later processing
196+
// Clears all per-segment buffers and counters and returns the processor to the Waiting state.
fn reset(&mut self) {
108197
self.current_segment.clear();
109-
self.pending_samples.clear();
110-
self.pending_samples.extend(rollback);
198+
self.history_buffer.clear();
111199
self.silence_chunks = 0;
112200
self.speech_chunks = 0;
113-
114-
Some(segment)
201+
self.state = VadState::Waiting;
202+
// Reset the Waiting-state bookkeeping (dropped-chunk counter and one-shot flag)
203+
self.waiting_dropped_chunks = 0;
204+
self.notified_silence = false;
115205
}
116206

117-
pub fn finish(&mut self) -> Option<Vec<i16>> {
118-
// 處理剩餘樣本作為最終段
207+
pub fn finish(&mut self) -> Option<VadOutput> {
208+
// 如果還在 Recording 狀態,返回剩餘內容
119209
if !self.current_segment.is_empty() {
210+
// 對於最後一段,我們也要做最小長度檢查
211+
let duration_ms = (self.current_segment.len() as f32 / self.config.sample_rate as f32) * 1000.0;
212+
if duration_ms < self.config.min_speech_duration_ms as f32 {
213+
self.reset();
214+
return None;
215+
}
216+
120217
let segment = self.current_segment.clone();
121-
self.current_segment.clear();
122-
self.pending_samples.clear();
123-
self.silence_chunks = 0;
124-
self.speech_chunks = 0;
125-
Some(segment)
218+
self.reset();
219+
Some(VadOutput::Segment(segment))
126220
} else {
127221
None
128222
}

0 commit comments

Comments
 (0)