@@ -4,11 +4,13 @@ pub const CHUNK_SIZE: usize = 512;
44
// Tunable parameters for the VAD processor.
// Diff view: `-` lines are the previous private fields, `+` lines are the new public fields.
55#[ derive( Debug , Clone , Copy ) ]
66pub struct VadConfig {
7- sample_rate : u32 , // Sample rate, e.g. 16000 Hz
8- speech_threshold : f32 , // Speech probability threshold, e.g. 0.5
9- silence_duration_ms : u32 , // Silence duration (ms), e.g. 500 ms
10- max_speech_duration_ms : u32 , // Maximum speech segment length (ms), e.g. 10000 ms
11- rollback_duration_ms : u32 , // Rollback time after a cut (ms), e.g. 200 ms
7+ pub sample_rate : u32 , // Sample rate, e.g. 16000 Hz
8+ pub speech_threshold : f32 , // Speech probability threshold, e.g. 0.5
9+ pub silence_duration_ms : u32 , // Silence duration (ms), e.g. 500 ms
10+ pub max_speech_duration_ms : u32 , // Maximum speech segment length (ms), e.g. 10000 ms
11+ pub rollback_duration_ms : u32 , // Rollback time after a cut (ms), e.g. 200 ms
12+ pub min_speech_duration_ms : u32 , // Minimum speech segment length (ms); anything shorter is treated as noise, e.g. 250 ms
13+ pub notify_silence_after_ms : Option < u32 > , // If the waiting state lasts longer than this, emit a silence notification
1214}
1315
// Default values for VadConfig.
// The start of the initializer (before silence_duration_ms) is elided by the hunk header below.
1416impl Default for VadConfig {
@@ -19,18 +21,36 @@ impl Default for VadConfig {
1921 silence_duration_ms : 500 , // 500 ms of silence ends a segment
2022 max_speech_duration_ms : 9000 , // 9 s maximum speech segment
2123 rollback_duration_ms : 200 , // roll back 200 ms
24+ min_speech_duration_ms : 250 , // minimum 250 ms
// None: process_chunk only checks the waiting time when this is Some, so no notification by default
25+ notify_silence_after_ms : None ,
2226 }
2327 }
2428}
2529
// Internal state of VadProcessor, matched on in process_chunk.
30+ #[ derive( Debug ) ]
31+ enum VadState {
// Waiting: incoming chunks are kept in history_buffer until a chunk exceeds speech_threshold.
32+ Waiting ,
// Recording: every incoming chunk is appended to current_segment until the segment is finalized.
33+ Recording ,
34+ }
35+
36+ /// Enum to distinguish between a speech segment and a silence notification
37+ #[ derive( Debug ) ]
38+ pub enum VadOutput {
// Samples of one finished speech segment (produced by finalize_segment / finish).
39+ Segment ( Vec < i16 > ) ,
// Returned by process_chunk when the time spent in Waiting reaches notify_silence_after_ms.
40+ SilenceNotification ,
41+ }
42+
// Stateful processor that consumes fixed-size chunks and yields VadOutput values.
// Diff view: `-` lines are the previous fields, `+` lines are the new ones.
2643#[ derive( Debug ) ]
2744pub struct VadProcessor {
2845 vad : VoiceActivityDetector ,
2946 config : VadConfig ,
30- current_segment : Vec < i16 > , // Samples of the current speech segment
31- pending_samples : VecDeque < i16 > , // Unfinished samples waiting for the next processing pass
32- silence_chunks : u32 , // Number of consecutive silent chunks
33- speech_chunks : u32 , // Number of chunks in the current speech segment
47+ state : VadState , // Current state: Waiting or Recording
48+ current_segment : Vec < i16 > , // Samples accumulated for the segment being recorded
49+ history_buffer : VecDeque < i16 > , // Keeps the context from just before speech starts
50+ silence_chunks : u32 , // Number of consecutive silent chunks (while Recording)
51+ speech_chunks : u32 , // Number of chunks in the current speech segment
52+ waiting_dropped_chunks : u32 , // Number of chunks dropped while in the Waiting state
53+ notified_silence : bool , // Whether the silence notification has already been emitted (one-shot)
3454}
3555
3656impl VadProcessor {
@@ -42,14 +62,27 @@ impl VadProcessor {
4262 Ok ( Self {
4363 vad,
4464 config,
65+ state : VadState :: Waiting ,
4566 current_segment : Vec :: new ( ) ,
46- pending_samples : VecDeque :: new ( ) ,
67+ history_buffer : VecDeque :: new ( ) ,
4768 silence_chunks : 0 ,
4869 speech_chunks : 0 ,
70+ waiting_dropped_chunks : 0 ,
71+ notified_silence : false ,
4972 } )
5073 }
5174
52- pub fn process_chunk ( & mut self , chunk : & [ i16 ; CHUNK_SIZE ] ) -> Option < Vec < i16 > > {
75+ /// Update the silence-notification setting.
// `ms`: new limit in milliseconds, or None to disable the notification.
76+ pub fn set_notify_silence_after_ms ( & mut self , ms : Option < u32 > ) {
77+ self . config . notify_silence_after_ms = ms;
78+ // If the notification is turned off, reset the one-shot flag just in case
79+ if ms. is_none ( ) {
80+ self . notified_silence = false ;
81+ }
82+ // If it is turned on and the accumulated time already exceeds the limit, the next process_chunk call will trigger it
// NOTE(review): process_chunk only increments waiting_dropped_chunks while
// notify_silence_after_ms is Some, so time spent waiting while the option was None
// is not accumulated; the statement above holds only for Some -> Some changes.
// NOTE(review): notified_silence is not reset on a Some -> Some change, so a
// notification that already fired will not fire again for the new limit.
83+ }
84+
// Feeds one CHUNK_SIZE chunk through the Waiting/Recording state machine.
// Returns Some(VadOutput::SilenceNotification) once the waiting limit is reached,
// the result of finalize_segment when a recording ends, and None otherwise.
// Diff view: `-` lines belong to the previous implementation.
85+ pub fn process_chunk ( & mut self , chunk : & [ i16 ; CHUNK_SIZE ] ) -> Option < VadOutput > {
// Duration of one chunk in milliseconds at the configured sample rate.
5386 let chunk_duration_ms = ( CHUNK_SIZE as f32 / self . config . sample_rate as f32 ) * 1000.0 ;
// The middle of this expression is elided by the hunk header below; the result is compared against speech_threshold.
5487 let probability = chunk
5588 . iter ( )
@@ -59,70 +92,131 @@ impl VadProcessor {
5992 . unwrap ( )
6093 . 1 ;
6194
62- // Append the chunk to the pending samples
63- self . pending_samples . extend ( chunk. iter ( ) . copied ( ) ) ;
95+ match self . state {
96+ VadState :: Waiting => {
97+ // Add the chunk to the history buffer
98+ self . history_buffer . extend ( chunk. iter ( ) . copied ( ) ) ;
99+
100+ // Keep the buffer at most rollback_duration long
// NOTE(review): the cap is applied after the current chunk was appended, so when
// rollback_samples < CHUNK_SIZE (e.g. rollback_duration_ms = 0) part or all of the
// chunk that triggers recording is dropped before it reaches current_segment.
101+ let rollback_samples = ( ( self . config . rollback_duration_ms as f32 / 1000.0 )
102+ * self . config . sample_rate as f32 ) as usize ;
103+ while self . history_buffer . len ( ) > rollback_samples {
104+ self . history_buffer . pop_front ( ) ;
105+ }
64106
65- if probability > self . config . speech_threshold {
66- // Speech detected: reset the silence counter and count the speech chunk
67- self . silence_chunks = 0 ;
68- self . speech_chunks += 1 ;
69- self . current_segment . extend ( chunk) ;
107+ if probability > self . config . speech_threshold {
108+ // Speech detected: switch to the Recording state
109+ self . state = VadState :: Recording ;
110+ // Move the history buffer into the current segment (keeps the context at the start of speech)
111+ self . current_segment . extend ( self . history_buffer . iter ( ) ) ;
112+ self . history_buffer . clear ( ) ; // Empty the buffer
113+ self . silence_chunks = 0 ;
// NOTE(review): starts at 0, so the samples copied from history_buffer are not
// counted toward max_speech_duration_ms.
114+ self . speech_chunks = 0 ;
70115
71- // Check whether the maximum speech length was exceeded
72- let speech_duration_ms = self . speech_chunks as f32 * chunk_duration_ms;
73- if speech_duration_ms >= self . config . max_speech_duration_ms as f32 {
74- return self . finalize_segment ( ) ;
116+ // Reset the Waiting-related counters
117+ self . waiting_dropped_chunks = 0 ;
118+ self . notified_silence = false ;
119+ } else {
120+ // Still waiting: check whether a silence notification is due
// The counter only advances while notify_silence_after_ms is Some.
121+ if let Some ( limit_ms) = self . config . notify_silence_after_ms {
122+ self . waiting_dropped_chunks += 1 ;
123+ let dropped_duration = self . waiting_dropped_chunks as f32 * chunk_duration_ms;
// One-shot: notified_silence suppresses further notifications until it is reset.
124+ if dropped_duration >= limit_ms as f32 && !self . notified_silence {
125+ self . notified_silence = true ;
126+ return Some ( VadOutput :: SilenceNotification ) ;
127+ }
128+ }
129+ }
130+ None
75131 }
76- } else {
77- // Silence detected: increase the silence counter
78- self . silence_chunks += 1 ;
79- let silence_duration_ms = self . silence_chunks as f32 * chunk_duration_ms;
80- if !self . current_segment . is_empty ( )
81- && silence_duration_ms >= self . config . silence_duration_ms as f32
82- {
83- return self . finalize_segment ( ) ;
132+ VadState :: Recording => {
// Every chunk is recorded and counted, whether it is speech or silence.
133+ self . current_segment . extend ( chunk) ;
134+ self . speech_chunks += 1 ;
135+
136+ if probability > self . config . speech_threshold {
137+ self . silence_chunks = 0 ;
138+ // Check whether the maximum speech length was exceeded
139+ let speech_duration_ms = self . speech_chunks as f32 * chunk_duration_ms;
140+ if speech_duration_ms >= self . config . max_speech_duration_ms as f32 {
141+ // Force a cut
142+ return self . finalize_segment ( false ) ;
143+ }
144+ } else {
145+ self . silence_chunks += 1 ;
146+ let silence_duration_ms = self . silence_chunks as f32 * chunk_duration_ms;
147+ if silence_duration_ms >= self . config . silence_duration_ms as f32 {
148+ // Silence lasted too long: end the current segment
149+ // and trim the trailing silence
150+ return self . finalize_segment ( true ) ;
151+ }
152+ }
153+ None
84154 }
85- self . current_segment . extend ( chunk) ; // Keep the silent part too until the end of the segment is confirmed
86155 }
87-
88- None // Not finished yet, return None
89156 }
90157
91- pub fn finalize_segment ( & mut self ) -> Option < Vec < i16 > > {
158+ // trim_tail: whether to trim the trailing silence
// Ends the current recording: optionally drops the trailing silent chunks, discards
// segments shorter than min_speech_duration_ms, resets the processor, and returns
// Some(VadOutput::Segment) only when samples remain.
// Diff view: `-` lines belong to the previous rollback-based implementation.
159+ fn finalize_segment ( & mut self , trim_tail : bool ) -> Option < VadOutput > {
92160 if self . current_segment . is_empty ( ) {
161+ self . reset ( ) ;
93162 return None ;
94163 }
95164
96- let chunk_duration_ms = ( CHUNK_SIZE as f32 / self . config . sample_rate as f32 ) * 1000.0 ;
97- let rollback_chunks =
98- ( self . config . rollback_duration_ms as f32 / chunk_duration_ms) . ceil ( ) as usize ;
99- let rollback_samples = rollback_chunks * CHUNK_SIZE ;
// NOTE(review): both branches copy current_segment although reset() clears it right
// afterwards; taking the buffer (std::mem::take) and truncating would avoid the copy.
165+ let mut segment = if trim_tail {
166+ // Compute how many samples have to be trimmed
167+ let chunk_len = CHUNK_SIZE ;
168+ let silence_len = ( self . silence_chunks as usize ) * chunk_len;
// saturating_sub keeps valid_len at 0 when the silent tail is longer than the segment.
169+ let valid_len = self . current_segment . len ( ) . saturating_sub ( silence_len) ;
170+ if valid_len == 0 {
171+ Vec :: new ( ) // All silence?
172+ } else {
173+ self . current_segment [ ..valid_len] . to_vec ( )
174+ }
175+ } else {
176+ self . current_segment . clone ( )
177+ } ;
178+
179+ // Minimum length check
// NOTE(review): segment still contains the samples copied from history_buffer when
// recording started, so that context counts toward the minimum duration.
180+ let duration_ms =
181+ ( segment. len ( ) as f32 / self . config . sample_rate as f32 ) * 1000.0 ;
182+ if duration_ms < self . config . min_speech_duration_ms as f32 {
183+ // Speech is too short: discard it as noise
184+ segment. clear ( ) ; // Clear so that None is returned
185+ }
186+
187+ self . reset ( ) ;
100188
101- // Compute the number of rollback samples and split
102- let segment_len = self . current_segment . len ( ) ;
103- let rollback_start = segment_len. saturating_sub ( rollback_samples) ;
104- let segment = self . current_segment [ ..rollback_start] . to_vec ( ) ;
105- let rollback = self . current_segment [ rollback_start..] . to_vec ( ) ;
189+ if segment. is_empty ( ) {
190+ None
191+ } else {
192+ Some ( VadOutput :: Segment ( segment) )
193+ }
194+ }
106195
107- // Reset the current segment and queue the rolled-back part as pending
// Returns the processor to its initial Waiting state: clears both sample buffers and
// zeroes every counter and the one-shot notification flag.
// Diff view: `-` lines are the tail of the previous finalize_segment.
196+ fn reset ( & mut self ) {
108197 self . current_segment . clear ( ) ;
109- self . pending_samples . clear ( ) ;
110- self . pending_samples . extend ( rollback) ;
198+ self . history_buffer . clear ( ) ;
111199 self . silence_chunks = 0 ;
112200 self . speech_chunks = 0 ;
113-
114- Some ( segment)
201+ self . state = VadState :: Waiting ;
202+ // Reset the waiting-state bookkeeping
203+ self . waiting_dropped_chunks = 0 ;
204+ self . notified_silence = false ;
115205 }
116206
117- pub fn finish ( & mut self ) -> Option < Vec < i16 > > {
118- // 處理剩餘樣本作為最終段
207+ pub fn finish ( & mut self ) -> Option < VadOutput > {
208+ // 如果還在 Recording 狀態,返回剩餘內容
119209 if !self . current_segment . is_empty ( ) {
210+ // 對於最後一段,我們也要做最小長度檢查
211+ let duration_ms = ( self . current_segment . len ( ) as f32 / self . config . sample_rate as f32 ) * 1000.0 ;
212+ if duration_ms < self . config . min_speech_duration_ms as f32 {
213+ self . reset ( ) ;
214+ return None ;
215+ }
216+
120217 let segment = self . current_segment . clone ( ) ;
121- self . current_segment . clear ( ) ;
122- self . pending_samples . clear ( ) ;
123- self . silence_chunks = 0 ;
124- self . speech_chunks = 0 ;
125- Some ( segment)
218+ self . reset ( ) ;
219+ Some ( VadOutput :: Segment ( segment) )
126220 } else {
127221 None
128222 }
0 commit comments