88#include < string>
99#include < whisper.h>
1010#include < sstream>
11-
11+ # include < speex/speex_preprocess.h >
1212using namespace stream_components ;
1313
14- bool processAudio (WhisperService service, std::vector<float > pcm32, const whisper_local_stream_params ¶ms);
15-
1614int main (int argc, char **argv) {
1715 // Read parameters...
1816 whisper_local_stream_params params;
@@ -39,6 +37,8 @@ int main(int argc, char **argv) {
3937 stream_components::WhisperService whisperService (params.service , params.audio , cparams);
4038
4139 const int port = 8090 ;
40+ std::mutex whisper_mutex;
41+
4242
4343 // started handler
4444 auto started_handler = [](auto *token) {
@@ -65,6 +65,7 @@ int main(int argc, char **argv) {
6565 thread_local wav_writer wavWriter;
6666 thread_local std::string filename;
6767
68+
6869 nlohmann::json response;
6970 if (opCode == uWS::OpCode::TEXT) {
7071 // printf("%s: Received message on /streaming/save: %s\n", get_current_time().c_str(),std::string(message).c_str());
@@ -93,7 +94,7 @@ int main(int argc, char **argv) {
9394 // process binary message(PCM16 data)
9495 auto size = message.size ();
9596 std::basic_string_view<char , std::char_traits<char >>::const_pointer data = message.data ();
96- // printf("%s: Received message size on /streaming/save: %zu\n", get_current_time().c_str(), size);
97+ printf (" %s: Received message size on /streaming/save: %zu\n " , get_current_time ().c_str (), size);
9798 // add received PCM16 to audio cache
9899 std::vector<int16_t > pcm16 (size / 2 );
99100 std::memcpy (pcm16.data (), data, size);
@@ -104,14 +105,17 @@ int main(int argc, char **argv) {
104105 };
105106
106107 // WebSocket /paddlespeech/asr/streaming handler
107- auto ws_streaming_handler = [&whisperService, ¶ms](auto *ws, std::string_view message, uWS::OpCode opCode) {
108+ auto ws_streaming_handler = [&whisperService, ¶ms, &whisper_mutex ](auto *ws, std::string_view message, uWS::OpCode opCode) {
108109 thread_local std::vector<float > audioBuffer; // thread-localized variable
109110 thread_local wav_writer wavWriter;
110111 thread_local std::string filename;
111- thread_local bool is_last_active = false ;
112+ thread_local bool last_is_speech = false ;
113+ thread_local int chunk_size = 160 ; // 适用于 16 kHz 采样率的 100 毫秒帧
114+ thread_local SpeexPreprocessState *st;
115+
112116 // std::unique_ptr<nlohmann::json> results(new nlohmann::json(nlohmann::json::array()));
113117 thread_local nlohmann::json final_results;
114- auto thread_id = std::this_thread::get_id ();
118+ // auto thread_id = std::this_thread::get_id();
115119 // std::cout << get_current_time().c_str() << ": Handling a message in thread: " << thread_id << std::endl;
116120 nlohmann::json response;
117121 if (opCode == uWS::OpCode::TEXT) {
@@ -122,45 +126,50 @@ int main(int argc, char **argv) {
122126 auto jsonMsg = nlohmann::json::parse (message);
123127 std::string signal = jsonMsg[" signal" ];
124128 if (signal == " start" ) {
129+ printf (" %s start\n " ,get_current_time ().c_str ());
130+
125131 if (jsonMsg[" name" ].is_string ()) {
126132 filename = jsonMsg[" name" ];
127133 } else {
128134 filename = std::to_string (get_current_time_millis ()) + " .wav" ;
129135 }
130- final_results = nlohmann::json (nlohmann::json::array ());
131136 // 发送服务器准备好的消息
132137 response = {{" status" , " ok" },
133138 {" signal" , " server_ready" }};
134139 ws->send (response.dump (), uWS::OpCode::TEXT);
135140 wavWriter.open (filename, WHISPER_SAMPLE_RATE, 16 , 1 );
141+ st = speex_preprocess_state_init (chunk_size, WHISPER_SAMPLE_RATE);
142+ int vad = 1 ;
143+ speex_preprocess_ctl (st, SPEEX_PREPROCESS_SET_VAD, &vad);
144+
136145 }
137146 if (signal == " end" ) {
138- printf (" %s end\n " );
139- wavWriter.close ();
147+ printf (" %s end\n " ,get_current_time ().c_str ());
140148// nlohmann::json response = {{"name",filename},{"signal", signal}};
141149 response = {{" name" , filename},
142150 {" signal" , signal}};
143- printf (" %s:buffer size:%d \n " ,get_current_time ().c_str (),audioBuffer.size ());
151+ printf (" %s:buffer size:%lu \n " ,get_current_time ().c_str (),audioBuffer.size ());
144152 bool isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
145153 if (isOk) {
146154 final_results = get_result (whisperService.ctx );
147155 response[" result" ] = final_results;
148156 }
149157 ws->send (response.dump (), uWS::OpCode::TEXT);
158+ wavWriter.close ();
159+ speex_preprocess_state_destroy (st);
150160 }
151161 // other process logic...
152162 } catch (const std::exception &e) {
153163 std::cerr << " JSON parse error: " << e.what () << std::endl;
154164 auto size = message.size ();
155165 }
156166 } else if (opCode == uWS::OpCode::BINARY) {
157- int size = message.size ();
158167 // process binary message(PCM16 data)
168+ auto size = message.size ();
159169 std::basic_string_view<char , std::char_traits<char >>::const_pointer data = message.data ();
160170 printf (" %s: Received message size on /paddlespeech/asr/streaming: %zu\n " , get_current_time ().c_str (), size);
161171 // add received PCM16 to audio cache
162172 std::vector<int16_t > pcm16 (size / 2 );
163-
164173 std::memcpy (pcm16.data (), data, size);
165174 // write to file
166175 wavWriter.write (pcm16.data (), size / 2 );
@@ -172,28 +181,41 @@ int main(int argc, char **argv) {
172181 // insert to audio_buffer
173182 audioBuffer.insert (audioBuffer.end (), temp.begin (), temp.end ());
174183
175- printf (" %s:buffer size:%d \n " ,get_current_time ().c_str (),audioBuffer.size ());
184+ // printf("%s:buffer size:% ld \n",get_current_time().c_str(),audioBuffer.size());
176185 // 如果开启了VAD
177186 bool isOk;
178187 // printf("%s: use_vad: %d\n", get_current_time().c_str(), params.audio.use_vad);
179188 if (params.audio .use_vad ) {
189+ whisper_mutex.lock ();
190+ for (size_t i = 0 ; i < pcm16.size (); i += chunk_size) {
191+ spx_int16_t frame[chunk_size];
192+ for (int j = 0 ; j < chunk_size; ++j) {
193+ if (i + j < pcm16.size ()) {
194+ frame[j] = (spx_int16_t )(pcm16[i + j]);
195+ } else {
196+ frame[j] = 0 ; // 对于超出范围的部分填充 0
197+ }
198+ }
199+ int is_speech = speex_preprocess_run (st, frame);
200+
201+ // printf("%s: is_active: %d,is_last_active %d\n", get_current_time().c_str(), is_speech, last_is_speech);
202+ if (!is_speech && last_is_speech) {
203+ isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
204+ audioBuffer.clear ();
205+ break ;
206+ }
207+ last_is_speech = is_speech != 0 ;
180208
181- bool is_active = ::vad_simple (audioBuffer, WHISPER_SAMPLE_RATE, 1000 , params.audio .vad_thold ,
182- params.audio .freq_thold , false );
183- printf (" %s: is_active: %d,is_last_active %d\n " , get_current_time ().c_str (), is_active, is_last_active);
184- if (!is_active && is_last_active) {
185- is_last_active = false ;
186- isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
187- audioBuffer.clear ();
188- } else {
189- is_last_active = is_active;
190209 }
210+ whisper_mutex.unlock ();
191211 } else {
192212 // asr
213+ whisper_mutex.lock ();
193214 isOk = whisperService.process (audioBuffer.data (), audioBuffer.size ());
194215 audioBuffer.clear ();
216+ whisper_mutex.unlock ();
195217 }
196- printf (" %s: is_ok: %d \n " , get_current_time ().c_str (), isOk);
218+ // printf("%s: is_ok: %d \n", get_current_time().c_str(), isOk);
197219 if (isOk) {
198220 final_results = get_result (whisperService.ctx );
199221 response[" result" ] = final_results;
@@ -221,18 +243,6 @@ int main(int argc, char **argv) {
221243 .listen (port, started_handler).run ();
222244}
223245
224- bool processAudio (WhisperService whisperService, std::vector<float > pcm32, const whisper_local_stream_params ¶ms) {
225- if (params.audio .use_vad ) {
226- // printf("%s: vad: %d \n", get_current_time().c_str(), params.audio.use_vad);
227- // TODO: 实现VAD处理,
228- // bool containsVoice = vad_simple(audioBuffer, WHISPER_SAMPLE_RATE, 1000, params.audio.vad_thold, params.audio.freq_thold, false);
229- return whisperService.process (pcm32.data (), pcm32.size ());
230- } else {
231- // asr
232- return whisperService.process (pcm32.data (), pcm32.size ());
233- }
234- }
235-
236246
237247
238248
0 commit comments