@@ -76,7 +76,8 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                std::vector<int64_t>& input_ids,
                const ov::genai::WhisperGenerationConfig& config,
-               bool apply_logit_processors = true) {
+               const bool apply_logit_processors = true,
+               const bool return_timestamps = false) {
     decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});
 
     ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
@@ -90,7 +91,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
         ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens);
         ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
 
-        if (config.return_timestamps) {
+        if (return_timestamps) {
            ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true);
        }
    }
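
Note: `return_timestamps` is now threaded through `decode` as an explicit argument instead of being read from `config`, because the effective flag can differ from what the caller requested: long-form inputs force timestamp decoding so windows can be stitched (see `whisper_generate` below). A minimal sketch of the intended call shape, with `hidden_state`, `init_ids` and `is_shortform` standing in for values computed elsewhere in the pipeline:

    // The effective flag may be true even when the user's config left
    // return_timestamps off, because long-form audio needs timestamps.
    const bool return_timestamps = config.return_timestamps || !is_shortform;
    int64_t first_token = decode(hidden_state, models.decoder, init_ids, config,
                                 /*apply_logit_processors=*/true, return_timestamps);
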
@@ -105,6 +106,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
                          int64_t input_id,
                          const size_t cache_position,
                          const ov::genai::WhisperGenerationConfig& config,
+                         const bool return_timestamps,
                          const std::vector<int64_t>& generated_tokens) {
     decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});
 
@@ -122,7 +124,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
 
     ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);
 
-    if (config.return_timestamps) {
+    if (return_timestamps) {
         ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens);
     }
 
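
For context, the timestamp logit processing enforces Whisper's usual sampling rules: timestamp tokens come in begin/end pairs and never move backwards. A simplified sketch of the kind of masking `process_whisper_timestamp_logits` applies (hypothetical helper modeled on Hugging Face's `WhisperTimeStampLogitsProcessor`, not this library's exact implementation):

    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Token ids >= timestamp_begin are timestamp tokens.
    void mask_timestamp_logits(std::vector<float>& logits,
                               const int64_t timestamp_begin,
                               const int64_t eos_token_id,
                               const std::vector<int64_t>& generated) {
        constexpr float neg_inf = -std::numeric_limits<float>::infinity();
        const size_t n = generated.size();
        const bool last_was_ts = n >= 1 && generated[n - 1] >= timestamp_begin;
        const bool penult_was_ts = n >= 2 && generated[n - 2] >= timestamp_begin;

        if (last_was_ts && penult_was_ts) {
            // a <begin, end> timestamp pair just closed: next token must be text
            for (size_t i = static_cast<size_t>(timestamp_begin); i < logits.size(); ++i)
                logits[i] = neg_inf;
        } else if (last_was_ts) {
            // an open timestamp: next token must be another timestamp (or eos)
            for (int64_t i = 0; i < eos_token_id; ++i)
                logits[static_cast<size_t>(i)] = neg_inf;
        }

        // timestamps never decrease: mask timestamps before the last one seen
        int64_t last_ts = timestamp_begin;
        for (const int64_t t : generated)
            if (t >= timestamp_begin)
                last_ts = t;
        for (int64_t i = timestamp_begin; i < last_ts; ++i)
            logits[static_cast<size_t>(i)] = neg_inf;
    }
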
@@ -135,14 +137,15 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
                         ov::InferRequest decoder,
                         const ov::genai::WhisperGenerationConfig& config) {
     std::vector<int64_t> input_ids{config.decoder_start_token_id};
-    int64_t output_token = decode(encoder_hidden_state, decoder, input_ids, config, false);
+    int64_t output_token = decode(encoder_hidden_state, decoder, input_ids, config, false, false);
 
     return output_token;
 }
 
-std::vector<int64_t> prepare_input_ids(ov::Tensor& encoder_hidden_state,
-                                       ov::InferRequest decoder,
-                                       const ov::genai::WhisperGenerationConfig& config) {
+std::vector<int64_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
+                                      ov::InferRequest decoder,
+                                      const ov::genai::WhisperGenerationConfig& config,
+                                      const bool return_timestamps) {
     if (!config.is_multilingual) {
         return std::vector<int64_t>{config.decoder_start_token_id, config.no_timestamps_token_id};
     }
@@ -162,7 +165,7 @@ std::vector<int64_t> prepare_input_ids(ov::Tensor& encoder_hidden_state,
         task_token_id = config.translate_token_id;
     }
 
-    if (config.return_timestamps) {
+    if (return_timestamps) {
         return std::vector<int64_t>{config.decoder_start_token_id, language_token_id, task_token_id};
     }
 
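
For reference, the initial decoder prompts this function can now produce (the multilingual no-timestamps case falls through to the return just below this hunk, which appends the no-timestamps token; `sot` stands for `config.decoder_start_token_id`):

    // multilingual, return_timestamps == true : {sot, language, task}
    // multilingual, return_timestamps == false: {sot, language, task, no_timestamps}
    // not multilingual (early return above)   : {sot, no_timestamps}
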
@@ -175,11 +178,11 @@ std::vector<int64_t> prepare_input_ids(ov::Tensor& encoder_hidden_state,
 std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_state,
                                                   const ov::genai::WhisperGenerationConfig& config,
                                                   ov::genai::WhisperInitializedModels& models,
+                                                  std::vector<int64_t> init_ids,
                                                   const size_t max_new_tokens,
+                                                  const bool return_timestamps,
                                                   const std::shared_ptr<ov::genai::StreamerBase> streamer) {
-    std::vector<int64_t> input_ids = prepare_input_ids(encoder_hidden_state, models.decoder, config);
-
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, input_ids, config);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);
 
     std::vector<int64_t> output_tokens{output_token};
 
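
Hoisting `prepare_init_ids` out of `full_decode` means the init ids, including the detected language, are computed once per audio input and reused for every 30-second window, rather than re-running language detection per window. A sketch of the resulting caller shape (the loop body mirrors the `whisper_generate` hunk below):

    std::vector<int64_t> init_ids;  // stays empty until the first window
    // inside the per-window loop:
    if (init_ids.empty()) {
        init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps);
    }
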
@@ -198,8 +201,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
         auto output_token = decode_with_past(encoder_hidden_state,
                                              models.decoder_with_past,
                                              output_tokens.back(),
-                                             input_ids.size() + output_tokens.size() - 1,
+                                             init_ids.size() + output_tokens.size() - 1,
                                              config,
+                                             return_timestamps,
                                              output_tokens);
 
         if (i == 0) {
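
The cache position bookkeeping follows from the initial full `decode` pass having already filled the KV cache with the init ids. A worked example, assuming `init_ids.size() == 4`:

    // positions 0..3: init_ids, consumed by the initial decode() pass
    // position  4   : 1st sampled token fed back in
    //                 (init_ids.size() + output_tokens.size() - 1 = 4 + 1 - 1)
    // position  5   : 2nd sampled token (4 + 2 - 1), and so on
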
@@ -225,52 +229,75 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
 
 namespace ov {
 namespace genai {
-// hf has 2 algos for handling long (>30s) audios https://huggingface.co/openai/whisper-large-v3#chunked-long-form
-// Sequential: uses a "sliding window" for buffered inference, transcribing 30-second slices one after the other
-// Chunked: splits long audio files into shorter ones (with a small overlap between segments), transcribes each segment
-// independently, and stitches the resulting transcriptions at the boundaries
-
-// By default, Transformers uses the sequential algorithm. To enable the chunked algorithm, pass the chunk_length_s
-// parameter to the pipeline. A chunk length of 30 seconds is optimal. Sequential algo:
-// 1. Process whole raw speech into mel spectrogram
-// 2. Chunk mel spectrogram into 30s
-// 3. Enable timestamps
-// 4. Process each chunk sequentially.
-// 5. For each chunk stop at first eos token. Start next window from last timestamp found.
-//    remove eos tokens if not finished yet
-//    remove pad tokens
-// 6. Concatenate output tokens
+
 std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_generate(
     const ov::genai::WhisperGenerationConfig& config,
     const ov::genai::WhisperConfig& model_config,
     const RawSpeechInput& raw_speech,
     ov::genai::WhisperInitializedModels& models,
     WhisperFeatureExtractor& feature_extractor,
     const std::shared_ptr<StreamerBase> streamer) {
+    auto input_features = feature_extractor.extract(raw_speech);
+
+    const bool is_shortform = input_features.n_frames <= feature_extractor.nb_max_frames;
+    // long-form audio processing requires timestamps to be enabled
+    const bool return_timestamps = config.return_timestamps || !is_shortform;
+
+    std::vector<int64_t> init_ids;
     std::vector<int64_t> output_tokens;
     size_t max_new_tokens = config.get_max_new_tokens();
 
-    for (size_t chunk_offset = 0; chunk_offset < raw_speech.size(); chunk_offset += feature_extractor.n_samples) {
+    std::vector<Segment> segments;
+
+    // 0.02 by default
+    const float time_precision = static_cast<float>(feature_extractor.chunk_length) / model_config.max_source_positions;
+    size_t segment_offset = 0;
+
+    for (size_t chunk_offset = 0; chunk_offset < input_features.n_frames; chunk_offset += segment_offset) {
         if (output_tokens.size() >= max_new_tokens) {
             break;
         }
 
-        // Split audio data into fixed feature_extractor.chunk_size windows.
-        size_t copy_size = std::min((raw_speech.size() - chunk_offset), size_t(feature_extractor.n_samples));
-        std::vector<float> input_features_sub_chunk(raw_speech.begin() + chunk_offset,
-                                                    raw_speech.begin() + chunk_offset + copy_size);
+        auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames);
 
-        auto input_features = feature_extractor.extract(input_features_sub_chunk);
+        ov::Tensor hidden_state_tensor = encode(models.encoder,
+                                                input_features_chunk,
+                                                feature_extractor.feature_size,
+                                                feature_extractor.nb_max_frames);
 
-        ov::Tensor hidden_state_tensor =
-            encode(models.encoder, input_features, feature_extractor.feature_size, feature_extractor.nb_max_frames);
+        // prepare init_ids just once for the whole input
+        if (init_ids.empty()) {
+            init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps);
+        }
 
-        bool cancelled;
-        std::vector<int64_t> chunk_output_tokens;
-        std::tie(cancelled, chunk_output_tokens) =
-            full_decode(hidden_state_tensor, config, models, max_new_tokens - output_tokens.size(), streamer);
+        auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
+                                                            config,
+                                                            models,
+                                                            init_ids,
+                                                            max_new_tokens - output_tokens.size(),
+                                                            return_timestamps,
+                                                            streamer);
+
+        if (return_timestamps) {
+            auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens,
+                                                                  config,
+                                                                  feature_extractor.nb_max_frames,
+                                                                  time_precision);
+
+            segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
+
+            output_tokens.insert(output_tokens.end(),
+                                 extracted_segments.non_timestamp_tokens.begin(),
+                                 extracted_segments.non_timestamp_tokens.end());
+
+            segment_offset = extracted_segments.last_offset;
+        } else {
+            output_tokens.insert(output_tokens.end(), chunk_output_tokens.begin(), chunk_output_tokens.end());
+        }
 
-        output_tokens.insert(output_tokens.end(), chunk_output_tokens.begin(), chunk_output_tokens.end());
+        if (is_shortform) {
+            segment_offset = input_features.n_frames;
+        }
 
         if (cancelled) {
             break;
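
On the timestamp arithmetic: the feature extractor's `chunk_length` of 30 s maps onto `max_source_positions` = 1500 timestamp positions, so each timestamp token advances 30 / 1500 = 0.02 s, which is the `time_precision` computed above. A sketch of how that drives window stitching, assuming Whisper's usual 100 mel frames per second:

    // a timestamp token k steps past timestamp_begin maps to seconds as
    //   t = k * time_precision;   // e.g. k = 564  ->  11.28 s within this window
    // last_offset is measured in mel frames, so a final complete segment ending
    // at 11.28 s advances the window by roughly 11.28 * 100 = 1128 frames,
    // i.e. the next window starts at the last timestamp found
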
@@ -281,12 +308,9 @@ std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_gen
         streamer->end();
     }
 
-    std::optional<std::vector<Segment>> segments = std::nullopt;
-    if (config.return_timestamps) {
-        // 0.02 by default
-        const float time_precision =
-            static_cast<float>(feature_extractor.chunk_length) / model_config.max_source_positions;
-        std::tie(output_tokens, segments) = ov::genai::extract_segments(output_tokens, config, time_precision);
+    // if return_timestamps wasn't enabled by the user
+    if (!config.return_timestamps) {
+        return {output_tokens, std::nullopt};
     }
 
     return {output_tokens, segments};