Commit 0aa6742

Whisper pipeline: support long-form audio (#941)
This PR adds:
- [x] Long-form audio support with sequential chunking.

Common todos for Whisper support:
- [ ] Long-form audio support with [parallel chunking](https://huggingface.co/blog/asr-chunking)
- [ ] Add perf metrics
- [ ] Update documentation
- [ ] Add C++ and Python sample tests
- [ ] Support timestamps streaming
- [ ] Expose only meaningful parameters in `GenerationConfig` (`task`, `language`, `return_timestamps`, etc.)
- [ ] Move all Whisper pipeline files to a dedicated subfolder
- [ ] The Whisper pipeline doesn't need a tokenizer, it uses the detokenizer only. Implement detokenizer-only initialization for `ov::genai::Tokenizer`
- [ ] Check discrete GPU. Integrated GPU works as expected
- [ ] Investigate use of `RemoteTensor` for GPU
- [ ] Add batching
- [ ] Add sampler, inherit `WhisperGenerationConfig` from `GenerationConfig`
- [ ] Investigate language autodetection with a single decoder (without past) call
- [ ] Update Python bindings cmake to include the whole directory instead of an explicit list of files
- [ ] Add samples with audio preparation examples
- [ ] Add links to audio files so users can download them in samples
- [ ] Move the supported models list from the samples README to the common supported models section
- [ ] Avoid building GenAI in each test job, as it takes a lot of time
- [ ] Double-check FP32 support
- [ ] Fix sporadic test failures: sometimes the Whisper model cannot be downloaded from HF due to network issues
- [ ] Fix the stop criteria. The current approach stops on eos_token, which is the no-speech token, but there could be more speech tokens further on that are wrongly skipped now

Completed:
- [x] Support different languages, language autodetection
- [x] Support translation
- [x] Support timestamps

Current limitations:
- No resampling during preprocessing. Input raw speech should have a 16 kHz sampling rate
- No normalization during preprocessing. Input raw speech should be normalized to the near [-1, 1] range (see the preprocessing sketch below)

Tickets: CVS-147994, CVS-146010, CVS-152542

---------

Co-authored-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
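The preprocessing limitations above are left to the caller. A minimal sketch of that preparation step, assuming `librosa` (the dependency this PR adds to `samples/requirements.txt`) and an illustrative file name:

```python
import librosa

# librosa.load resamples to the requested rate and returns float32 samples
# already scaled to [-1, 1], which covers both limitations listed above.
# "sample.wav" is a placeholder, not a file shipped with the samples.
raw_speech, _ = librosa.load("sample.wav", sr=16000, mono=True)
```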
1 parent 00e532d commit 0aa6742

File tree

13 files changed: +303 -164 lines changed


.github/workflows/linux.yml

Lines changed: 1 addition & 1 deletion
@@ -347,7 +347,7 @@ jobs:
       run: |
         source ${OV_INSTALL_DIR}/setupvars.sh
         python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-        python -m pytest ./tests/python_tests/test_whisper_generate_api.py
+        python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
       env:
         PYTHONPATH: "./build/:$PYTHONPATH"

.github/workflows/mac.yml

Lines changed: 1 addition & 1 deletion
@@ -291,7 +291,7 @@ jobs:
       run: |
         source ${OV_INSTALL_DIR}/setupvars.sh
         python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-        python -m pytest ./tests/python_tests/test_whisper_generate_api.py
+        python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
       env:
         PYTHONPATH: "./build/:$PYTHONPATH"

.github/workflows/windows.yml

Lines changed: 2 additions & 2 deletions
@@ -301,7 +301,7 @@ jobs:
       run: |
         . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
         python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
-        python -m pytest ./tests/python_tests/test_whisper_generate_api.py
+        python -m pytest ./tests/python_tests/test_whisper_generate_api.py -k test_smoke
       env:
         PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.
@@ -365,7 +365,7 @@ jobs:
     - name: Test bindings
       run: |
         . "${{ env.OV_INSTALL_DIR }}/setupvars.ps1"
-        python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/tools --upgrade-strategy eager
+        python -m pip install ./thirdparty/openvino_tokenizers/[transformers] -r ./tests/python_tests/requirements.txt --find-links ${env:OV_INSTALL_DIR}/wheels --upgrade-strategy eager
         python -m pytest ./tests/python_tests/test_vlm_api.py
       env:
         PYTHONPATH: "./build/" # cmd evaluates variables in a different way. Setting PYTHONPATH before setupvars.bat instead of doing that after solves that.

samples/python/whisper_speech_recognition/whisper_speech_recognition.py

Lines changed: 2 additions & 2 deletions
@@ -36,11 +36,11 @@ def streamer(word: str) -> bool:
         streamer=streamer,
     )

+    print()
+
     for chunk in result.chunks:
         print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")

-    print()
-

 if "__main__" == __name__:
     main()

samples/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -4,4 +4,5 @@ numpy<2.0.0; sys_platform == 'darwin'
 einops==0.8.0 # For Qwen
 transformers_stream_generator==0.0.5 # For Qwen
 diffusers==0.30.3
+librosa # For Whisper
 torchvision # needed for mini-CPM export script. Need to remove when we switch to exporting with optimum-intel.

src/cpp/src/whisper/logit_processor.cpp

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ void process_whisper_timestamp_logits(ov::Tensor& logits,

     if (last_was_timestamp) {
         if (penultimate_was_timestamp) {
-            // has to be timestamp
+            // has to be non-timestamp
            for (size_t i = timestamp_begin; i < vocab_size; i++) {
                logits_data[i] = -std::numeric_limits<float>::infinity();
            }
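The corrected comment matches Whisper's timestamp decoding rule: timestamps are emitted in pairs (the end of one segment immediately followed by the start of the next), so after two consecutive timestamp tokens the boundary is complete and the next token must be text. A standalone sketch of the rule, assuming a NumPy formulation with illustrative names rather than the pipeline's actual API:

```python
import numpy as np

# Token ids >= timestamp_begin are timestamp tokens. After a <ts><ts> pair
# the boundary is complete, so ban every timestamp id; after a single <ts>
# the decoder still has to close the pair, so ban the text ids instead
# (eos handling is omitted for brevity).
def apply_timestamp_rule(logits: np.ndarray,
                         timestamp_begin: int,
                         last_was_timestamp: bool,
                         penultimate_was_timestamp: bool) -> None:
    if not last_was_timestamp:
        return
    if penultimate_was_timestamp:
        logits[timestamp_begin:] = -np.inf  # has to be non-timestamp
    else:
        logits[:timestamp_begin] = -np.inf  # has to be timestamp
```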

src/cpp/src/whisper/timestamps.cpp

Lines changed: 26 additions & 12 deletions
@@ -6,12 +6,11 @@
 namespace ov {
 namespace genai {

-std::pair<std::vector<int64_t>, std::vector<ov::genai::Segment>> extract_segments(
-    const std::vector<int64_t>& tokens,
-    const ov::genai::WhisperGenerationConfig& config,
-    const float time_precision) {
-    std::vector<int64_t> non_timestamp_tokens;
-    std::vector<ov::genai::Segment> segments;
+ov::genai::ExtractedSegments extract_segments(const std::vector<int64_t>& tokens,
+                                              const ov::genai::WhisperGenerationConfig& config,
+                                              const size_t nb_max_frames,
+                                              const float time_precision) {
+    ov::genai::ExtractedSegments extracted_segments;
     std::optional<int64_t> token_start = std::nullopt;
     size_t idx_start = 0;
@@ -41,9 +40,14 @@ std::pair<std::vector<int64_t>, std::vector<ov::genai::Segment>> extract_segments
             segment.m_tokens = {tokens.begin() + idx_start + 1, tokens.begin() + i};
             segment.m_start = (*token_start - config.begin_timestamps_token_id) * time_precision;
             segment.m_end = (token - config.begin_timestamps_token_id) * time_precision;
-            segments.push_back(segment);
+            extracted_segments.segments.push_back(segment);

-            non_timestamp_tokens.insert(non_timestamp_tokens.end(), tokens.begin() + idx_start + 1, tokens.begin() + i);
+            // each next timestamp token represents .02 time diff
+            extracted_segments.last_offset = (token - config.begin_timestamps_token_id) * 2;
+
+            extracted_segments.non_timestamp_tokens.insert(extracted_segments.non_timestamp_tokens.end(),
+                                                           tokens.begin() + idx_start + 1,
+                                                           tokens.begin() + i);

             token_start = std::nullopt;
         }
@@ -53,18 +57,28 @@ std::pair<std::vector<int64_t>, std::vector<ov::genai::Segment>> extract_segments
     // add new segment only if it has non timestamps tokens
     // do not add new segment if previous segments exists
     bool has_tokens_to_add = idx_start < tokens.size() - 1;
-    bool has_previous_segments = segments.size() > 0;
+    bool has_previous_segments = extracted_segments.segments.size() > 0;
     if (token_start.has_value() && has_tokens_to_add && !has_previous_segments) {
         ov::genai::Segment segment;
         segment.m_tokens = {tokens.begin() + idx_start + 1, tokens.end()};
         segment.m_start = (*token_start - config.begin_timestamps_token_id) * time_precision;
         segment.m_end = -1.0f;
-        segments.push_back(segment);
+        extracted_segments.segments.push_back(segment);
+
+        extracted_segments.last_offset = nb_max_frames;
+
+        extracted_segments.non_timestamp_tokens.insert(extracted_segments.non_timestamp_tokens.end(),
+                                                       tokens.begin() + idx_start + 1,
+                                                       tokens.end());
+    }

-        non_timestamp_tokens.insert(non_timestamp_tokens.end(), tokens.begin() + idx_start + 1, tokens.end());
+    // last timestamps generated in pairs <ts><ts><eos> -> speech segment continuation to the next chunk -> token_start will have value
+    // single ending timestamp <ts><eos> -> no more speech till the end of current chunk -> set offset to the end of frame
+    if (!token_start.has_value()) {
+        extracted_segments.last_offset = nb_max_frames;
     }

-    return {non_timestamp_tokens, segments};
+    return extracted_segments;
 }

 } // namespace genai
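The `last_offset` arithmetic above drives the sliding window: each timestamp token sits one `time_precision` step (0.02 s by default) above `begin_timestamps_token_id`, and one step spans two 10 ms mel frames, hence the `* 2`. A worked example, assuming ids that mirror the multilingual Whisper vocabulary (the concrete numbers are assumptions, not pipeline API):

```python
begin_ts_id = 50364           # assumed id of <|0.00|> in multilingual Whisper
time_precision = 30.0 / 1500  # chunk_length / max_source_positions = 0.02 s

# Suppose a chunk's decoding ends with the pair <|7.24|><|7.24|><eos>,
# i.e. the speech segment continues into the next window:
last_ts = begin_ts_id + 362   # <|7.24|>

segment_end = (last_ts - begin_ts_id) * time_precision  # ~7.24 s
last_offset = (last_ts - begin_ts_id) * 2               # 724 mel frames

print(segment_end, last_offset)
```

A single ending timestamp (`<ts><eos>`) means no further speech in the chunk, so `last_offset` falls back to `nb_max_frames` and the window advances by the full 30 s.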

src/cpp/src/whisper/timestamps.hpp

Lines changed: 10 additions & 4 deletions
@@ -10,10 +10,16 @@
 namespace ov {
 namespace genai {

-std::pair<std::vector<int64_t>, std::vector<ov::genai::Segment>> extract_segments(
-    const std::vector<int64_t>& tokens,
-    const ov::genai::WhisperGenerationConfig& config,
-    const float time_precision);
+struct ExtractedSegments {
+    std::vector<ov::genai::Segment> segments;
+    size_t last_offset;
+    std::vector<int64_t> non_timestamp_tokens;
+};
+
+ExtractedSegments extract_segments(const std::vector<int64_t>& tokens,
+                                   const ov::genai::WhisperGenerationConfig& config,
+                                   const size_t nb_max_frames,
+                                   const float time_precision);

 } // namespace genai
 } // namespace ov

src/cpp/src/whisper/whisper.cpp

Lines changed: 70 additions & 46 deletions
@@ -76,7 +76,8 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
                ov::InferRequest& decoder,
                std::vector<int64_t>& input_ids,
                const ov::genai::WhisperGenerationConfig& config,
-               bool apply_logit_processors = true) {
+               const bool apply_logit_processors = true,
+               const bool return_timestamps = false) {
     decoder.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});

     ov::Tensor input_ids_tensor(ov::element::i64, {1, input_ids.size()}, input_ids.data());
@@ -90,7 +91,7 @@ int64_t decode(ov::Tensor& encoder_hidden_state,
         ov::genai::do_suppress_tokens(output_tensor, 0, config.begin_suppress_tokens);
         ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);

-        if (config.return_timestamps) {
+        if (return_timestamps) {
             ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, {}, true);
         }
     }
@@ -105,6 +106,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,
                          int64_t input_id,
                          const size_t cache_position,
                          const ov::genai::WhisperGenerationConfig& config,
+                         const bool return_timestamps,
                          const std::vector<int64_t>& generated_tokens) {
     decoder_with_past.set_tensor("encoder_hidden_states", ov::Tensor{encoder_hidden_state});

@@ -122,7 +124,7 @@ int64_t decode_with_past(ov::Tensor& encoder_hidden_state,

     ov::genai::do_suppress_tokens(output_tensor, 0, config.suppress_tokens);

-    if (config.return_timestamps) {
+    if (return_timestamps) {
         ov::genai::process_whisper_timestamp_logits(output_tensor, 0, config, generated_tokens);
     }

@@ -135,14 +137,15 @@ int64_t detect_language(ov::Tensor& encoder_hidden_state,
                         ov::InferRequest decoder,
                         const ov::genai::WhisperGenerationConfig& config) {
     std::vector<int64_t> input_ids{config.decoder_start_token_id};
-    int64_t output_token = decode(encoder_hidden_state, decoder, input_ids, config, false);
+    int64_t output_token = decode(encoder_hidden_state, decoder, input_ids, config, false, false);

     return output_token;
 }

-std::vector<int64_t> prepare_input_ids(ov::Tensor& encoder_hidden_state,
-                                       ov::InferRequest decoder,
-                                       const ov::genai::WhisperGenerationConfig& config) {
+std::vector<int64_t> prepare_init_ids(ov::Tensor& encoder_hidden_state,
+                                      ov::InferRequest decoder,
+                                      const ov::genai::WhisperGenerationConfig& config,
+                                      const bool return_timestamps) {
     if (!config.is_multilingual) {
         return std::vector<int64_t>{config.decoder_start_token_id, config.no_timestamps_token_id};
     }
@@ -162,7 +165,7 @@ std::vector<int64_t> prepare_input_ids(ov::Tensor& encoder_hidden_state,
         task_token_id = config.translate_token_id;
     }

-    if (config.return_timestamps) {
+    if (return_timestamps) {
         return std::vector<int64_t>{config.decoder_start_token_id, language_token_id, task_token_id};
     }

@@ -175,11 +178,11 @@ std::vector<int64_t> prepare_input_ids(ov::Tensor& encoder_hidden_state,
 std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_state,
                                                   const ov::genai::WhisperGenerationConfig& config,
                                                   ov::genai::WhisperInitializedModels& models,
+                                                  std::vector<int64_t> init_ids,
                                                   const size_t max_new_tokens,
+                                                  const bool return_timestamps,
                                                   const std::shared_ptr<ov::genai::StreamerBase> streamer) {
-    std::vector<int64_t> input_ids = prepare_input_ids(encoder_hidden_state, models.decoder, config);
-
-    int64_t output_token = decode(encoder_hidden_state, models.decoder, input_ids, config);
+    int64_t output_token = decode(encoder_hidden_state, models.decoder, init_ids, config, true, return_timestamps);

     std::vector<int64_t> output_tokens{output_token};

@@ -198,8 +201,9 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta
         auto output_token = decode_with_past(encoder_hidden_state,
                                              models.decoder_with_past,
                                              output_tokens.back(),
-                                             input_ids.size() + output_tokens.size() - 1,
+                                             init_ids.size() + output_tokens.size() - 1,
                                              config,
+                                             return_timestamps,
                                              output_tokens);

         if (i == 0) {
@@ -225,52 +229,75 @@ std::pair<bool, std::vector<int64_t>> full_decode(ov::Tensor& encoder_hidden_sta

 namespace ov {
 namespace genai {
-// hf hash 2 algos for handling long (>30s) audios https://huggingface.co/openai/whisper-large-v3#chunked-long-form
-// Sequential: uses a "sliding window" for buffered inference, transcribing 30-second slices one after the other
-// Chunked: splits long audio files into shorter ones (with a small overlap between segments), transcribes each segment
-// independently, and stitches the resulting transcriptions at the boundaries
-
-// By default, Transformers uses the sequential algorithm. To enable the chunked algorithm, pass the chunk_length_s
-// parameter to the pipeline. A chunk length of 30-seconds is optimal. Sequential algo:
-// 1. Process whole raw speech into mel spectrogram
-// 2. Chunk mel spectrogram into 30s
-// 3. Enable timestamps
-// 4. Process each chunk sequentially.
-// 5. For each chunk stop at first eos token. Start next window from last timestamp found.
-//    remove eos tokens if not finished yet
-//    remove pad tokens
-// 7. Concatenate output tokens
+
 std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_generate(
     const ov::genai::WhisperGenerationConfig& config,
     const ov::genai::WhisperConfig& model_config,
     const RawSpeechInput& raw_speech,
     ov::genai::WhisperInitializedModels& models,
     WhisperFeatureExtractor& feature_extractor,
     const std::shared_ptr<StreamerBase> streamer) {
+    auto input_features = feature_extractor.extract(raw_speech);
+
+    const bool is_shortform = input_features.n_frames <= feature_extractor.nb_max_frames;
+    // long-form audio processing requires timestamps to be enabled
+    const bool return_timestamps = config.return_timestamps || !is_shortform;
+
+    std::vector<int64_t> init_ids;
     std::vector<int64_t> output_tokens;
     size_t max_new_tokens = config.get_max_new_tokens();

-    for (size_t chunk_offset = 0; chunk_offset < raw_speech.size(); chunk_offset += feature_extractor.n_samples) {
+    std::vector<Segment> segments;
+
+    // 0.02 by default
+    const float time_precision = static_cast<float>(feature_extractor.chunk_length) / model_config.max_source_positions;
+    size_t segment_offset = 0;
+
+    for (size_t chunk_offset = 0; chunk_offset < input_features.n_frames; chunk_offset += segment_offset) {
         if (output_tokens.size() >= max_new_tokens) {
             break;
         }

-        // Split audio data into fixed feature_extractor.chunk_size windows.
-        size_t copy_size = std::min((raw_speech.size() - chunk_offset), size_t(feature_extractor.n_samples));
-        std::vector<float> input_features_sub_chunk(raw_speech.begin() + chunk_offset,
-                                                    raw_speech.begin() + chunk_offset + copy_size);
+        auto input_features_chunk = input_features.get_data_with_offset(chunk_offset, feature_extractor.nb_max_frames);

-        auto input_features = feature_extractor.extract(input_features_sub_chunk);
+        ov::Tensor hidden_state_tensor = encode(models.encoder,
+                                                input_features_chunk,
+                                                feature_extractor.feature_size,
+                                                feature_extractor.nb_max_frames);

-        ov::Tensor hidden_state_tensor =
-            encode(models.encoder, input_features, feature_extractor.feature_size, feature_extractor.nb_max_frames);
+        // prepare init_ids just once for whole input
+        if (init_ids.empty()) {
+            init_ids = prepare_init_ids(hidden_state_tensor, models.decoder, config, return_timestamps);
+        }

-        bool cancelled;
-        std::vector<int64_t> chunk_output_tokens;
-        std::tie(cancelled, chunk_output_tokens) =
-            full_decode(hidden_state_tensor, config, models, max_new_tokens - output_tokens.size(), streamer);
+        auto [cancelled, chunk_output_tokens] = full_decode(hidden_state_tensor,
+                                                            config,
+                                                            models,
+                                                            init_ids,
+                                                            max_new_tokens - output_tokens.size(),
+                                                            return_timestamps,
+                                                            streamer);
+
+        if (return_timestamps) {
+            auto extracted_segments = ov::genai::extract_segments(chunk_output_tokens,
+                                                                  config,
+                                                                  feature_extractor.nb_max_frames,
+                                                                  time_precision);
+
+            segments.insert(segments.end(), extracted_segments.segments.begin(), extracted_segments.segments.end());
+
+            output_tokens.insert(output_tokens.end(),
+                                 extracted_segments.non_timestamp_tokens.begin(),
+                                 extracted_segments.non_timestamp_tokens.end());
+
+            segment_offset = extracted_segments.last_offset;
+        } else {
+            output_tokens.insert(output_tokens.end(), chunk_output_tokens.begin(), chunk_output_tokens.end());
+        }

-        output_tokens.insert(output_tokens.end(), chunk_output_tokens.begin(), chunk_output_tokens.end());
+        if (is_shortform) {
+            segment_offset = input_features.n_frames;
+        }

         if (cancelled) {
             break;
@@ -281,12 +308,9 @@ std::pair<std::vector<int64_t>, std::optional<std::vector<Segment>>> whisper_gen
         streamer->end();
     }

-    std::optional<std::vector<Segment>> segments = std::nullopt;
-    if (config.return_timestamps) {
-        // 0.02 by default
-        const float time_precision =
-            static_cast<float>(feature_extractor.chunk_length) / model_config.max_source_positions;
-        std::tie(output_tokens, segments) = ov::genai::extract_segments(output_tokens, config, time_precision);
+    // if return_timestamps wasn't enabled by user
+    if (!config.return_timestamps) {
+        return {output_tokens, std::nullopt};
     }

     return {output_tokens, segments};
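Taken together, the rewritten `whisper_generate` featurizes the input once, then slides a 30 s window whose step is the `last_offset` that `extract_segments` reports for each chunk. A self-contained sketch of just that bookkeeping, with decoding faked (the per-chunk offsets below are made-up stand-ins for real timestamp output):

```python
nb_max_frames = 3000  # one 30 s window of 10 ms mel frames
n_frames = 7500       # 75 s of input audio

# pretend extract_segments() reported these last_offset values per chunk
fake_last_offsets = [2840, 2910, 3000]

chunk_offset, i = 0, 0
while chunk_offset < n_frames:
    window_end = min(chunk_offset + nb_max_frames, n_frames)
    print(f"chunk {i}: frames [{chunk_offset}, {window_end}) "
          f"= [{chunk_offset / 100:.1f} s, {window_end / 100:.1f} s)")
    # the next window resumes at the last completed segment of this chunk
    chunk_offset += fake_last_offsets[min(i, len(fake_last_offsets) - 1)]
    i += 1
```

Because each step lands on a completed segment boundary rather than a fixed 30 s stride, no speech is cut mid-segment and the per-chunk transcripts concatenate without overlap.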
