Commit 8143634
Whisper pipeline: return timestamps (#910)
This PR adds `return_timestamps` support for the Whisper pipeline.

Common TODOs for Whisper support:
- [ ] Longer audio inputs (>30s) give poor-quality results at chunk borders. Long audio inputs are split into 30s chunks, which loses context at each chunk border. This could be partially solved by [chunking with stride](https://huggingface.co/blog/asr-chunking).
- [ ] Add perf metrics
- [ ] Update documentation
- [ ] Add cpp, python samples tests
- [x] Support different languages, language autodetection
- [x] Support translation
- [x] Support timestamps
- [ ] Support timestamps streaming
- [ ] Expose only meaningful parameters in `GenerationConfig` (`task`, `language`, `return_timestamps`, etc.)
- [ ] Move all whisper pipeline files to a dedicated subfolder
- [ ] The Whisper pipeline doesn't need a tokenizer, it uses the detokenizer only. Implement detokenizer-only initialization for `ov::genai::Tokenizer`
- [ ] Check discrete GPU. Integrated GPU works as expected.
- [ ] Investigate use of `RemoteTensor` for GPU
- [ ] Add batch support
- [ ] Add sampler, inherit WhisperGenerationConfig from GenerationConfig
- [ ] Investigate language autodetection with a single decoder (without past) call
- [ ] Update python bindings cmake to include the whole directory instead of an explicit list of files
- [ ] Add samples with audio preparation examples
- [ ] Add links to audio files so users can download them in samples
- [ ] Move the supported models list from the samples README to the common supported models section
- [ ] Avoid building GenAI in each tests job, as it takes a lot of time
- [ ] Double-check FP32 support
- [ ] Fix sporadic test failures. Sometimes the whisper model cannot be downloaded from HF due to network issues
- [ ] Fix stop criteria. The current approach stops on eos_token, which is the no-speech token, but there could be more speech tokens afterwards that are wrongly skipped now.

Current limitations:
- No resampling during preprocessing. Input raw speech should have a 16 kHz sampling rate.
- No normalization during preprocessing. Input raw speech should be normalized to near the [-1, 1] range.

Tickets: CVS-147994, CVS-146010, CVS-152543
1 parent dcb2336 commit 8143634
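Given the preprocessing limitations above, callers currently have to resample to 16 kHz and normalize the waveform themselves before calling `generate`. A minimal peak-normalization sketch (illustrative only, not part of this commit; resampling is left out):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Illustrative peak normalization to roughly [-1, 1]. Assumes the input is
// already sampled at 16 kHz, since the pipeline does not resample.
void normalize_raw_speech(std::vector<float>& raw_speech) {
    float peak = 0.0f;
    for (float sample : raw_speech) {
        peak = std::max(peak, std::abs(sample));
    }
    if (peak > 0.0f) {
        for (float& sample : raw_speech) {
            sample /= peak;
        }
    }
}
```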

File tree: 17 files changed, +594 −65 lines


samples/cpp/whisper_speech_recognition/whisper_speech_recognition.cpp

Lines changed: 7 additions & 2 deletions
```diff
@@ -21,15 +21,20 @@ int main(int argc, char* argv[]) try {
     // 'task' and 'language' parameters are supported for multilingual models only
     config.language = "<|en|>";
     config.task = "transcribe";
+    config.return_timestamps = true;
 
     auto streamer = [](std::string word) {
         std::cout << word;
         return false;
     };
 
-    pipeline.generate(raw_speech, config, streamer);
+    auto result = pipeline.generate(raw_speech, config, streamer);
 
-    std::cout << std::endl;
+    std::cout << "\n";
+
+    for (auto& chunk : *result.chunks) {
+        std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
+    }
 } catch (const std::exception& error) {
     try {
         std::cerr << error.what() << '\n';
```
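One caveat when adapting this sample: `WhisperDecodedResults::chunks` is a `std::optional`, so `*result.chunks` assumes `return_timestamps` was set, as it is here. A defensive variant (hypothetical `print_chunks` helper, same API otherwise):

```cpp
#include <iostream>

#include "openvino/genai/whisper_pipeline.hpp"

// Guard the optional before iterating: chunks stays std::nullopt when
// return_timestamps was not requested.
void print_chunks(const ov::genai::WhisperDecodedResults& result) {
    if (!result.chunks.has_value()) {
        return;
    }
    for (const auto& chunk : *result.chunks) {
        std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n";
    }
}
```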

samples/python/whisper_speech_recognition/whisper_speech_recognition.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -26,15 +26,19 @@ def streamer(word: str) -> bool:
         print(word, end="")
         return False
 
-    pipe.generate(
+    result = pipe.generate(
         raw_speech,
         max_new_tokens=100,
         # 'task' and 'language' parameters are supported for multilingual models only
         language="<|en|>",
         task="transcribe",
+        return_timestamps=True,
         streamer=streamer,
     )
 
+    for chunk in result.chunks:
+        print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}")
+
     print()
```

src/cpp/include/openvino/genai/whisper_generation_config.hpp

Lines changed: 13 additions & 0 deletions
```diff
@@ -51,6 +51,8 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
     // Begin timestamps token id.
     int64_t begin_timestamps_token_id = 50364;
 
+    size_t max_initial_timestamp_index = 50;
+
     bool is_multilingual = true;
 
     // Language token to use for generation in the form of <|en|>.
@@ -65,6 +67,16 @@ class OPENVINO_GENAI_EXPORTS WhisperGenerationConfig {
     // Can be set for multilingual models only.
     std::optional<std::string> task = std::nullopt;
 
+    // If `true` the pipeline will return timestamps along the text for *segments* of words in the text.
+    // For instance, if you get
+    // WhisperDecodedResultChunk
+    //     start_ts = 0.5
+    //     end_ts = 1.5
+    //     text = " Hi there!"
+    // then it means the model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
+    // Note that a segment of text refers to a sequence of one or more words, rather than individual words.
+    bool return_timestamps = false;
+
     // A list containing tokens that will be supressed at the beginning of the sampling process.
     std::vector<int64_t> begin_suppress_tokens;
 
@@ -105,6 +117,7 @@ static constexpr ov::Property<int64_t> no_timestamps_token_id{"no_timestamps_tok
 static constexpr ov::Property<int64_t> begin_timestamps_token_id{"begin_timestamps_token_id"};
 static constexpr ov::Property<std::string> language{"language"};
 static constexpr ov::Property<std::string> task{"task"};
+static constexpr ov::Property<bool> return_timestamps{"return_timestamps"};
 static constexpr ov::Property<std::map<std::string, int64_t>> lang_to_id{"lang_to_id"};
 
 } // namespace genai
```
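Since `return_timestamps` is also registered as an `ov::Property`, timestamps can be requested through the property-based `generate` overload instead of a config object. A short sketch (assumes `pipeline` and `raw_speech` are set up as in the samples):

```cpp
// Property-style call; each property expands into an AnyMap entry.
auto result = pipeline.generate(raw_speech,
                                ov::genai::language("<|en|>"),
                                ov::genai::task("transcribe"),
                                ov::genai::return_timestamps(true));
```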

src/cpp/include/openvino/genai/whisper_pipeline.hpp

Lines changed: 23 additions & 8 deletions
```diff
@@ -17,6 +17,21 @@ using OptionalWhisperGenerationConfig = std::optional<WhisperGenerationConfig>;
 
 using RawSpeechInput = std::vector<float>;
 
+struct WhisperDecodedResultChunk {
+    // start of chunk in seconds
+    float start_ts;
+
+    // end of chunk in seconds
+    // -1.0f if chunk started but model did not predict an ending timestamp
+    // can happen if audio is cut off in the middle of a word
+    float end_ts = -1.0f;
+    std::string text;
+};
+
+struct WhisperDecodedResults : public DecodedResults {
+    std::optional<std::vector<WhisperDecodedResultChunk>> chunks = std::nullopt;
+};
+
 class OPENVINO_GENAI_EXPORTS WhisperPipeline {
     class Impl;
     std::unique_ptr<Impl> m_impl;
@@ -57,11 +72,11 @@
      * sampling rate.
      * @param generation_config optional GenerationConfig
      * @param streamer optional streamer
-     * @return DecodedResults decoded resulting text transcription
+     * @return WhisperDecodedResults decoded resulting text transcription
      */
-    DecodedResults generate(const RawSpeechInput& raw_speech_input,
-                            OptionalWhisperGenerationConfig generation_config = std::nullopt,
-                            StreamerVariant streamer = std::monostate());
+    WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input,
+                                   OptionalWhisperGenerationConfig generation_config = std::nullopt,
+                                   StreamerVariant streamer = std::monostate());
 
     /**
      * @brief High level generate that receives raw speech as a vector of floats and returns decoded output.
@@ -70,14 +85,14 @@
      *
      * @param raw_speech_input raw speech input
      * @param properties properties
-     * @return DecodedResults decoded resulting text transcription
+     * @return WhisperDecodedResults decoded resulting text transcription
      */
     template <typename... Properties>
-    util::EnableIfAllStringAny<DecodedResults, Properties...> generate(const RawSpeechInput& raw_speech_input,
-                                                                       Properties&&... properties) {
+    util::EnableIfAllStringAny<WhisperDecodedResults, Properties...> generate(const RawSpeechInput& raw_speech_input,
+                                                                              Properties&&... properties) {
         return generate(raw_speech_input, AnyMap{std::forward<Properties>(properties)...});
     }
-    DecodedResults generate(const RawSpeechInput& raw_speech_input, const ov::AnyMap& config_map);
+    WhisperDecodedResults generate(const RawSpeechInput& raw_speech_input, const ov::AnyMap& config_map);
 
     ov::genai::Tokenizer get_tokenizer();
     WhisperGenerationConfig get_generation_config() const;
```
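Because `WhisperDecodedResults` derives from `DecodedResults`, the plain transcription remains available next to the optional chunks. A two-line sketch (assumes `pipeline`, `raw_speech`, and `config` as in the samples; `texts` comes from the `DecodedResults` base, as elsewhere in GenAI):

```cpp
ov::genai::WhisperDecodedResults result = pipeline.generate(raw_speech, config);
std::cout << result.texts[0] << "\n";  // full transcription via the DecodedResults base
```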

src/cpp/src/sampler.hpp

Lines changed: 2 additions & 0 deletions
```diff
@@ -30,6 +30,8 @@ inline bool is_stop_token_id_hit(int64_t generated_token, const std::set<int64_t
     return false;
 }
 
+std::vector<Token> log_softmax(const ov::Tensor& logits, size_t batch_idx);
+
 struct SamplerOutput {
     // IDs of sequences that need to be dropped
     std::vector<uint64_t> m_dropped_sequences;
```
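`log_softmax` is exposed here because the whisper timestamp logit processor needs per-token log-probabilities. As a reminder of the underlying math, `log_softmax(x)[i] = x[i] - log(sum_j exp(x[j]))`; a freestanding sketch over a plain vector (hypothetical helper, not the library's tensor-based signature):

```cpp
#include <algorithm>
#include <cmath>
#include <vector>

// Hypothetical scalar illustration of log-softmax over one row of logits.
std::vector<float> log_softmax_row(const std::vector<float>& logits) {
    // Subtract the max logit first for numerical stability.
    float max_logit = *std::max_element(logits.begin(), logits.end());
    float sum_exp = 0.0f;
    for (float logit : logits) {
        sum_exp += std::exp(logit - max_logit);
    }
    float log_sum_exp = max_logit + std::log(sum_exp);
    std::vector<float> result;
    result.reserve(logits.size());
    for (float logit : logits) {
        result.push_back(logit - log_sum_exp);
    }
    return result;
}
```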
Lines changed: 118 additions & 0 deletions
```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <openvino/openvino.hpp>

#include "openvino/genai/whisper_generation_config.hpp"
#include "sampler.hpp"

namespace ov {
namespace genai {

void do_suppress_tokens(ov::Tensor& logits, const size_t batch_idx, const std::vector<int64_t>& suppress_tokens) {
    OPENVINO_ASSERT(logits.get_shape()[0] >= batch_idx, "logits batch size doesn't match the batch number");

    size_t vocab_size = logits.get_shape().back();
    size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size;
    size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size;
    float* logits_data = logits.data<float>() + batch_offset + sequence_offset;

    for (auto suppress_token : suppress_tokens) {
        logits_data[suppress_token] = -std::numeric_limits<float>::infinity();
    }
}

void process_whisper_timestamp_logits(ov::Tensor& logits,
                                      const size_t batch_idx,
                                      const ov::genai::WhisperGenerationConfig& config,
                                      const std::vector<int64_t>& generated_tokens,
                                      bool initial_step = false) {
    const size_t batch_size = logits.get_shape().at(0);
    OPENVINO_ASSERT(batch_size == 1, "Batch != 1 is not supported");

    size_t vocab_size = logits.get_shape().back();
    size_t batch_offset = batch_idx * logits.get_shape()[1] * vocab_size;
    size_t sequence_offset = (logits.get_shape()[1] - 1) * vocab_size;
    float* logits_data = logits.data<float>() + batch_offset + sequence_offset;

    // suppress <|notimestamps|>
    logits_data[config.no_timestamps_token_id] = -std::numeric_limits<float>::infinity();

    size_t timestamp_begin = config.no_timestamps_token_id + 1;

    // timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly
    size_t generated_length = generated_tokens.size();
    bool last_was_timestamp = generated_length >= 1 && generated_tokens[generated_length - 1] >= timestamp_begin;
    bool penultimate_was_timestamp = generated_length < 2 || generated_tokens[generated_length - 2] >= timestamp_begin;

    if (last_was_timestamp) {
        if (penultimate_was_timestamp) {
            // has to be timestamp
            for (size_t i = timestamp_begin; i < vocab_size; i++) {
                logits_data[i] = -std::numeric_limits<float>::infinity();
            }
        } else {
            // cannot be normal text token
            for (size_t i = 0; i < config.eos_token_id; i++) {
                logits_data[i] = -std::numeric_limits<float>::infinity();
            }
        }
    }

    // filter generated timestamps
    std::vector<int64_t> timestamps;
    for (const auto token : generated_tokens) {
        if (token >= timestamp_begin) {
            timestamps.push_back(token);
        }
    }

    if (timestamps.size() > 0) {
        size_t timestamp_last;
        // `timestamps` shouldn't decrease; forbid timestamp tokens smaller than the last
        // The following lines of code are copied from: https://github.com/openai/whisper/pull/914/files#r1137085090
        if (last_was_timestamp && !penultimate_was_timestamp) {
            timestamp_last = timestamps.back();
        } else {
            // Avoid emitting <|0.00|> again
            timestamp_last = timestamps.back() + 1;
        }

        for (size_t i = timestamp_begin; i < timestamp_last; i++) {
            logits_data[i] = -std::numeric_limits<float>::infinity();
        }
    }

    // apply the `max_initial_timestamp` option
    if (initial_step) {
        for (size_t i = 0; i < timestamp_begin; i++) {
            logits_data[i] = -std::numeric_limits<float>::infinity();
        }

        size_t last_allowed = timestamp_begin + config.max_initial_timestamp_index;
        for (size_t i = last_allowed + 1; i < vocab_size; i++) {
            logits_data[i] = -std::numeric_limits<float>::infinity();
        }
    }

    auto tokens = ov::genai::log_softmax(logits, 0);
    float timestamp_exp_prob_sum = 0;

    for (size_t i = timestamp_begin; i < vocab_size; i++) {
        timestamp_exp_prob_sum += std::exp(tokens[i].m_log_prob);
    }
    float timestamp_logprob = std::log(timestamp_exp_prob_sum);

    auto max_logprob_token = std::max_element(tokens.begin(), tokens.end(), [](const Token& left, const Token& right) {
        return left.m_log_prob < right.m_log_prob;
    });

    if (timestamp_logprob > max_logprob_token->m_log_prob) {
        for (size_t i = 0; i < timestamp_begin; i++) {
            logits_data[i] = -std::numeric_limits<float>::infinity();
        }
    }
}

} // namespace genai
} // namespace ov
```
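The final block mirrors OpenAI's Whisper decoding heuristic: when the combined probability mass of all timestamp tokens exceeds the probability of the single most likely token, text tokens are masked so the next sampled token must be a timestamp. In log space, with `logprob[i]` taken from `log_softmax`:

```
log( sum_{i >= timestamp_begin} exp(logprob[i]) ) > max_i logprob[i]   =>   logits[0 .. timestamp_begin) = -inf
```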
Lines changed: 22 additions & 0 deletions
```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <openvino/openvino.hpp>

#include "openvino/genai/whisper_generation_config.hpp"

namespace ov {
namespace genai {

void do_suppress_tokens(ov::Tensor& logits, const size_t batch_idx, const std::vector<int64_t>& suppress_tokens);

void process_whisper_timestamp_logits(ov::Tensor& logits,
                                      const size_t batch_idx,
                                      const ov::genai::WhisperGenerationConfig& config,
                                      const std::vector<int64_t>& generated_tokens,
                                      bool initial_step = false);

} // namespace genai
} // namespace ov
```
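From these declarations, the helpers slot in between the decoder's forward pass and sampling. A sketch of the presumed call pattern (hypothetical `decode_step` and `sample_token`; the real decoding loop is not part of this diff):

```cpp
// Hypothetical decode loop: decode_step() and sample_token() stand in for the
// pipeline's actual inference and sampling code.
std::vector<int64_t> generated_tokens;
for (size_t step = 0; step < config.max_new_tokens; ++step) {
    ov::Tensor logits = decode_step(generated_tokens);
    if (step == 0) {
        // Tokens listed in begin_suppress_tokens are banned at the start of sampling.
        ov::genai::do_suppress_tokens(logits, 0, config.begin_suppress_tokens);
    }
    if (config.return_timestamps) {
        ov::genai::process_whisper_timestamp_logits(logits, 0, config, generated_tokens, step == 0);
    }
    int64_t token = sample_token(logits);
    if (token == config.eos_token_id) {
        break;
    }
    generated_tokens.push_back(token);
}
```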

src/cpp/src/whisper/timestamps.cpp

Lines changed: 71 additions & 0 deletions
```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "timestamps.hpp"

namespace ov {
namespace genai {

std::pair<std::vector<int64_t>, std::vector<ov::genai::Segment>> extract_segments(
    const std::vector<int64_t>& tokens,
    const ov::genai::WhisperGenerationConfig& config,
    const float time_precision) {
    std::vector<int64_t> non_timestamp_tokens;
    std::vector<ov::genai::Segment> segments;
    std::optional<int64_t> token_start = std::nullopt;
    size_t idx_start = 0;

    for (size_t i = 0; i < tokens.size(); i++) {
        int64_t token = tokens[i];

        bool is_timestamp = token >= config.begin_timestamps_token_id;

        if (!is_timestamp) {
            continue;
        }

        if (!token_start.has_value()) {
            token_start = token;
            idx_start = i;
        } else {
            if (token_start == token) {
                // from HF:
                // https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/tokenization_whisper.py#L1020
                // This is a bug in timestamp token output where we're taking the duplicate token as a stop where it
                // should be a start. This is an issue in the underlying model output. Let's just skip it so it becomes
                // de-facto a start again.
                continue;
            }

            ov::genai::Segment segment;
            segment.m_tokens = {tokens.begin() + idx_start + 1, tokens.begin() + i};
            segment.m_start = (*token_start - config.begin_timestamps_token_id) * time_precision;
            segment.m_end = (token - config.begin_timestamps_token_id) * time_precision;
            segments.push_back(segment);

            non_timestamp_tokens.insert(non_timestamp_tokens.end(), tokens.begin() + idx_start + 1, tokens.begin() + i);

            token_start = std::nullopt;
        }
    }

    // segment started but has no closing timestamp
    // add a new segment only if it has non-timestamp tokens
    // do not add a new segment if previous segments exist
    bool has_tokens_to_add = idx_start < tokens.size() - 1;
    bool has_previous_segments = segments.size() > 0;
    if (token_start.has_value() && has_tokens_to_add && !has_previous_segments) {
        ov::genai::Segment segment;
        segment.m_tokens = {tokens.begin() + idx_start + 1, tokens.end()};
        segment.m_start = (*token_start - config.begin_timestamps_token_id) * time_precision;
        segment.m_end = -1.0f;
        segments.push_back(segment);

        non_timestamp_tokens.insert(non_timestamp_tokens.end(), tokens.begin() + idx_start + 1, tokens.end());
    }

    return {non_timestamp_tokens, segments};
}

} // namespace genai
} // namespace ov
```
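For intuition, a hypothetical walk-through of `extract_segments` (word token ids are invented; `begin_timestamps_token_id` defaults to 50364, i.e. `<|0.00|>`, and Whisper uses a time precision of 0.02 seconds per timestamp step):

```cpp
// <|0.00|> " Hi" " there" <|1.50|>   (50439 = 50364 + 75, and 75 * 0.02 = 1.5)
std::vector<int64_t> tokens{50364, 2425, 612, 50439};
auto [text_tokens, segments] = extract_segments(tokens, config, 0.02f);
// text_tokens == {2425, 612}
// segments[0].m_start == 0.0f and segments[0].m_end == 1.5f
```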

src/cpp/src/whisper/timestamps.hpp

Lines changed: 19 additions & 0 deletions
```cpp
// Copyright (C) 2023-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <openvino/openvino.hpp>

#include "whisper.hpp"

namespace ov {
namespace genai {

std::pair<std::vector<int64_t>, std::vector<ov::genai::Segment>> extract_segments(
    const std::vector<int64_t>& tokens,
    const ov::genai::WhisperGenerationConfig& config,
    const float time_precision);

} // namespace genai
} // namespace ov
```
