Support longform voxtral processing (#1375)

xenova · web-flow · commit 635bff5b8f76 · 2025-07-22T17:45:25.000-04:00
diff --git a/src/models/voxtral/processing_voxtral.js b/src/models/voxtral/processing_voxtral.js
@@ -7,6 +7,20 @@ const AUDIO_TOKEN = "[AUDIO]";
 const BEGIN_AUDIO_TOKEN = "[BEGIN_AUDIO]";
 const NUM_AUDIO_TOKENS = 375;
 
+/**
+ * Split audio into consecutive, non-overlapping chunks of at most `n_samples` samples (the last chunk may be shorter).
+ * @param {Float32Array} audio The samples to split; an empty input yields an empty array.
+ * @param {number} n_samples Maximum chunk length; expected to be positive (a value of 0 would never advance the loop below).
+ * @returns {Float32Array[]} The chunks in their original order, each obtained via `audio.subarray` (views on the input rather than copies).
+ */
+function chunk(audio, n_samples) {
+    const chunks = [];
+    for (let i = 0; i < audio.length; i += n_samples) {
+        chunks.push(audio.subarray(i, Math.min(i + n_samples, audio.length))); // clamp the final chunk to the end of the input
+    }
+    return chunks;
+}
+
 /**
  * Represents a VoxtralProcessor that extracts features from an audio input.
  */
@@ -32,16 +46,36 @@ export class VoxtralProcessor extends Processor {
             if (!Array.isArray(audio)) {
                 audio = [audio];
             }
-            const num_audio_tokens = text.split(AUDIO_TOKEN).length - 1;
+            const text_parts = text.split(AUDIO_TOKEN);
+            const num_audio_tokens = text_parts.length - 1;
             if (num_audio_tokens !== audio.length) {
                 throw new Error(`The number of audio inputs (${audio.length}) does not match the number of audio tokens in the text (${num_audio_tokens}).`);
             }
+
+            const n_samples = this.feature_extractor.config.n_samples;
+
+            // Split each audio input into chunks and keep track of chunk counts
+            const audio_chunks = audio.map(a => chunk(a, n_samples));
+            const chunk_counts = audio_chunks.map(chunks => chunks.length);
+
+            // Flatten all chunks for feature extraction
+            const all_chunks = audio_chunks.flat();
             const features = (await Promise.all(
-                audio.map((audio_input) => this.feature_extractor(audio_input, kwargs))
+                all_chunks.map((audio_input) => this.feature_extractor(audio_input, kwargs))
             )).map(x => x.input_features);
+
             audio_inputs["audio_values"] = features.length > 1 ? cat(features, 0) : features[0];
 
-            text = text.replaceAll(AUDIO_TOKEN, BEGIN_AUDIO_TOKEN + AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS));
+            // Replace text tokens for each audio input, expanding for chunk count
+            let new_text = text_parts[0];
+            for (let i = 0; i < chunk_counts.length; ++i) {
+                new_text += BEGIN_AUDIO_TOKEN;
+                for (let j = 0; j < chunk_counts[i]; ++j) {
+                    new_text += AUDIO_TOKEN.repeat(NUM_AUDIO_TOKENS);
+                }
+                new_text += text_parts[i + 1];
+            }
+            text = new_text;
         }
 
         const text_inputs = this.tokenizer(text, {