@@ -7,6 +7,20 @@ const AUDIO_TOKEN = "[AUDIO]";
// Marker written once into the text ahead of the placeholder tokens of each audio input.
77const BEGIN_AUDIO_TOKEN = "[BEGIN_AUDIO]" ;
// Number of AUDIO_TOKEN placeholders written into the text for every audio chunk.
88const NUM_AUDIO_TOKENS = 375 ;
99
/**
 * Split audio into consecutive, non-overlapping chunks of at most `n_samples` samples.
 *
 * Every chunk except possibly the last holds exactly `n_samples` samples; the last one
 * holds whatever remains. Chunks are `subarray` views over the input's buffer, so no
 * sample data is copied. An empty input produces an empty list.
 *
 * @param {Float32Array} audio The samples to split.
 * @param {number} n_samples Maximum number of samples per chunk. Must be a positive integer.
 * @returns {Float32Array[]} The chunks, in their original order.
 * @throws {RangeError} If `n_samples` is not a positive integer.
 */
function chunk(audio, n_samples) {
    // A zero or negative step would never move the cursor past the end of the audio
    // (endless loop), and NaN/undefined/fractional steps yield empty or uneven chunks.
    if (!Number.isInteger(n_samples) || n_samples <= 0) {
        throw new RangeError(`n_samples must be a positive integer, got ${n_samples}.`);
    }

    const chunks = [];
    for (let start = 0; start < audio.length; start += n_samples) {
        // `subarray` clamps an end index beyond the array length, so the final
        // (possibly shorter) chunk needs no special handling.
        chunks.push(audio.subarray(start, start + n_samples));
    }
    return chunks;
}
23+
1024/**
1125 * Represents a VoxtralProcessor that extracts features from an audio input.
1226 */
@@ -32,16 +46,36 @@ export class VoxtralProcessor extends Processor {
3246 if ( ! Array . isArray ( audio ) ) {
3347 audio = [ audio ] ;
3448 }
35- const num_audio_tokens = text . split ( AUDIO_TOKEN ) . length - 1 ;
49+ const text_parts = text . split ( AUDIO_TOKEN ) ;
50+ const num_audio_tokens = text_parts . length - 1 ;
3651 if ( num_audio_tokens !== audio . length ) {
3752 throw new Error ( `The number of audio inputs (${ audio . length } ) does not match the number of audio tokens in the text (${ num_audio_tokens } ).` ) ;
3853 }
54+
55+ const n_samples = this . feature_extractor . config . n_samples ;
56+
57+ // Split each audio input into chunks and keep track of chunk counts
58+ const audio_chunks = audio . map ( a => chunk ( a , n_samples ) ) ;
59+ const chunk_counts = audio_chunks . map ( chunks => chunks . length ) ;
60+
61+ // Flatten all chunks for feature extraction
62+ const all_chunks = audio_chunks . flat ( ) ;
3963 const features = ( await Promise . all (
40- audio . map ( ( audio_input ) => this . feature_extractor ( audio_input , kwargs ) )
64+ all_chunks . map ( ( audio_input ) => this . feature_extractor ( audio_input , kwargs ) )
4165 ) ) . map ( x => x . input_features ) ;
66+
4267 audio_inputs [ "audio_values" ] = features . length > 1 ? cat ( features , 0 ) : features [ 0 ] ;
4368
44- text = text . replaceAll ( AUDIO_TOKEN , BEGIN_AUDIO_TOKEN + AUDIO_TOKEN . repeat ( NUM_AUDIO_TOKENS ) ) ;
69+ // Replace text tokens for each audio input, expanding for chunk count
70+ let new_text = text_parts [ 0 ] ;
71+ for ( let i = 0 ; i < chunk_counts . length ; ++ i ) {
72+ new_text += BEGIN_AUDIO_TOKEN ;
73+ for ( let j = 0 ; j < chunk_counts [ i ] ; ++ j ) {
74+ new_text += AUDIO_TOKEN . repeat ( NUM_AUDIO_TOKENS ) ;
75+ }
76+ new_text += text_parts [ i + 1 ] ;
77+ }
78+ text = new_text ;
4579 }
4680
4781 const text_inputs = this . tokenizer ( text , {
0 commit comments