@@ -7,97 +7,105 @@ import { phonemize } from './phonemizer.js';
77
88const { Tensor, RawSession } = Supabase . ai ;
99
10+ /* NOTE: Reference [original paper](https://arxiv.org/pdf/2306.07691#Model%20Training):
11+ > All datasets were resampled to 24 kHz to match LibriTTS, and the texts
12+ > were converted into phonemes using phonemizer'
13+ */
14+ const SAMPLE_RATE = 24000 ; // 24 kHz
15+
16+ /* NOTE: Reference [original paper](https://arxiv.org/pdf/2306.07691#Detailed%20Model%20Architectures):
17+ > The size of s and c is 256 × 1
18+ */
1019const STYLE_DIM = 256 ;
11- const SAMPLE_RATE = 24000 ;
1220const MODEL_ID = 'onnx-community/Kokoro-82M-ONNX' ;
1321
1422// https://huggingface.co/onnx-community/Kokoro-82M-ONNX#samples
1523const ALLOWED_VOICES = [
16- 'af_bella' ,
17- 'af_nicole' ,
18- 'af_sarah' ,
19- 'af_sky' ,
20- 'am_adam' ,
21- 'am_michael' ,
22- 'bf_emma' ,
23- 'bf_isabella' ,
24- 'bm_george' ,
25- 'bm_lewis' ,
24+ 'af_bella' ,
25+ 'af_nicole' ,
26+ 'af_sarah' ,
27+ 'af_sky' ,
28+ 'am_adam' ,
29+ 'am_michael' ,
30+ 'bf_emma' ,
31+ 'bf_isabella' ,
32+ 'bm_george' ,
33+ 'bm_lewis' ,
2634] ;
2735
2836const session = await RawSession . fromHuggingFace ( MODEL_ID ) ;
2937
3038Deno . serve ( async ( req ) => {
31- const params = new URL ( req . url ) . searchParams ;
32- const text = params . get ( 'text' ) ?? 'Hello from Supabase!' ;
33- const voice = params . get ( 'voice' ) ?? 'af_bella' ;
34-
35- if ( ! ALLOWED_VOICES . includes ( voice ) ) {
36- return Response . json ( {
37- error : `invalid voice '${ voice } '` ,
38- must_be_one_of : ALLOWED_VOICES ,
39- } , { status : 400 } ) ;
40- }
41-
42- const tokenizer = await loadTokenizer ( ) ;
43- const language = voice . at ( 0 ) ; // 'a'merican | 'b'ritish
44- const phonemes = await phonemize ( text , language ) ;
45- const { input_ids } = tokenizer ( phonemes , {
46- truncation : true ,
47- } ) ;
48-
49- // Select voice style based on number of input tokens
50- const num_tokens = Math . max (
51- input_ids . dims . at ( - 1 ) - 2 , // Without padding;
52- 0 ,
53- ) ;
54-
55- const voiceStyle = await loadVoiceStyle ( voice , num_tokens ) ;
56-
57- const { waveform } = await session . run ( {
58- input_ids,
59- style : voiceStyle ,
60- speed : new Tensor ( 'float32' , [ 1 ] , [ 1 ] ) ,
61- } ) ;
62-
63- // Do `wave` encoding from rust backend
64- const audio = await waveform . tryEncodeAudio ( SAMPLE_RATE ) ;
65-
66- return new Response ( audio , {
67- headers : {
68- 'Content-Type' : 'audio/wav' ,
69- } ,
70- } ) ;
39+ const params = new URL ( req . url ) . searchParams ;
40+ const text = params . get ( 'text' ) ?? 'Hello from Supabase!' ;
41+ const voice = params . get ( 'voice' ) ?? 'af_bella' ;
42+
43+ if ( ! ALLOWED_VOICES . includes ( voice ) ) {
44+ return Response . json ( {
45+ error : `invalid voice '${ voice } '` ,
46+ must_be_one_of : ALLOWED_VOICES ,
47+ } , { status : 400 } ) ;
48+ }
49+
50+ const tokenizer = await loadTokenizer ( ) ;
51+ const language = voice . at ( 0 ) ; // 'a'merican | 'b'ritish
52+ const phonemes = await phonemize ( text , language ) ;
53+ const { input_ids } = tokenizer ( phonemes , {
54+ truncation : true ,
55+ } ) ;
56+
57+ // Select voice style based on number of input tokens
58+ const num_tokens = Math . max (
59+ input_ids . dims . at ( - 1 ) - 2 , // Without padding;
60+ 0 ,
61+ ) ;
62+
63+ const voiceStyle = await loadVoiceStyle ( voice , num_tokens ) ;
64+
65+ const { waveform } = await session . run ( {
66+ input_ids,
67+ style : voiceStyle ,
68+ speed : new Tensor ( 'float32' , [ 1 ] , [ 1 ] ) ,
69+ } ) ;
70+
71+ // Do `wave` encoding from rust backend
72+ const audio = await waveform . tryEncodeAudio ( SAMPLE_RATE ) ;
73+
74+ return new Response ( audio , {
75+ headers : {
76+ 'Content-Type' : 'audio/wav' ,
77+ } ,
78+ } ) ;
7179} ) ;
7280
7381async function loadVoiceStyle ( voice : string , num_tokens : number ) {
74- const voice_url =
75- `https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/voices/${ voice } .bin?download=true` ;
82+ const voice_url =
83+ `https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/voices/${ voice } .bin?download=true` ;
7684
77- console . log ( 'loading voice:' , voice_url ) ;
85+ console . log ( 'loading voice:' , voice_url ) ;
7886
79- const voiceBuffer = await fetch ( voice_url ) . then ( async ( res ) => await res . arrayBuffer ( ) ) ;
87+ const voiceBuffer = await fetch ( voice_url ) . then ( async ( res ) => await res . arrayBuffer ( ) ) ;
8088
81- const offset = num_tokens * STYLE_DIM ;
82- const voiceData = new Float32Array ( voiceBuffer ) . slice (
83- offset ,
84- offset + STYLE_DIM ,
85- ) ;
89+ const offset = num_tokens * STYLE_DIM ;
90+ const voiceData = new Float32Array ( voiceBuffer ) . slice (
91+ offset ,
92+ offset + STYLE_DIM ,
93+ ) ;
8694
87- return new Tensor ( 'float32' , voiceData , [ 1 , STYLE_DIM ] ) ;
95+ return new Tensor ( 'float32' , voiceData , [ 1 , STYLE_DIM ] ) ;
8896}
8997
9098async function loadTokenizer ( ) {
91- // BUG: invalid 'h' not JSON. That's why we need to manually fetch the assets
92- // const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
99+ // BUG: invalid 'h' not JSON. That's why we need to manually fetch the assets
100+ // const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
93101
94- const tokenizerData = await fetch (
95- 'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer.json?download=true' ,
96- ) . then ( async ( res ) => await res . json ( ) ) ;
102+ const tokenizerData = await fetch (
103+ 'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer.json?download=true' ,
104+ ) . then ( async ( res ) => await res . json ( ) ) ;
97105
98- const tokenizerConfig = await fetch (
99- 'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer_config.json?download=true' ,
100- ) . then ( async ( res ) => await res . json ( ) ) ;
106+ const tokenizerConfig = await fetch (
107+ 'https://huggingface.co/onnx-community/Kokoro-82M-ONNX/resolve/main/tokenizer_config.json?download=true' ,
108+ ) . then ( async ( res ) => await res . json ( ) ) ;
101109
102- return new PreTrainedTokenizer ( tokenizerData , tokenizerConfig ) ;
110+ return new PreTrainedTokenizer ( tokenizerData , tokenizerConfig ) ;
103111}
0 commit comments