@@ -1699,6 +1699,32 @@ image = sana(
 )`,
 ];
 
+export const vibevoice = (model: ModelData): string[] => [
+	`import torch, soundfile as sf, librosa
+from vibevoice.processor.vibevoice_processor import VibeVoiceProcessor
+from vibevoice.modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+
+# Load the reference voice sample (must end up as 24 kHz mono)
+voice, sr = sf.read("path/to/voice_sample.wav")
+if voice.ndim > 1: voice = voice.mean(axis=1)  # downmix stereo to mono
+if sr != 24000: voice = librosa.resample(voice, orig_sr=sr, target_sr=24000)
+
+processor = VibeVoiceProcessor.from_pretrained("${model.id}")
+model = VibeVoiceForConditionalGenerationInference.from_pretrained(
+    "${model.id}", torch_dtype=torch.bfloat16
+).to("cuda").eval()
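+# Fewer DDPM decoding steps run faster; more steps can improve audio quality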
+model.set_ddpm_inference_steps(5)
+
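+# The transcript marks turns with "Speaker N:" labels; voice_samples holds the reference voices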
+inputs = processor(text=["Speaker 0: Hello!\\nSpeaker 1: Hi there!"],
+                   voice_samples=[[voice]], return_tensors="pt")
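+# cfg_scale controls classifier-free guidance strength (higher follows the text more closely)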
+audio = model.generate(**inputs, cfg_scale=1.3,
+                       tokenizer=processor.tokenizer).speech_outputs[0]
+sf.write("output.wav", audio.float().cpu().numpy().squeeze(), 24000)`,
+];
+
 export const videoprism = (model: ModelData): string[] => [
 	`# Install from https://github.com/google-deepmind/videoprism
 import jax