From c78803204727e88e056bfd21d141cd41b35b647d Mon Sep 17 00:00:00 2001 From: ronliwag Date: Tue, 21 Oct 2025 01:43:09 +0800 Subject: [PATCH 1/7] Fix audio processing issues and improve demo functionality - Fixed 404 errors for /process, /output, /asr, and /translation routes - Resolved TorchAudio sox dependency issues with custom audio conversion - Fixed pydub ffmpeg dependency by using soundfile for audio processing - Added proper audio normalization to prevent loud output - Fixed audio synchronization between input and output waveforms - Added favicon route to eliminate 404 errors - Updated requirements.txt with flexible version requirements - Added comprehensive .gitignore to exclude large model files and virtual environment --- .gitignore | 139 +++++++++ SETUP_INSTRUCTIONS.md | 264 ++++++++++++++++++ configs/es-en/config_gcmvn.yaml | 6 +- configs/es-en/config_mtl_asr_st_ctcst.yaml | 12 +- demo/.gitignore | 15 + demo/app.py | 247 ++++++++++++---- demo/config.json | 11 +- demo/paths_config_template.json | 32 +++ demo/setup_paths.py | 105 +++++++ demo/templates/index.html | 18 +- fairseq/examples/speech_to_text/__init__.py | 4 + .../speech_to_speech/modules/__init__.py | 1 + requirements.txt | 51 ++++ 13 files changed, 826 insertions(+), 79 deletions(-) create mode 100644 .gitignore create mode 100644 SETUP_INSTRUCTIONS.md create mode 100644 demo/.gitignore create mode 100644 demo/paths_config_template.json create mode 100644 demo/setup_paths.py create mode 100644 fairseq/examples/speech_to_text/__init__.py create mode 100644 requirements.txt diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3fff701 --- /dev/null +++ b/.gitignore @@ -0,0 +1,139 @@ +# Virtual Environment +streamspeech_env/ + +# Large Model Files (move to drive) +pretrain_models/ +*.pt +*.pth +*.bin +*.safetensors + +# Python Cache +__pycache__/ +*.py[cod] +*$py.class +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +.DS_Store? +._* +.Spotlight-V100 +.Trashes +ehthumbs.db +Thumbs.db + +# Audio files (if any) +*.wav +*.mp3 +*.flac +*.ogg +*.m4a + +# Temporary files +*.tmp +*.temp diff --git a/SETUP_INSTRUCTIONS.md b/SETUP_INSTRUCTIONS.md new file mode 100644 index 0000000..2a6305f --- /dev/null +++ b/SETUP_INSTRUCTIONS.md @@ -0,0 +1,264 @@ +# StreamSpeech Setup Instructions + +This guide will help you set up StreamSpeech for simultaneous speech-to-speech translation on Windows. 
+ +## Prerequisites + +- **Python 3.10** (required - other versions may not work) +- **CUDA-capable GPU** (recommended for optimal performance) +- **Windows 10/11** (tested on Windows 10.0.19045) +- **Git** (for cloning repositories) + +## Quick Setup + +### 1. Install Python 3.10 + +If you don't have Python 3.10, install it using Windows Package Manager: + +```powershell +winget install Python.Python.3.10 +``` + +Verify installation: +```powershell +py -3.10 --version +``` + +### 2. Clone and Setup Environment + +```powershell +# Navigate to your desired directory +cd D:\StreamSpeech + +# Create virtual environment with Python 3.10 +py -3.10 -m venv streamspeech_env + +# Activate virtual environment +streamspeech_env\Scripts\activate + +# Upgrade pip +python -m pip install --upgrade pip +``` + +### 3. Install Dependencies + +#### Option A: Install from requirements.txt (Recommended) +```powershell +pip install -r requirements.txt +``` + +#### Option B: Manual installation +```powershell +# Install PyTorch with CUDA support +pip install torch==2.0.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118 + +# Install fairseq +pip install fairseq + +# Install SimulEval (editable mode) +cd SimulEval +pip install --editable ./ +cd .. + +# Install Flask for web demo +pip install flask +``` + +### 4. Verify Installation + +```powershell +python -c "import torch; print(f'PyTorch: {torch.__version__}'); print(f'CUDA: {torch.cuda.is_available()}')" +python -c "import fairseq; print('Fairseq: OK')" +python -c "import simuleval; print('SimulEval: OK')" +python -c "import flask; print('Flask: OK')" +``` + +## Model Setup + +### 1. Download Pre-trained Models + +Create a `pretrain_models` directory and download the required models: + +```powershell +mkdir pretrain_models +cd pretrain_models +``` + +#### StreamSpeech Models (choose one language pair): + +**French-English:** +- Simultaneous: [streamspeech.simultaneous.fr-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.simultaneous.fr-en.pt) +- Offline: [streamspeech.offline.fr-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.offline.fr-en.pt) + +**Spanish-English:** +- Simultaneous: [streamspeech.simultaneous.es-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.simultaneous.es-en.pt) +- Offline: [streamspeech.offline.es-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.offline.es-en.pt) + +**German-English:** +- Simultaneous: [streamspeech.simultaneous.de-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.simultaneous.de-en.pt) +- Offline: [streamspeech.offline.de-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.offline.de-en.pt) + +#### HiFi-GAN Vocoder: +- Model: [g_00500000](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000) +- Config: [config.json](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json) + +Create the vocoder directory structure: +```powershell +mkdir unit-based_HiFi-GAN_vocoder\mHuBERT.layer11.km1000.en +# Place g_00500000 and config.json in this directory +``` + +### 2. Configure Paths + +#### Option A: Automatic Setup (Recommended) +```powershell +cd demo +python setup_paths.py +``` + +#### Option B: Manual Setup +1. 
Copy the template: `cp paths_config_template.json paths_config.json` +2. Edit `paths_config.json` with your actual paths: + +```json +{ + "streamspeech_root": "D:/StreamSpeech", + "pretrain_models_root": "D:/StreamSpeech/pretrain_models", + "language_pair": "es-en", + "models": { + "simultaneous": "D:/StreamSpeech/pretrain_models/streamspeech.simultaneous.es-en.pt", + "offline": "D:/StreamSpeech/pretrain_models/streamspeech.offline.es-en.pt" + }, + "vocoder": { + "checkpoint": "D:/StreamSpeech/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000", + "config": "D:/StreamSpeech/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json" + }, + "configs": { + "data_bin": "D:/StreamSpeech/configs/es-en", + "user_dir": "D:/StreamSpeech/researches/ctc_unity", + "agent_dir": "D:/StreamSpeech/agent" + } +} +``` + +#### Update Language Config Files +Update config files in `configs/es-en/`: +- Replace `/data/zhangshaolei/StreamSpeech` with your actual StreamSpeech path in: + - `config_gcmvn.yaml` + - `config_mtl_asr_st_ctcst.yaml` + +## Running the Application + +### 1. Command Line Interface + +```powershell +# Activate environment +streamspeech_env\Scripts\activate + +# Set CUDA device +$env:CUDA_VISIBLE_DEVICES="0" + +# Run inference +cd demo +python infer.py --data-bin ../configs/fr-en --user-dir ../researches/ctc_unity --agent-dir ../agent --model-path ../pretrain_models/streamspeech.simultaneous.fr-en.pt --config-yaml config_gcmvn.yaml --multitask-config-yaml config_mtl_asr_st_ctcst.yaml --segment-size 320 --vocoder ../pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000 --vocoder-cfg ../pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json --dur-prediction +``` + +### 2. Web Demo + +```powershell +# Activate environment +streamspeech_env\Scripts\activate + +# Start web server +cd demo +python app.py +``` + +Open your browser to `http://localhost:7860` + +## Features + +- **Streaming ASR**: Real-time speech recognition +- **Simultaneous S2TT**: Speech-to-text translation +- **Simultaneous S2ST**: Speech-to-speech translation +- **Adjustable Latency**: 320ms to 5000ms +- **Real-time Results**: Live updates during playback + +## Troubleshooting + +### Common Issues: + +1. **CUDA out of memory**: Reduce batch size or use CPU +2. **Model loading errors**: Check file paths in config.json +3. **Audio format issues**: Ensure audio is in supported format (WAV, MP3) +4. **Permission errors**: Run PowerShell as Administrator +5. 
**Python version issues**: Ensure Python 3.10 is used

### Performance Tips:

- **GPU Recommended**: Significant speedup with CUDA
- **Memory Requirements**: ~8GB+ GPU memory for optimal performance
- **Latency**: Lower values (320ms) = faster response, higher values = better quality

## Paths Configuration System

StreamSpeech uses a flexible paths configuration system that makes it easy to deploy across different environments:

### Files:
- **`demo/paths_config.json`**: Your actual paths (not in git)
- **`demo/paths_config_template.json`**: Template for paths (in git)
- **`demo/setup_paths.py`**: Automatic setup script
- **`demo/.gitignore`**: Excludes local paths from git

### Benefits:
- ✅ Easy to change paths without modifying code
- ✅ Git-friendly (local paths not committed)
- ✅ Environment-specific configurations
- ✅ Automatic path validation

## Directory Structure

```
StreamSpeech/
├── configs/
│   └── [lang]-en/                   # Language-specific configs
├── pretrain_models/                 # Downloaded models
│   └── unit-based_HiFi-GAN_vocoder/
├── demo/
│   ├── config.json                  # Main configuration
│   ├── paths_config.json            # Your paths (auto-generated)
│   ├── paths_config_template.json   # Template
│   ├── setup_paths.py               # Setup script
│   ├── app.py                       # Flask web app
│   └── templates/
│       └── index.html               # Web interface
├── requirements.txt                 # Dependencies
└── SETUP_INSTRUCTIONS.md            # This file
```

## Supported Languages

- French → English
- Spanish → English
- German → English

## Citation

If you use StreamSpeech in your research, please cite:

```bibtex
@inproceedings{streamspeech,
    title={StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning},
    author={Shaolei Zhang and Qingkai Fang and Shoutao Guo and Zhengrui Ma and Min Zhang and Yang Feng},
    year={2024},
    booktitle = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Long Papers)},
    publisher = {Association for Computational Linguistics}
}
```

## Links

- **Paper**: [arXiv:2406.03049](https://arxiv.org/abs/2406.03049)
- **Demo**: [StreamSpeech Demo](https://ictnlp.github.io/StreamSpeech-site/)
- **Models**: [Hugging Face](https://huggingface.co/ICTNLP/StreamSpeech_Models/tree/main)
- **GitHub**: [StreamSpeech Repository](https://github.com/ictnlp/StreamSpeech)
diff --git a/configs/es-en/config_gcmvn.yaml b/configs/es-en/config_gcmvn.yaml
index 6083d2a..9109ca1 100644
--- a/configs/es-en/config_gcmvn.yaml
+++ b/configs/es-en/config_gcmvn.yaml
@@ -1,5 +1,5 @@
 global_cmvn:
-  stats_npz_path: /data/zhangshaolei/StreamSpeech/configs/es-en/gcmvn.npz
+  stats_npz_path: D:/StreamSpeech/configs/es-en/gcmvn.npz
 input_channels: 1
 input_feat_per_channel: 80
 specaugment:
@@ -16,6 +16,6 @@ transforms:
   - global_cmvn
   - specaugment
 vocoder:
-  checkpoint: /data/zhangshaolei/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000
-  config: /data/zhangshaolei/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json
+  checkpoint: D:/StreamSpeech/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000
+  config: D:/StreamSpeech/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json
 type: code_hifigan
diff --git a/configs/es-en/config_mtl_asr_st_ctcst.yaml b/configs/es-en/config_mtl_asr_st_ctcst.yaml
index 5dba702..06b1d73 100644
--- a/configs/es-en/config_mtl_asr_st_ctcst.yaml
+++ b/configs/es-en/config_mtl_asr_st_ctcst.yaml
@@ -1,7 +1,7 @@
 target_unigram:
decoder_type: transformer - dict: /data/zhangshaolei/StreamSpeech/configs/es-en/tgt_unigram6000/spm_unigram_es.txt - data: /data/zhangshaolei/StreamSpeech/configs/es-en/tgt_unigram6000 + dict: D:/StreamSpeech/configs/es-en/tgt_unigram6000/spm_unigram_es.txt + data: D:/StreamSpeech/configs/es-en/tgt_unigram6000 loss_weight: 8.0 rdrop_alpha: 0.0 decoder_args: @@ -12,8 +12,8 @@ target_unigram: label_smoothing: 0.1 source_unigram: decoder_type: ctc - dict: /data/zhangshaolei/StreamSpeech/configs/es-en/src_unigram6000/spm_unigram_es.txt - data: /data/zhangshaolei/StreamSpeech/configs/es-en/src_unigram6000 + dict: D:/StreamSpeech/configs/es-en/src_unigram6000/spm_unigram_es.txt + data: D:/StreamSpeech/configs/es-en/src_unigram6000 loss_weight: 4.0 rdrop_alpha: 0.0 decoder_args: @@ -24,8 +24,8 @@ source_unigram: label_smoothing: 0.1 ctc_target_unigram: decoder_type: ctc - dict: /data/zhangshaolei/StreamSpeech/configs/es-en/tgt_unigram6000/spm_unigram_es.txt - data: /data/zhangshaolei/StreamSpeech/configs/es-en/tgt_unigram6000 + dict: D:/StreamSpeech/configs/es-en/tgt_unigram6000/spm_unigram_es.txt + data: D:/StreamSpeech/configs/es-en/tgt_unigram6000 loss_weight: 4.0 rdrop_alpha: 0.0 decoder_args: diff --git a/demo/.gitignore b/demo/.gitignore new file mode 100644 index 0000000..b23e4c7 --- /dev/null +++ b/demo/.gitignore @@ -0,0 +1,15 @@ +# Ignore actual paths configuration (contains local paths) +paths_config.json + +# Ignore uploads directory +uploads/ + +# Ignore Python cache +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python + +# Ignore virtual environment +streamspeech_env/ diff --git a/demo/app.py b/demo/app.py index 2e27934..311b9d0 100644 --- a/demo/app.py +++ b/demo/app.py @@ -4,6 +4,11 @@ # # StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning (ACL 2024) ########################################## +import sys +import os +# Add fairseq to Python path +sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), 'fairseq')) + from flask import Flask, request, jsonify, render_template, send_from_directory,url_for import os import json @@ -23,7 +28,10 @@ from pathlib import Path from typing import Any, Dict, Optional, Union from fairseq.data.audio.audio_utils import convert_waveform -from examples.speech_to_text.data_utils import extract_fbank_features +# Import data_utils directly from the file path +import sys +sys.path.insert(0, os.path.join(os.path.dirname(os.path.dirname(__file__)), 'fairseq', 'examples', 'speech_to_text')) +from data_utils import extract_fbank_features import ast import math import os @@ -97,9 +105,18 @@ def __call__(self, new_samples, sr=ORG_SAMPLE_RATE): + self.len_ms_to_samples(self.window_size - self.shift_size) ) samples = samples[:effective_num_samples] - waveform, sample_rate = convert_waveform( - torch.tensor([samples]), sr, to_mono=True, to_sample_rate=16000 - ) + # Simple audio conversion without sox dependency + waveform = torch.tensor([samples]) + if sr != 16000: + # Simple resampling using torch.nn.functional.interpolate + # waveform is 2D: [1, samples_length] + target_length = int(len(samples) * 16000 / sr) + # For linear interpolation, we need 3D input: [batch, channels, length] + waveform = waveform.unsqueeze(0) # Now [1, 1, samples_length] + waveform = torch.nn.functional.interpolate( + waveform, size=target_length, mode='linear', align_corners=False + ).squeeze(0) # Back to [1, target_length] + sample_rate = 16000 output = extract_fbank_features(waveform, 16000) output = self.transform(output) return 
torch.tensor(output, device=self.device)
@@ -824,7 +841,24 @@ def policy(self):

 def run(source):
     # if len(S2ST)!=0: return
-    samples, _ = soundfile.read(source, dtype="float32")
+    samples, sr = soundfile.read(source, dtype="float32")
+
+    # Resample to expected sample rate if needed
+    if sr != ORG_SAMPLE_RATE:
+        print(f"Resampling from {sr}Hz to {ORG_SAMPLE_RATE}Hz")
+        # Simple resampling using torch
+        samples_tensor = torch.tensor(samples).unsqueeze(0).unsqueeze(0)  # [1, 1, length]
+        target_length = int(len(samples) * ORG_SAMPLE_RATE / sr)
+        samples_tensor = torch.nn.functional.interpolate(
+            samples_tensor, size=target_length, mode='linear', align_corners=False
+        )
+        samples = samples_tensor.squeeze().numpy()
+
+    # Normalize input audio to prevent loud playback
+    max_val = np.max(np.abs(samples))
+    if max_val > 0:
+        samples = samples / max_val * 0.8  # Normalize and scale to 80%
+
     agent.reset()
     interval=int(agent.segment_size*(ORG_SAMPLE_RATE/1000))
@@ -856,31 +890,108 @@ def find_largest_key_value(dictionary, N):
     return dictionary[largest_key]

 def merge_audio(left_audio_path, right_audio_path, offset_ms):
-    # Read the left and right channel audio files
-    left_audio = AudioSegment.from_file(left_audio_path)
-    right_audio = AudioSegment.from_file(right_audio_path)
-
-    right_audio=AudioSegment.silent(duration=offset_ms)+right_audio
-
+    # Use soundfile instead of pydub to avoid ffmpeg dependency
+    left_data, left_sr = soundfile.read(left_audio_path, dtype='float32')
+    right_data, right_sr = soundfile.read(right_audio_path, dtype='float32')

-    # Ensure the two audio files have the same length
-    if len(left_audio) > len(right_audio):
-        right_audio += AudioSegment.silent(duration=len(left_audio) - len(right_audio))
-    elif len(left_audio) < len(right_audio):
-        left_audio += AudioSegment.silent(duration=len(right_audio) - len(left_audio))
-
-    # # Merge the left and right channel audio
-    # merged_audio = left_audio.overlay(right_audio.pan(1))
-    # # Save the merged audio file
-    # merged_audio.export(output_file, format="wav")
+    # Convert offset from ms to samples
-    return left_audio,right_audio
-
+    offset_samples = int(offset_ms * right_sr / 1000)
+
+    # Add silence at the beginning of right audio
+    right_data = np.concatenate([np.zeros(offset_samples), right_data])
+
+    # Ensure both audio files have the same length
+    max_length = max(len(left_data), len(right_data))
+
+    if len(left_data) < max_length:
+        left_data = np.concatenate([left_data, np.zeros(max_length - len(left_data))])
+    if len(right_data) < max_length:
+        right_data = np.concatenate([right_data, np.zeros(max_length - len(right_data))])
+
+    # Normalize audio data before creating AudioSegment objects
+    left_max = np.max(np.abs(left_data))
+    if left_max > 0:
+        left_data = left_data / left_max * 0.8
+
+    right_max = np.max(np.abs(right_data))
+    if right_max > 0:
+        right_data = right_data / right_max * 0.8
+
+    # Convert to int16 for AudioSegment (standard format)
+    left_data_int16 = (left_data * 32767).astype(np.int16)
+    right_data_int16 = (right_data * 32767).astype(np.int16)
+
+    # Create AudioSegment objects for compatibility with the rest of the code
+    left_audio = AudioSegment(
+        left_data_int16.tobytes(),
+        frame_rate=left_sr,
+        sample_width=2,  # int16 = 2 bytes
+        channels=1
+    )
+    right_audio = AudioSegment(
+        right_data_int16.tobytes(),
+        frame_rate=right_sr,
+        sample_width=2,  # int16 = 2 bytes
+        channels=1
+    )
+
+    # Audio normalization is now handled at the source when writing the file
+
+    return left_audio, right_audio
+
+# Flask routes will be defined after app initialization
+
+# Load main configuration
+with open('config.json', 'r') as f:
+    main_config = json.load(f)
+
+# Load paths configuration
+with open('paths_config.json', 'r') as f:
+    paths_config = json.load(f)
+
+# Merge configurations
+args_dict = main_config.copy()
+if main_config.get('use_paths_config', False):
+    # Add paths from paths_config.json
+    args_dict.update({
+        'data-bin': paths_config['configs']['data_bin'],
+        'user-dir': paths_config['configs']['user_dir'],
+        'agent-dir': paths_config['configs']['agent_dir'],
+        'model-path': paths_config['models']['simultaneous'],
+        'vocoder': paths_config['vocoder']['checkpoint'],
+        'vocoder-cfg': paths_config['vocoder']['config']
+    })
+
+# Initialize Flask app with config
 app = Flask(__name__)
-app.config['UPLOAD_FOLDER'] = 'uploads'
+# Set upload folder from paths config
+upload_folder = paths_config.get('demo', {}).get('upload_folder', 'uploads')
+app.config['UPLOAD_FOLDER'] = upload_folder
 os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)

+# Initialize agent
+parser = argparse.ArgumentParser()
+StreamSpeechS2STAgent.add_args(parser)
+
+# Create the list of arguments from args_dict
+args_list = []
+# pdb.set_trace()
+for key, value in args_dict.items():
+    # Skip non-argument fields
+    if key.startswith('_') or key in ['use_paths_config', 'language_pair']:
+        continue
+    if isinstance(value, bool):
+        if value:
+            args_list.append(f'--{key}')
+    else:
+        args_list.append(f'--{key}')
+        args_list.append(str(value))
+
+args = parser.parse_args(args_list)
+
+agent = StreamSpeechS2STAgent(args)

+# Define Flask routes
 @app.route('/')
 def index():
     return render_template('index.html')
@@ -897,71 +1008,87 @@ def upload():
         file.save(filepath)
         return filepath

-@app.route('/uploads/<filename>')
-def uploaded_file(filename):
+@app.route('/process/<path:filepath>')
+def uploaded_file(filepath):
     latency = request.args.get('latency', default=320, type=int)
     agent.set_chunk_size(latency)
-    path=app.config['UPLOAD_FOLDER']+'/'+filename
+    # Handle both full path and just filename
+    if filepath.startswith(app.config['UPLOAD_FOLDER']):
+        path = filepath
+    else:
+        path = os.path.join(app.config['UPLOAD_FOLDER'], filepath)
     # pdb.set_trace()
     # if len(S2ST)==0:
     reset()
     run(path)
-    soundfile.write('/'.join(path.split('/')[:-1])+'/output.'+path.split('/')[-1],S2ST,SAMPLE_RATE)
-    left,right=merge_audio(path, '/'.join(path.split('/')[:-1])+'/output.'+path.split('/')[-1], OFFSET_MS)
-    left.export('/'.join(path.split('/')[:-1])+'/input.'+path.split('/')[-1], format="wav")
-    right.export('/'.join(path.split('/')[:-1])+'/output.'+path.split('/')[-1], format="wav")
+    filename = os.path.basename(path)
+    output_path = os.path.join(os.path.dirname(path), 'output.'+filename)
+
+    # Normalize the audio data to prevent it from being too loud
+    if len(S2ST) > 0:
+        # Convert to numpy array and normalize
+        audio_data = np.array(S2ST, dtype=np.float32)
+        # Normalize to [-1, 1] range
+        max_val = np.max(np.abs(audio_data))
+        if max_val > 0:
+            audio_data = audio_data / max_val * 0.8  # Scale to 80% of max to be safe
+        soundfile.write(output_path, audio_data, SAMPLE_RATE)
+    else:
+        # Create silent audio if no data
+        soundfile.write(output_path, np.zeros(1000), SAMPLE_RATE)
+    left,right=merge_audio(path, output_path, OFFSET_MS)
+    input_path = os.path.join(os.path.dirname(path), 'input.'+filename)
+    left.export(input_path, format="wav")
+    right.export(output_path, format="wav")
     # left=left.split_to_mono()[0]
     # right=right.split_to_mono()[1]
     # pdb.set_trace()
     return send_from_directory(app.config['UPLOAD_FOLDER'], 'input.'+filename)

-@app.route('/uploads/output/<filename>')
-def uploaded_output_file(filename):
+@app.route('/output/<path:filepath>')
+def uploaded_output_file(filepath):
+    # Handle both full path and just filename
+    if filepath.startswith(app.config['UPLOAD_FOLDER']):
+        filename = os.path.basename(filepath)
+    else:
+        filename = filepath
     return send_from_directory(app.config['UPLOAD_FOLDER'], 'output.'+filename)

-@app.route('/asr/<float:current_time>', methods=['GET'])
+@app.route('/asr/<current_time>', methods=['GET'])
 def asr(current_time):
+    try:
+        current_time = float(current_time)
+    except ValueError:
+        return jsonify(result="")
+
     # asr_result = f"ABCD... {int(current_time * 1000)}"
     N = current_time*ORG_SAMPLE_RATE
     asr_result=find_largest_key_value(ASR, N)
     return jsonify(result=asr_result)

-@app.route('/translation/<float:current_time>', methods=['GET'])
+@app.route('/translation/<current_time>', methods=['GET'])
 def translation(current_time):
+    try:
+        current_time = float(current_time)
+    except ValueError:
+        return jsonify(result="")
+
     N = current_time*ORG_SAMPLE_RATE
     translation_result=find_largest_key_value(S2TT, N)
     # translation_result = f"1234... {int(current_time * 1000)}"
     return jsonify(result=translation_result)

-with open('/data/zhangshaolei/StreamSpeech/demo/config.json', 'r') as f:
-    args_dict = json.load(f)
-
-# Initialize agent
-parser = argparse.ArgumentParser()
-StreamSpeechS2STAgent.add_args(parser)
-
-# Create the list of arguments from args_dict
-args_list = []
-# pdb.set_trace()
-for key, value in args_dict.items():
-    if isinstance(value, bool):
-        if value:
-            args_list.append(f'--{key}')
-    else:
-        args_list.append(f'--{key}')
-        args_list.append(str(value))
-
-args = parser.parse_args(args_list)
-
-agent = StreamSpeechS2STAgent(args)
-
-
-
+@app.route('/favicon.ico')
+def favicon():
+    # Return a simple 204 No Content response to stop the 404 error
+    return '', 204

 if __name__ == '__main__':
-    app.run(host='0.0.0.0', port=7860, debug=True)
+    host = paths_config.get('demo', {}).get('host', '0.0.0.0')
+    port = paths_config.get('demo', {}).get('port', 7860)
+    app.run(host=host, port=port, debug=True)
diff --git a/demo/config.json b/demo/config.json
index 8d9b4ac..2340d62 100644
--- a/demo/config.json
+++ b/demo/config.json
@@ -1,12 +1,9 @@
 {
-    "data-bin": "/data/zhangshaolei/StreamSpeech/configs/fr-en",
-    "user-dir": "/data/zhangshaolei/StreamSpeech/researches/ctc_unity",
-    "agent-dir": "/data/zhangshaolei/StreamSpeech/agent",
-    "model-path": "/data/zhangshaolei/StreamSpeech_model/streamspeech.simultaneous.fr-en.pt",
+    "_comment": "StreamSpeech Demo Configuration - Paths are loaded from paths_config.json",
     "config-yaml": "config_gcmvn.yaml",
     "multitask-config-yaml": "config_mtl_asr_st_ctcst.yaml",
     "segment-size": 320,
-    "vocoder": "/data/zhangshaolei/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000",
-    "vocoder-cfg": "/data/zhangshaolei/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json",
-    "dur-prediction": true
+    "dur-prediction": true,
+    "language_pair": "es-en",
+    "use_paths_config": true
 }
diff --git a/demo/paths_config_template.json b/demo/paths_config_template.json
new file mode 100644
index 0000000..c6a38de
--- /dev/null
+++ b/demo/paths_config_template.json
@@ -0,0 +1,32 @@
+{
+    "_comment": "StreamSpeech Paths Configuration Template",
+    "_instructions": "Copy this file to paths_config.json and update the paths for your environment",
+    "_note": "Use forward slashes (/) for paths, even on Windows",
+
+    "streamspeech_root": "CHANGE_THIS_TO_YOUR_STREAMSPEECH_PATH",
+    "pretrain_models_root": "CHANGE_THIS_TO_YOUR_PRETRAIN_MODELS_PATH",
+
+    
"language_pair": "es-en", + + "models": { + "simultaneous": "CHANGE_THIS_TO_YOUR_SIMULTANEOUS_MODEL_PATH", + "offline": "CHANGE_THIS_TO_YOUR_OFFLINE_MODEL_PATH" + }, + + "vocoder": { + "checkpoint": "CHANGE_THIS_TO_YOUR_VOCODER_CHECKPOINT_PATH", + "config": "CHANGE_THIS_TO_YOUR_VOCODER_CONFIG_PATH" + }, + + "configs": { + "data_bin": "CHANGE_THIS_TO_YOUR_DATA_BIN_PATH", + "user_dir": "CHANGE_THIS_TO_YOUR_USER_DIR_PATH", + "agent_dir": "CHANGE_THIS_TO_YOUR_AGENT_DIR_PATH" + }, + + "demo": { + "upload_folder": "CHANGE_THIS_TO_YOUR_UPLOAD_FOLDER_PATH", + "host": "0.0.0.0", + "port": 7860 + } +} diff --git a/demo/setup_paths.py b/demo/setup_paths.py new file mode 100644 index 0000000..afc6f2c --- /dev/null +++ b/demo/setup_paths.py @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +""" +StreamSpeech Paths Setup Script + +This script helps you set up the paths_config.json file for your environment. +Run this script to automatically generate the paths configuration. +""" + +import os +import json +import sys +from pathlib import Path + +def get_streamspeech_root(): + """Get the StreamSpeech root directory""" + current_dir = Path(__file__).parent.parent.absolute() + return str(current_dir).replace('\\', '/') + +def setup_paths(): + """Set up paths configuration""" + streamspeech_root = get_streamspeech_root() + + # Default paths based on current directory structure + paths_config = { + "_comment": "StreamSpeech Paths Configuration - Auto-generated", + "_note": "Use forward slashes (/) for paths, even on Windows", + + "streamspeech_root": streamspeech_root, + "pretrain_models_root": f"{streamspeech_root}/pretrain_models", + + "language_pair": "es-en", + + "models": { + "simultaneous": f"{streamspeech_root}/pretrain_models/streamspeech.simultaneous.es-en.pt", + "offline": f"{streamspeech_root}/pretrain_models/streamspeech.offline.es-en.pt" + }, + + "vocoder": { + "checkpoint": f"{streamspeech_root}/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000", + "config": f"{streamspeech_root}/pretrain_models/unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json" + }, + + "configs": { + "data_bin": f"{streamspeech_root}/configs/es-en", + "user_dir": f"{streamspeech_root}/researches/ctc_unity", + "agent_dir": f"{streamspeech_root}/agent" + }, + + "demo": { + "upload_folder": f"{streamspeech_root}/demo/uploads", + "host": "0.0.0.0", + "port": 7860 + } + } + + # Check if files exist + print("Checking if required files exist...") + missing_files = [] + + for key, path in [ + ("Simultaneous Model", paths_config["models"]["simultaneous"]), + ("Offline Model", paths_config["models"]["offline"]), + ("Vocoder Checkpoint", paths_config["vocoder"]["checkpoint"]), + ("Vocoder Config", paths_config["vocoder"]["config"]), + ("Data Bin", paths_config["configs"]["data_bin"]), + ("User Dir", paths_config["configs"]["user_dir"]), + ("Agent Dir", paths_config["configs"]["agent_dir"]) + ]: + if os.path.exists(path): + print(f"✅ {key}: {path}") + else: + print(f"❌ {key}: {path} (NOT FOUND)") + missing_files.append((key, path)) + + if missing_files: + print(f"\n⚠️ Warning: {len(missing_files)} files/directories are missing!") + print("Please ensure all models are downloaded and paths are correct.") + response = input("Do you want to continue anyway? 
(y/N): ") + if response.lower() != 'y': + print("Setup cancelled.") + return False + + # Write the configuration file + config_path = Path(__file__).parent / "paths_config.json" + with open(config_path, 'w') as f: + json.dump(paths_config, f, indent=4) + + print(f"\n✅ Paths configuration saved to: {config_path}") + print("You can now run the StreamSpeech demo!") + + return True + +if __name__ == "__main__": + print("StreamSpeech Paths Setup") + print("=" * 30) + + if setup_paths(): + print("\n🎉 Setup completed successfully!") + print("\nNext steps:") + print("1. Activate your virtual environment: streamspeech_env\\Scripts\\activate") + print("2. Run the demo: python app.py") + print("3. Open your browser to: http://localhost:7860") + else: + print("\n❌ Setup failed. Please check the error messages above.") + sys.exit(1) diff --git a/demo/templates/index.html b/demo/templates/index.html index be31479..7574bd4 100644 --- a/demo/templates/index.html +++ b/demo/templates/index.html @@ -262,7 +262,7 @@

Simultaneous Speech-to-Speech Translation

normalize: true // Add normalize to output waveform }); - outputWaveSurfer.load(`/uploads/output/${filename}`); + outputWaveSurfer.load(`/output/${filename}`); playButton.disabled = false; playButton.style.backgroundColor = '#4CAF50'; // Change color to green @@ -301,11 +301,16 @@

Simultaneous Speech-to-Speech Translation

inputWaveSurfer.on('finish', function() { updateASRResult(inputWaveSurfer.getCurrentTime()); updateTranslationResult(inputWaveSurfer.getCurrentTime()); + + // Continue playing output audio even after input finishes + if (outputWaveSurfer && !outputWaveSurfer.isPlaying()) { + outputWaveSurfer.play(); + } }); }); // Pass the latency parameter to the server - inputWaveSurfer.load(`/uploads/${filename}?latency=${latency}`); + inputWaveSurfer.load(`/process/${filename}?latency=${latency}`); }) .catch(error => console.error('Error:', error)); }); @@ -330,7 +335,14 @@

Simultaneous Speech-to-Speech Translation

inputWaveSurfer.playPause(); } if (outputWaveSurfer) { - outputWaveSurfer.playPause(); + // If input is finished but output is still playing, just control output + if (inputWaveSurfer && inputWaveSurfer.isFinished() && outputWaveSurfer.isPlaying()) { + outputWaveSurfer.pause(); + } else if (inputWaveSurfer && inputWaveSurfer.isFinished() && !outputWaveSurfer.isPlaying()) { + outputWaveSurfer.play(); + } else { + outputWaveSurfer.playPause(); + } } } diff --git a/fairseq/examples/speech_to_text/__init__.py b/fairseq/examples/speech_to_text/__init__.py new file mode 100644 index 0000000..6264236 --- /dev/null +++ b/fairseq/examples/speech_to_text/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. diff --git a/fairseq/fairseq/models/speech_to_speech/modules/__init__.py b/fairseq/fairseq/models/speech_to_speech/modules/__init__.py index e69de29..6293554 100644 --- a/fairseq/fairseq/models/speech_to_speech/modules/__init__.py +++ b/fairseq/fairseq/models/speech_to_speech/modules/__init__.py @@ -0,0 +1 @@ +# Speech-to-Speech modules diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..fe130e8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,51 @@ +# StreamSpeech Requirements +# Python 3.9+ required + +# Core PyTorch dependencies +torch>=2.0.0 +torchvision>=0.15.0 +torchaudio>=2.0.0 + +# Core ML/AI packages +fairseq>=0.12.0 +numpy>=1.21.0 +pandas>=1.3.0 + +# Audio processing +soundfile>=0.12.0 +pydub>=0.25.0 +librosa>=0.9.0 + +# Web framework +Flask>=2.0.0 +Werkzeug>=2.0.0 + +# Configuration and utilities +PyYAML>=6.0.0 +omegaconf>=2.0.0 +hydra-core>=1.0.0 +tqdm>=4.60.0 +regex>=2022.0.0 +sacrebleu>=2.0.0 +bitarray>=2.0.0 + +# Development and testing +pytest>=7.0.0 +pytest-cov>=4.0.0 +pytest-flake8>=1.0.0 +flake8>=5.0.0 + +# Additional utilities +colorama>=0.4.0 +tabulate>=0.9.0 +lxml>=4.0.0 +portalocker>=2.0.0 +tornado>=6.0.0 +textgrid>=1.5.0 +yt-dlp>=2023.0.0 + +# System dependencies (Windows) +pywin32>=300; sys_platform == "win32" + +# Note: SimulEval should be installed separately in editable mode: +# cd SimulEval && pip install --editable ./ From 0a6c918a52cb8b034a7c0220fc34ad8a17b6ffb0 Mon Sep 17 00:00:00 2001 From: ronliwag Date: Tue, 21 Oct 2025 01:54:12 +0800 Subject: [PATCH 2/7] added drive folder for pretrained model download --- SETUP_INSTRUCTIONS.md | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/SETUP_INSTRUCTIONS.md b/SETUP_INSTRUCTIONS.md index 2a6305f..f779b8b 100644 --- a/SETUP_INSTRUCTIONS.md +++ b/SETUP_INSTRUCTIONS.md @@ -77,7 +77,27 @@ python -c "import flask; print('Flask: OK')" ### 1. Download Pre-trained Models -Create a `pretrain_models` directory and download the required models: +**🚀 Fast Download Option (Recommended):** + +Download all pre-trained models from Google Drive for faster speeds: + +**[📁 Download Pre-trained Models from Google Drive](https://drive.google.com/drive/folders/1C4Y0sq_-tSRSbbu8dt0QGRQsk4h-9v5m?usp=drive_link)** + +1. Click the link above to access the Google Drive folder +2. Download the entire `pretrain_models` folder +3. 
Extract it to your StreamSpeech root directory + +The folder contains: +- **StreamSpeech Models**: All language pairs (French-English, Spanish-English, German-English) + - `streamspeech.simultaneous.[lang]-en.pt` (simultaneous translation) + - `streamspeech.offline.[lang]-en.pt` (offline translation) +- **HiFi-GAN Vocoder**: Complete unit-based vocoder with config + - `unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/g_00500000` + - `unit-based_HiFi-GAN_vocoder/mHuBERT.layer11.km1000.en/config.json` + +**Alternative Download (Original Sources):** + +If you prefer to download from original sources: ```powershell mkdir pretrain_models From cba44ccab73ef0ca2081bffbd42853b6c0fffaab Mon Sep 17 00:00:00 2001 From: ronliwag Date: Fri, 7 Nov 2025 02:56:34 +0800 Subject: [PATCH 3/7] fixed setup --- SETUP_COMPLETE.md | 210 ++++++++++++++++++++++++++++++++++++++++++++++ demo/app.py | 34 +++++--- 2 files changed, 232 insertions(+), 12 deletions(-) create mode 100644 SETUP_COMPLETE.md diff --git a/SETUP_COMPLETE.md b/SETUP_COMPLETE.md new file mode 100644 index 0000000..e56410d --- /dev/null +++ b/SETUP_COMPLETE.md @@ -0,0 +1,210 @@ +# StreamSpeech Setup Complete! 🎉 + +## Virtual Environment Status +✅ **Virtual environment created**: `streamspeech_env` +✅ **All dependencies installed** +✅ **Fairseq configured** (via Python path) +✅ **SimulEval installed** (editable mode) + +## Installed Packages +- **PyTorch 2.0.1** with CUDA 11.8 support +- **TorchVision & TorchAudio** (compatible versions) +- **Fairseq** (custom version from local directory) +- **SimulEval 1.1.0** (for evaluation) +- **Flask** (for web demo) +- **Audio processing**: soundfile, librosa, pydub +- **ML utilities**: numpy, pandas, scipy, scikit-learn +- **Configuration**: PyYAML, omegaconf, hydra-core +- **Other tools**: tensorboardX, sacrebleu, tqdm, and more + +## CUDA Status +✅ **CUDA is available** on your system - GPU acceleration is ready! 
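
To confirm which device is visible and whether it meets the ~8GB memory guidance from SETUP_INSTRUCTIONS.md, here is a quick check using only standard PyTorch APIs (run it inside the activated environment; checking device 0 is an assumption for a single-GPU machine):

```python
import torch

# Report the visible CUDA device and its memory before downloading models.
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    print(f"GPU: {props.name}")
    print(f"VRAM: {props.total_memory / 1024**3:.1f} GB")  # ~8 GB+ recommended
else:
    print("No CUDA device visible - the demo will run on CPU (much slower).")
```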
+ +--- + +## 📥 Required Models to Download + +You need to download the following pre-trained models to use StreamSpeech: + +### Option 1: Quick Download (Recommended) +**All models are available on Hugging Face:** +https://huggingface.co/ICTNLP/StreamSpeech_Models + +### Option 2: Download Individual Models + +#### 1️⃣ **StreamSpeech Models** (Choose your language pair) + +**French → English:** +- **Simultaneous**: [streamspeech.simultaneous.fr-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.simultaneous.fr-en.pt) (~1.2 GB) +- **Offline**: [streamspeech.offline.fr-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.offline.fr-en.pt) (~1.2 GB) +- **Unity baseline**: [unity.fr-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/unity.fr-en.pt) (~1.2 GB) + +**Spanish → English:** +- **Simultaneous**: [streamspeech.simultaneous.es-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.simultaneous.es-en.pt) (~1.2 GB) +- **Offline**: [streamspeech.offline.es-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.offline.es-en.pt) (~1.2 GB) +- **Unity baseline**: [unity.es-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/unity.es-en.pt) (~1.2 GB) + +**German → English:** +- **Simultaneous**: [streamspeech.simultaneous.de-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.simultaneous.de-en.pt) (~1.2 GB) +- **Offline**: [streamspeech.offline.de-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/streamspeech.offline.de-en.pt) (~1.2 GB) +- **Unity baseline**: [unity.de-en.pt](https://huggingface.co/ICTNLP/StreamSpeech_Models/blob/main/unity.de-en.pt) (~1.2 GB) + +#### 2️⃣ **Unit-based HiFi-GAN Vocoder** (Required for speech synthesis) + +**For English output:** +- **Checkpoint**: [g_00500000](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/g_00500000) (~55 MB) +- **Config**: [config.json](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_lj/config.json) (~1 KB) + +**For Spanish output (if needed):** +- **Checkpoint**: [g_00500000](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/g_00500000) +- **Config**: [config.json](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_es_css10/config.json) + +**For French output (if needed):** +- **Checkpoint**: [g_00500000](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/g_00500000) +- **Config**: [config.json](https://dl.fbaipublicfiles.com/fairseq/speech_to_speech/vocoder/code_hifigan/mhubert_vp_en_es_fr_it3_400k_layer11_km1000_fr_css10/config.json) + +#### 3️⃣ **mHuBERT Model** (For unit extraction) +- **Model**: [mhubert_base_vp_en_es_fr_it3.pt](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3.pt) (~316 MB) +- **K-means**: [mhubert_base_vp_en_es_fr_it3_L11_km1000.bin](https://dl.fbaipublicfiles.com/hubert/mhubert_base_vp_en_es_fr_it3_L11_km1000.bin) (~4 MB) + +--- + +## 📁 Recommended Directory Structure + +After downloading, organize your models like this: + +``` +D:\StreamSpeech\ +├── pretrain_models\ +│ ├── streamspeech.simultaneous.fr-en.pt +│ ├── streamspeech.offline.fr-en.pt +│ 
├── unit-based_HiFi-GAN_vocoder\ +│ │ ├── mHuBERT.layer11.km1000.en\ +│ │ │ ├── g_00500000 +│ │ │ └── config.json +│ │ ├── mHuBERT.layer11.km1000.es\ +│ │ │ ├── g_00500000 +│ │ │ └── config.json +│ │ └── mHuBERT.layer11.km1000.fr\ +│ │ ├── g_00500000 +│ │ └── config.json +│ └── mHuBERT\ +│ ├── mhubert_base_vp_en_es_fr_it3.pt +│ └── mhubert_base_vp_en_es_fr_it3_L11_km1000.bin +└── ... (other project files) +``` + +**Create the directories:** +```powershell +mkdir pretrain_models +mkdir pretrain_models\unit-based_HiFi-GAN_vocoder\mHuBERT.layer11.km1000.en +mkdir pretrain_models\unit-based_HiFi-GAN_vocoder\mHuBERT.layer11.km1000.es +mkdir pretrain_models\unit-based_HiFi-GAN_vocoder\mHuBERT.layer11.km1000.fr +mkdir pretrain_models\mHuBERT +``` + +Then download the models into their respective directories. + +--- + +## 🚀 Quick Start Guide + +### 1. Activate the Environment +```powershell +.\streamspeech_env\Scripts\Activate.ps1 +``` + +### 2. Test the Installation +```powershell +python -c "import torch; print('CUDA:', torch.cuda.is_available())" +``` + +### 3. Run Example Inference (after downloading models) + +**Simultaneous Speech-to-Speech Translation:** +```powershell +$env:CUDA_VISIBLE_DEVICES="0" +$ROOT="D:\StreamSpeech" +$PRETRAIN_ROOT="D:\StreamSpeech\pretrain_models" +$LANG="fr" + +$env:PYTHONPATH="$ROOT\fairseq" +simuleval --data-bin "$ROOT\configs\$LANG-en" ` + --user-dir "$ROOT\researches\ctc_unity" ` + --agent-dir "$ROOT\agent" ` + --source "$ROOT\example\wav_list.txt" ` + --target "$ROOT\example\target.txt" ` + --model-path "$PRETRAIN_ROOT\streamspeech.simultaneous.$LANG-en.pt" ` + --config-yaml config_gcmvn.yaml ` + --multitask-config-yaml config_mtl_asr_st_ctcst.yaml ` + --agent "$ROOT\agent\speech_to_speech.streamspeech.agent.py" ` + --vocoder "$PRETRAIN_ROOT\unit-based_HiFi-GAN_vocoder\mHuBERT.layer11.km1000.en\g_00500000" ` + --vocoder-cfg "$PRETRAIN_ROOT\unit-based_HiFi-GAN_vocoder\mHuBERT.layer11.km1000.en\config.json" ` + --dur-prediction ` + --source-segment-size 320 ` + --device gpu ` + --computation-aware ` + --output-asr-translation True +``` + +### 4. Run Web Demo (after downloading models) +```powershell +cd demo +python app.py +``` +Then open your browser to `http://localhost:7860` + +--- + +## 📋 Summary of What You Need + +**For basic S2ST (French→English):** +1. ✅ Environment (already set up) +2. ⬇️ `streamspeech.simultaneous.fr-en.pt` (~1.2 GB) +3. ⬇️ HiFi-GAN vocoder for English (`g_00500000` + `config.json`) (~55 MB) +4. ⬇️ mHuBERT model (`.pt` file) (~316 MB) +5. ⬇️ mHuBERT k-means (`.bin` file) (~4 MB) + +**Total download size: ~1.6 GB** + +--- + +## 💡 Next Steps + +1. **Download Models**: Start with French→English simultaneous model and English vocoder +2. **Update Config Files**: Edit paths in `configs/fr-en/config_gcmvn.yaml` and `config_mtl_asr_st_ctcst.yaml` +3. **Test with Examples**: Use the provided example audio files in `example/wavs/` +4. **Explore Features**: Try different tasks (ASR, S2TT, S2ST) with different latency settings + +--- + +## 🔧 Troubleshooting + +**Issue**: ImportError for fairseq +**Solution**: Make sure the virtual environment is activated. The `.pth` file automatically adds fairseq to the path. 
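
If the `.pth` file is ever missing (for example after recreating the environment), it can be regenerated with a short sketch like the one below; the name `fairseq_local.pth` is arbitrary, and the fairseq path should match your checkout:

```python
import sysconfig
from pathlib import Path

# A .pth file in site-packages is read at interpreter startup; every line
# in it is appended to sys.path. Point one at the local fairseq checkout.
site_packages = Path(sysconfig.get_paths()["purelib"])
pth_file = site_packages / "fairseq_local.pth"  # arbitrary file name
pth_file.write_text("D:/StreamSpeech/fairseq\n")
print(f"Wrote {pth_file}")
```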
**Issue**: CUDA out of memory
**Solution**: Use CPU mode by setting `--device cpu` or reduce batch size

**Issue**: Module not found
**Solution**: Ensure PYTHONPATH includes the fairseq directory:
```powershell
$env:PYTHONPATH="D:\StreamSpeech\fairseq"
```

---

## 📚 Resources

- **Paper**: https://arxiv.org/abs/2406.03049
- **Demo Site**: https://ictnlp.github.io/StreamSpeech-site/
- **Model Hub**: https://huggingface.co/ICTNLP/StreamSpeech_Models
- **GitHub**: https://github.com/ictnlp/StreamSpeech

---

**Environment created on**: November 6, 2025
**Python version**: 3.10
**PyTorch version**: 2.0.1 + CUDA 11.8
**GPU Support**: ✅ Enabled

diff --git a/demo/app.py b/demo/app.py
index 311b9d0..1a74b98 100644
--- a/demo/app.py
+++ b/demo/app.py
@@ -841,7 +841,22 @@ def policy(self):

 def run(source):
     # if len(S2ST)!=0: return
-    samples, sr = soundfile.read(source, dtype="float32")
+
+    # Handle MP3 files by converting to WAV first
+    if source.lower().endswith('.mp3'):
+        print(f"Converting MP3 to WAV: {source}")
+        audio = AudioSegment.from_mp3(source)
+        # Create a temporary WAV file
+        wav_path = source.rsplit('.', 1)[0] + '_temp.wav'
+        audio.export(wav_path, format='wav')
+        samples, sr = soundfile.read(wav_path, dtype="float32")
+        # Clean up temp file
+        try:
+            os.remove(wav_path)
+        except:
+            pass
+    else:
+        samples, sr = soundfile.read(source, dtype="float32")

     # Resample to expected sample rate if needed
     if sr != ORG_SAMPLE_RATE:
@@ -1006,18 +1021,16 @@ def upload():
     if file:
         filepath = os.path.join(app.config['UPLOAD_FOLDER'], file.filename)
         file.save(filepath)
-        return filepath
+        # Return just the filename, not the full path
+        return file.filename

 @app.route('/process/<path:filepath>')
 def uploaded_file(filepath):
     latency = request.args.get('latency', default=320, type=int)
     agent.set_chunk_size(latency)
-    # Handle both full path and just filename
-    if filepath.startswith(app.config['UPLOAD_FOLDER']):
-        path = filepath
-    else:
-        path = os.path.join(app.config['UPLOAD_FOLDER'], filepath)
+    # Construct full path from upload folder and filename
+    path = os.path.join(app.config['UPLOAD_FOLDER'], filepath)
     # pdb.set_trace()
     # if len(S2ST)==0:
     reset()
@@ -1048,11 +1061,8 @@ def uploaded_file(filepath):

 @app.route('/output/<path:filepath>')
 def uploaded_output_file(filepath):
-    # Handle both full path and just filename
-    if filepath.startswith(app.config['UPLOAD_FOLDER']):
-        filename = os.path.basename(filepath)
-    else:
-        filename = filepath
+    # filepath is just the filename
+    filename = filepath
     return send_from_directory(app.config['UPLOAD_FOLDER'], 'output.'+filename)

From 2fb7e9171506190495576ac6f75e640002d82acf Mon Sep 17 00:00:00 2001
From: ronliwag
Date: Fri, 7 Nov 2025 03:37:21 +0800
Subject: [PATCH 4/7] added audio and discrete unit extraction

---
 EXTRACTION_GUIDE.md           | 396 ++++++++++++++++++++++++++++++++++
 EXTRACTION_QUICK_REFERENCE.md | 103 +++++++++
 demo/app.py                   |  20 ++
 demo/extract_intermediates.py | 173 +++++++++++++++
 4 files changed, 692 insertions(+)
 create mode 100644 EXTRACTION_GUIDE.md
 create mode 100644 EXTRACTION_QUICK_REFERENCE.md
 create mode 100644 demo/extract_intermediates.py

diff --git a/EXTRACTION_GUIDE.md b/EXTRACTION_GUIDE.md
new file mode 100644
index 0000000..bc8deae
--- /dev/null
+++ b/EXTRACTION_GUIDE.md
@@ -0,0 +1,396 @@
+# StreamSpeech Intermediate Data Extraction Guide
+
+This guide shows you **exactly where** to extract the source Spanish audio and discrete speech units from StreamSpeech.
+
+---
+
+## 📁 What You'll Extract
+
+1.
**Source Spanish Audio Input** (`.wav` file) + - Original/resampled Spanish audio before feature extraction + - Location: `demo/app.py`, `run()` function + +2. **Discrete Speech Units** (`.pt` file) + - Integer codes representing phonetic units (before vocoder) + - Location: `agent/speech_to_speech.streamspeech.agent.py`, `policy()` method + +--- + +## 🎯 Code Location 1: Source Audio Input + +### File: `demo/app.py` + +**Location**: In the `run()` function, around **line 859** + +### Current Code: +```python +def run(source): + # if len(S2ST)!=0: return + + # Handle MP3 files by converting to WAV first + if source.lower().endswith('.mp3'): + print(f"Converting MP3 to WAV: {source}") + audio = AudioSegment.from_mp3(source) + # Create a temporary WAV file + wav_path = source.rsplit('.', 1)[0] + '_temp.wav' + audio.export(wav_path, format='wav') + samples, sr = soundfile.read(wav_path, dtype="float32") + # Clean up temp file + try: + os.remove(wav_path) + except: + pass + else: + samples, sr = soundfile.read(source, dtype="float32") + + # Resample to expected sample rate if needed + if sr != ORG_SAMPLE_RATE: + print(f"Resampling from {sr}Hz to {ORG_SAMPLE_RATE}Hz") + # Simple resampling using torch + samples_tensor = torch.tensor(samples).unsqueeze(0).unsqueeze(0) + target_length = int(len(samples) * ORG_SAMPLE_RATE / sr) + samples_tensor = torch.nn.functional.interpolate( + samples_tensor, size=target_length, mode='linear', align_corners=False + ) + samples = samples_tensor.squeeze().numpy() + + # Normalize input audio to prevent loud playback + max_val = np.max(np.abs(samples)) + if max_val > 0: + samples = samples / max_val * 0.8 + + # 👇 ADD EXTRACTION CODE HERE 👇 +``` + +### Modified Code (ADD THIS): +```python + # Normalize input audio to prevent loud playback + max_val = np.max(np.abs(samples)) + if max_val > 0: + samples = samples / max_val * 0.8 + + # ========================================== + # EXTRACT SOURCE AUDIO AT 16kHz + # ========================================== + # Save source audio for analysis (resampled to 16kHz) + import soundfile, torch + source_filename = os.path.basename(source).rsplit('.', 1)[0] + extract_dir = os.path.join(os.path.dirname(__file__), 'extracted_intermediates') + os.makedirs(extract_dir, exist_ok=True) + + # Resample to 16kHz (model's processing rate) + TARGET_SR = 16000 + if ORG_SAMPLE_RATE != TARGET_SR: + samples_tensor = torch.tensor(samples).unsqueeze(0).unsqueeze(0) + target_length = int(len(samples) * TARGET_SR / ORG_SAMPLE_RATE) + samples_tensor = torch.nn.functional.interpolate( + samples_tensor, size=target_length, mode='linear', align_corners=False + ) + samples_16k = samples_tensor.squeeze().numpy() + else: + samples_16k = samples + + source_audio_path = os.path.join(extract_dir, f"{source_filename}_source_audio_16k.wav") + soundfile.write(source_audio_path, samples_16k, TARGET_SR) + print(f"✅ EXTRACTED: Source audio saved to {source_audio_path}") + print(f" Sample rate: {TARGET_SR} Hz (16kHz), Duration: {len(samples_16k)/TARGET_SR:.2f}s") + # ========================================== + + agent.reset() + # ... 
rest of the function +``` + +--- + +## 🎯 Code Location 2: Discrete Speech Units + +### File: `agent/speech_to_speech.streamspeech.agent.py` + +**Location**: In the `policy()` method, around **line 713-748** + +### Current Code: +```python + for i, hypo in enumerate(finalized): + i_beam = 0 + tmp = hypo[i_beam]["tokens"].int() # hyp + eos + if tmp[-1] == self.generator.eos: + tmp = tmp[:-1] + unit = [] + for c in tmp: + u = self.generator.tgt_dict[c].replace("", "").replace("", "") + if u != "": + unit.append(int(u)) + + if len(unit) > 0 and unit[0] == " ": + unit = unit[1:] + text = " ".join([str(_) for _ in unit]) + if self.states.source_finished and not self.quiet: + with open(self.unit_file, "a") as file: + print(text, file=file) + cur_unit = unit if self.unit is None else unit[len(self.unit) :] + if len(unit) < 1 or len(cur_unit) < 1: + # ... return ReadAction or WriteAction + + x = { + "code": torch.tensor(unit, dtype=torch.long, device=self.device).view( + 1, -1 + ), + } + wav, dur = self.vocoder(x, self.dur_prediction) +``` + +### Modified Code (ADD THIS): +```python + for i, hypo in enumerate(finalized): + i_beam = 0 + tmp = hypo[i_beam]["tokens"].int() # hyp + eos + if tmp[-1] == self.generator.eos: + tmp = tmp[:-1] + unit = [] + for c in tmp: + u = self.generator.tgt_dict[c].replace("", "").replace("", "") + if u != "": + unit.append(int(u)) + + if len(unit) > 0 and unit[0] == " ": + unit = unit[1:] + text = " ".join([str(_) for _ in unit]) + if self.states.source_finished and not self.quiet: + with open(self.unit_file, "a") as file: + print(text, file=file) + cur_unit = unit if self.unit is None else unit[len(self.unit) :] + if len(unit) < 1 or len(cur_unit) < 1: + # ... return ReadAction or WriteAction + + # ========================================== + # EXTRACT DISCRETE UNITS (before vocoder) + # ========================================== + # Only save when source is finished (final complete units) + if self.states.source_finished and len(unit) > 0: + import torch + import os + from datetime import datetime + + extract_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'demo', 'extracted_intermediates') + os.makedirs(extract_dir, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + units_pt_path = os.path.join(extract_dir, f"discrete_units_{timestamp}.pt") + units_txt_path = os.path.join(extract_dir, f"discrete_units_{timestamp}.txt") + + # Save as PyTorch tensor + units_tensor = torch.tensor(unit, dtype=torch.long) + torch.save(units_tensor, units_pt_path) + + # Also save as readable text + with open(units_txt_path, 'w') as f: + f.write(' '.join(map(str, unit)) + '\n') + f.write(f"\n# Number of units: {len(unit)}\n") + f.write(f"# Unit range: [{min(unit)}, {max(unit)}]\n") + + print(f"✅ EXTRACTED: Discrete units saved to {units_pt_path}") + print(f" Number of units: {len(unit)}, Range: [{min(unit)}, {max(unit)}]") + # ========================================== + + x = { + "code": torch.tensor(unit, dtype=torch.long, device=self.device).view( + 1, -1 + ), + } + wav, dur = self.vocoder(x, self.dur_prediction) +``` + +--- + +## 📊 Data Flow Visualization + +``` +┌─────────────────────────────────────────────────────────────┐ +│ StreamSpeech Pipeline │ +└─────────────────────────────────────────────────────────────┘ + +1. 
Source Spanish Audio (MP3/WAV) + │ + ├─> Load & Resample (demo/app.py, run()) + │ ├─> samples: numpy array (48000 Hz) + │ └─> 💾 EXTRACT HERE: source_audio.wav + │ + ├─> Feature Extraction (agent, OnlineFeatureExtractor) + │ └─> fbank features (80-dim) + │ + ├─> StreamSpeech Model (encoder + decoder) + │ ├─> ASR output (Spanish text) + │ ├─> Translation output (English text) + │ └─> Text-to-Unit decoder + │ + ├─> Discrete Speech Units (agent, policy()) + │ ├─> unit: list of integers [234, 567, 891, ...] + │ └─> 💾 EXTRACT HERE: discrete_units.pt + │ + ├─> HiFi-GAN Vocoder (CodeHiFiGAN) + │ └─> Synthesized English speech (16000 Hz) + │ + └─> Output Audio (WAV) +``` + +--- + +## 🔍 Understanding the Extracted Data + +### Source Audio (`.wav` file) +- **Format**: WAV, float32 +- **Sample Rate**: 16000 Hz (16kHz - model's processing rate) +- **Content**: Original Spanish speech, resampled and normalized to [-0.8, 0.8] +- **Use Case**: Input to acoustic feature extraction (same rate the model uses) + +### Discrete Units (`.pt` file) +- **Format**: PyTorch tensor (torch.long) +- **Content**: Integer codes representing phonetic units +- **Range**: Typically 0-999 (for 1000-unit codebook) +- **Length**: Variable, depends on speech duration +- **Example**: `tensor([234, 567, 891, 123, 456, ...])` + +**How to Load:** +```python +import torch + +# Load units +units = torch.load('discrete_units_20251107_123456.pt') +print(f"Shape: {units.shape}") +print(f"Units: {units}") + +# Or load as text +with open('discrete_units_20251107_123456.txt', 'r') as f: + units_str = f.readline().strip() + units_list = [int(x) for x in units_str.split()] +``` + +--- + +## 📂 Output Structure + +After running the demo, you'll find: + +``` +demo/ +├── extracted_intermediates/ +│ ├── common_voice_es_18311412_source_audio_16k.wav +│ ├── discrete_units_20251107_025030.pt +│ ├── discrete_units_20251107_025030.txt +│ ├── another_audio_source_audio_16k.wav +│ └── discrete_units_20251107_030145.pt +└── ... +``` + +--- + +## 🚀 Quick Implementation + +### Option 1: Manual Copy-Paste (Recommended) +1. Open `demo/app.py` +2. Find line ~859 (after `samples = samples / max_val * 0.8`) +3. Copy-paste the "EXTRACT SOURCE AUDIO" code block +4. Open `agent/speech_to_speech.streamspeech.agent.py` +5. Find line ~742 (before `x = {"code": ...}`) +6. Copy-paste the "EXTRACT DISCRETE UNITS" code block +7. Restart the demo app + +### Option 2: Use the Helper Script +The `demo/extract_intermediates.py` file contains reusable functions you can import. + +--- + +## 🧪 Testing + +1. Start the demo: + ```powershell + cd demo + python app.py + ``` + +2. Upload a Spanish audio file + +3. Process it + +4. Check the console output for: + ``` + ✅ EXTRACTED: Source audio saved to extracted_intermediates/... + ✅ EXTRACTED: Discrete units saved to extracted_intermediates/... + ``` + +5. 
Verify files in `demo/extracted_intermediates/` + +--- + +## 📝 Notes + +- **Source audio** is saved at the beginning of processing (immediately available) +- **Discrete units** are saved only when `source_finished=True` (at the end) +- Both use timestamps to avoid overwriting files +- `.txt` files are human-readable for inspection +- `.pt` files can be loaded back into PyTorch for further processing + +--- + +## 🔬 Advanced: Using the Extracted Data + +### Analyzing Discrete Units +```python +import torch +import matplotlib.pyplot as plt + +# Load units +units = torch.load('discrete_units_20251107_025030.pt') + +# Statistics +print(f"Total units: {len(units)}") +print(f"Unique units: {len(torch.unique(units))}") +print(f"Most common unit: {torch.mode(units).values.item()}") + +# Histogram +plt.hist(units.numpy(), bins=50) +plt.xlabel('Unit Index') +plt.ylabel('Frequency') +plt.title('Discrete Unit Distribution') +plt.show() +``` + +### Analyzing Source Audio +```python +import soundfile +import numpy as np + +# Load 16kHz source audio +audio, sr = soundfile.read('common_voice_es_18311412_source_audio_16k.wav') +print(f"Sample rate: {sr} Hz (should be 16000)") +print(f"Duration: {len(audio)/sr:.2f} seconds") +print(f"Shape: {audio.shape}") +print(f"Range: [{audio.min():.3f}, {audio.max():.3f}]") +``` + +### Reusing Units with Vocoder +```python +import torch + +# Load saved units +units = torch.load('discrete_units_20251107_025030.pt') + +# Feed directly to vocoder (without running full model) +# This would be in the agent context with vocoder loaded +x = { + "code": units.view(1, -1).to(device) +} +wav, dur = vocoder(x, dur_prediction=True) +# wav is now synthesized speech! +``` + +--- + +## ✅ Summary + +**Two extraction points:** +1. 📍 `demo/app.py:859` → Save source Spanish audio WAV +2. 📍 `agent/speech_to_speech.streamspeech.agent.py:742` → Save discrete units PT + +Both files will be in `demo/extracted_intermediates/` directory. 
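
Since the `.pt` and `.txt` files are written independently, a quick cross-check confirms they describe the same unit sequence. A minimal sketch — the timestamped filename is a placeholder from the examples above, not a file this guide guarantees to exist:

```python
import torch

# Placeholder stem; substitute the files your run actually produced
stem = 'discrete_units_20251107_025030'

pt_units = torch.load(f'{stem}.pt').tolist()
with open(f'{stem}.txt') as f:
    # The first line holds the space-separated units; later lines are comments
    txt_units = [int(x) for x in f.readline().split()]

assert pt_units == txt_units, "pt/txt unit sequences disagree"
print(f"OK: {len(pt_units)} units, .pt and .txt agree")
```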
+ diff --git a/EXTRACTION_QUICK_REFERENCE.md b/EXTRACTION_QUICK_REFERENCE.md new file mode 100644 index 0000000..4e64029 --- /dev/null +++ b/EXTRACTION_QUICK_REFERENCE.md @@ -0,0 +1,103 @@ +# 🎯 Quick Reference: Extract Intermediates from StreamSpeech + +## Two Code Locations + +### 1️⃣ Source Spanish Audio @ 16kHz → `demo/app.py` line ~859 + +**Add after**: `samples = samples / max_val * 0.8` + +```python +# Save source audio at 16kHz +import soundfile, os, torch +source_filename = os.path.basename(source).rsplit('.', 1)[0] +extract_dir = os.path.join(os.path.dirname(__file__), 'extracted_intermediates') +os.makedirs(extract_dir, exist_ok=True) + +# Resample to 16kHz (model's processing rate) +TARGET_SR = 16000 +if ORG_SAMPLE_RATE != TARGET_SR: + samples_tensor = torch.tensor(samples).unsqueeze(0).unsqueeze(0) + target_length = int(len(samples) * TARGET_SR / ORG_SAMPLE_RATE) + samples_16k = torch.nn.functional.interpolate( + samples_tensor, size=target_length, mode='linear', align_corners=False + ).squeeze().numpy() +else: + samples_16k = samples + +source_audio_path = os.path.join(extract_dir, f"{source_filename}_source_audio_16k.wav") +soundfile.write(source_audio_path, samples_16k, TARGET_SR) +print(f"✅ Source audio (16kHz): {source_audio_path}") +``` + +**Output**: `demo/extracted_intermediates/_source_audio_16k.wav` + +--- + +### 2️⃣ Discrete Units → `agent/speech_to_speech.streamspeech.agent.py` line ~742 + +**Add before**: `x = {"code": torch.tensor(unit, ...}` + +```python +# Save discrete units +if self.states.source_finished and len(unit) > 0: + import torch, os + from datetime import datetime + extract_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'demo', 'extracted_intermediates') + os.makedirs(extract_dir, exist_ok=True) + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + # Save as PyTorch tensor + units_tensor = torch.tensor(unit, dtype=torch.long) + torch.save(units_tensor, os.path.join(extract_dir, f"units_{timestamp}.pt")) + + # Save as text + with open(os.path.join(extract_dir, f"units_{timestamp}.txt"), 'w') as f: + f.write(' '.join(map(str, unit))) + print(f"✅ Discrete units: {len(unit)} units saved") +``` + +**Output**: +- `demo/extracted_intermediates/units_.pt` +- `demo/extracted_intermediates/units_.txt` + +--- + +## 📊 What You Get + +| File | Format | Content | Size | +|------|--------|---------|------| +| `*_source_audio_16k.wav` | WAV, **16kHz**, float32 | Spanish speech at model's rate | ~1-2 MB/min | +| `units_*.pt` | PyTorch tensor | Discrete phonetic codes | ~1-2 KB | +| `units_*.txt` | Text | Human-readable units | ~1-2 KB | + +--- + +## 🔬 Usage + +### Load Source Audio +```python +import soundfile +audio, sr = soundfile.read('common_voice_es_18311412_source_audio_16k.wav') +print(f"Sample rate: {sr} Hz") # Should be 16000 +``` + +### Load Discrete Units +```python +import torch +units = torch.load('units_20251107_025030.pt') +# tensor([234, 567, 891, ...]) +``` + +--- + +## ✨ Quick Test + +1. Add the two code snippets above +2. Restart demo: `python demo/app.py` +3. Upload & process Spanish audio +4. Check `demo/extracted_intermediates/` folder + +--- + +See **EXTRACTION_GUIDE.md** for detailed explanations and advanced usage. 
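
---

## 🗂️ List What Was Extracted

Step 4 of the quick test can be scripted instead of eyeballed. A minimal sketch, assuming it is run from the repository root:

```python
# List extraction outputs, newest first
from pathlib import Path

extract_dir = Path('demo/extracted_intermediates')
for p in sorted(extract_dir.iterdir(), key=lambda f: f.stat().st_mtime, reverse=True):
    print(f"{p.stat().st_size:>10,} bytes  {p.name}")
```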
+ diff --git a/demo/app.py b/demo/app.py index 1a74b98..ebaee3e 100644 --- a/demo/app.py +++ b/demo/app.py @@ -805,6 +805,18 @@ def policy(self): finished=True, ) + # Extract discrete units before vocoder + if self.states.source_finished and len(unit) > 0: + try: + from extract_intermediates import save_discrete_units + # Use source filename if available, otherwise default to "output" + filename_prefix = getattr(self, 'current_source_filename', 'output') + save_discrete_units(unit, filename_prefix=f"{filename_prefix}_output") + except Exception as e: + import traceback + print(f"⚠️ Failed to save discrete units: {e}") + traceback.print_exc() + x = { "code": torch.tensor(unit, dtype=torch.long, device=self.device).view( 1, -1 @@ -858,6 +870,14 @@ def run(source): else: samples, sr = soundfile.read(source, dtype="float32") + # Extract source audio at 16kHz + source_filename = os.path.basename(source).split('.')[0] + from extract_intermediates import save_source_audio + save_source_audio(samples, ORG_SAMPLE_RATE, filename_prefix=source_filename) + + # Store filename for later use in discrete units extraction + agent.current_source_filename = source_filename + # Resample to expected sample rate if needed if sr != ORG_SAMPLE_RATE: print(f"Resampling from {sr}Hz to {ORG_SAMPLE_RATE}Hz") diff --git a/demo/extract_intermediates.py b/demo/extract_intermediates.py new file mode 100644 index 0000000..03cc017 --- /dev/null +++ b/demo/extract_intermediates.py @@ -0,0 +1,173 @@ +""" +Extract Intermediate Outputs from StreamSpeech +================================================ + +This script modifies the demo app to save: +1. Source Spanish audio input (WAV file) +2. Discrete speech units output (PT file) + +Add this code to your demo/app.py to extract intermediates. +""" + +import torch +import soundfile +import os +from datetime import datetime + +# Directory to save extracted files - use absolute path +# This file is in demo/, so go up to project root, then into demo/extracted_intermediates +_current_dir = os.path.dirname(os.path.abspath(__file__)) +EXTRACT_DIR = os.path.join(_current_dir, "extracted_intermediates") +os.makedirs(EXTRACT_DIR, exist_ok=True) + +def save_source_audio(samples, sample_rate, filename_prefix=None, target_sample_rate=16000): + """ + Save the source audio input as WAV file at 16kHz. + + Call this after loading/processing the source audio. 
+ Location: In demo/app.py, run() function, after samples are loaded + + Args: + samples: numpy array of audio samples + sample_rate: current sample rate (e.g., 48000) + filename_prefix: optional prefix for filename + target_sample_rate: target sample rate (default 16000) + """ + if filename_prefix is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename_prefix = f"source_{timestamp}" + + output_path = os.path.join(EXTRACT_DIR, f"{filename_prefix}_source_audio_16k.wav") + + # Resample to 16kHz if needed + if sample_rate != target_sample_rate: + import torch + samples_tensor = torch.tensor(samples).unsqueeze(0).unsqueeze(0) # [1, 1, length] + target_length = int(len(samples) * target_sample_rate / sample_rate) + samples_tensor = torch.nn.functional.interpolate( + samples_tensor, size=target_length, mode='linear', align_corners=False + ) + samples = samples_tensor.squeeze().numpy() + print(f" - Resampled from {sample_rate}Hz to {target_sample_rate}Hz") + + # Save as WAV file at 16kHz + soundfile.write(output_path, samples, target_sample_rate) + + print(f"✓ Saved source audio: {output_path}") + print(f" - Sample rate: {target_sample_rate} Hz (16kHz)") + print(f" - Duration: {len(samples)/target_sample_rate:.2f} seconds") + print(f" - Shape: {samples.shape}") + + return output_path + + +def save_discrete_units(units_tensor, filename_prefix=None, save_as_text=True): + """ + Save discrete speech units as PT file (and optionally as text). + + Call this when units are generated before being fed to vocoder. + Location: In agent/speech_to_speech.streamspeech.agent.py, + in policy() method, after line 713-724 where units are generated + + Args: + units_tensor: torch tensor of discrete units (can be list or tensor) + filename_prefix: optional prefix for filename + save_as_text: also save units as readable text file + """ + if filename_prefix is None: + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + filename_prefix = f"units_{timestamp}" + + # Convert to tensor if it's a list + if isinstance(units_tensor, list): + units_tensor = torch.tensor(units_tensor, dtype=torch.long) + + # Save as PyTorch file + pt_path = os.path.join(EXTRACT_DIR, f"{filename_prefix}_discrete_units.pt") + torch.save(units_tensor, pt_path) + + print(f"✓ Saved discrete units: {pt_path}") + print(f" - Shape: {units_tensor.shape}") + print(f" - Number of units: {units_tensor.numel()}") + print(f" - Unit range: [{units_tensor.min().item()}, {units_tensor.max().item()}]") + + # Also save as text for inspection + if save_as_text: + txt_path = os.path.join(EXTRACT_DIR, f"{filename_prefix}_discrete_units.txt") + units_list = units_tensor.cpu().tolist() if units_tensor.dim() > 0 else [units_tensor.item()] + with open(txt_path, 'w') as f: + # Save as space-separated values + if isinstance(units_list[0], list): + for row in units_list: + f.write(' '.join(map(str, row)) + '\n') + else: + f.write(' '.join(map(str, units_list)) + '\n') + print(f"✓ Saved units as text: {txt_path}") + + return pt_path + + +# Example usage metadata +def save_metadata(source_audio_path, units_path, additional_info=None): + """Save metadata about the extraction""" + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + metadata_path = os.path.join(EXTRACT_DIR, f"metadata_{timestamp}.txt") + + with open(metadata_path, 'w') as f: + f.write("StreamSpeech Intermediate Outputs\n") + f.write("=" * 50 + "\n\n") + f.write(f"Timestamp: {timestamp}\n") + f.write(f"Source Audio: {source_audio_path}\n") + f.write(f"Discrete Units: {units_path}\n") + 
if additional_info: + f.write(f"\nAdditional Info:\n") + for key, value in additional_info.items(): + f.write(f" {key}: {value}\n") + + print(f"✓ Saved metadata: {metadata_path}") + return metadata_path + + +""" +INTEGRATION INSTRUCTIONS +======================== + +1. In demo/app.py, modify the run() function: + + Add at line ~859 (after samples are loaded and resampled): + + ```python + # Import the extraction functions + from extract_intermediates import save_source_audio + + # Save source audio at 16kHz + # Note: Even though samples are at 48kHz here, the function will resample to 16kHz + save_source_audio(samples, ORG_SAMPLE_RATE, filename_prefix=os.path.basename(source).split('.')[0]) + ``` + +2. In agent/speech_to_speech.streamspeech.agent.py, modify the policy() method: + + Add at line ~744 (before units are fed to vocoder): + + ```python + # Import the extraction functions (add at top of file) + from demo.extract_intermediates import save_discrete_units + + # Save discrete units (add right before line 744) + if self.states.source_finished: # Only save final units + save_discrete_units(unit, filename_prefix="output") + + x = { + "code": torch.tensor(unit, dtype=torch.long, device=self.device).view( + 1, -1 + ), + } + ``` + +3. The extracted files will be saved in: demo/extracted_intermediates/ +""" + +if __name__ == "__main__": + print(__doc__) + print("\nExtracted files will be saved to:", os.path.abspath(EXTRACT_DIR)) + From 61394041de1cca5600fcf104913ceb185dbc8fc3 Mon Sep 17 00:00:00 2001 From: ronliwag Date: Fri, 7 Nov 2025 03:53:16 +0800 Subject: [PATCH 5/7] added frontend components for comparison --- demo/templates/index.html | 107 +++++++++++++++++++++++++++++++++----- 1 file changed, 93 insertions(+), 14 deletions(-) diff --git a/demo/templates/index.html b/demo/templates/index.html index 7574bd4..40d5abe 100644 --- a/demo/templates/index.html +++ b/demo/templates/index.html @@ -59,7 +59,14 @@ button:hover:not([disabled]) { background-color: #45a049; } - #waveform, #outputWaveform { + #playModifiedButton { + background-color: #FF9800; + margin-left: 10px; + } + #playModifiedButton:hover:not([disabled]) { + background-color: #F57C00; + } + #waveform, #outputWaveform, #outputWaveformModified { margin: 10px 0; /* Reduced the margin */ border: 1px solid #ddd; border-radius: 5px; @@ -156,17 +163,10 @@
[index.html hunk bodies garbled in extraction — HTML markup lost; recoverable changes follow]
- "ACL 2024" badge, author line ("Authors: Shaolei Zhang, Qingkai Fang, Shoutao Guo, Zhengrui Ma, Min Zhang, Yang Feng*"), tagline ("💡StreamSpeech is an "All in One" seamless model for offline and simultaneous speech recognition, speech translation and speech synthesis under any latency."), and the arXiv / Demo / StreamSpeech Models / Visitors link row
+ "MODIFIED" badge (page title "StreamSpeech: Simultaneous Speech-to-Speech Translation with Multi-task Learning" kept as context)
@@ -179,9 +179,9 @@
  [garbled: one control line under "Streaming Inputs" replaced by two, above the "Streaming Speech Recognition" panel]
@@ -191,15 +191,20 @@
- "Simultaneous Speech-to-Speech Translation" panel
+ "Simultaneous Speech-to-Speech Translation (StreamSpeech)" panel
+ "Simultaneous Speech-to-Speech Translation (Modified Vocoder)" panel, with the new outputWaveformModified container and playModifiedButton styled in the CSS hunk above
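
The comparison UI assumes the backend can serve a second synthesized waveform alongside the original. A minimal Flask sketch of one way to wire that up — the `/output_modified` route and `output_modified.wav` filename are illustrative assumptions, not names taken from this patch:

```python
from flask import Flask, send_file, abort
import os

app = Flask(__name__)

# Hypothetical companion to the demo's existing /output route: serves the
# modified-vocoder result so the "Play Modified" button has an audio source.
@app.route('/output_modified')
def output_modified():
    wav_path = os.path.join(app.root_path, 'output', 'output_modified.wav')
    if not os.path.exists(wav_path):
        abort(404)  # nothing synthesized yet for this session
    return send_file(wav_path, mimetype='audio/wav')
```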