
Commit ae8c828

add whisper translate (#235)
1 parent e1581d0 commit ae8c828

6 files changed, 172 insertions(+), 0 deletions(-)


.github/workflows/test.yml

Lines changed: 3 additions & 0 deletions
```diff
@@ -119,6 +119,7 @@ jobs:
       S3_URL_IMAGENET_DATASET_LABELS: ${{ secrets.S3_URL_IMAGENET_DATASET_LABELS }}
       S3_URL_COCO_DATASET: ${{ secrets.S3_URL_COCO_DATASET }}
       S3_URL_COCO_DATASET_ANNOTATIONS: ${{ secrets.S3_URL_COCO_DATASET_ANNOTATIONS }}
+      S3_URL_COVOST2_DATASET: ${{ secrets.S3_URL_COVOST2_DATASET }}
       HF_HUB_TOKEN: ${{ secrets.HF_HUB_TOKEN }}
     steps:
       - name: Install git
@@ -179,6 +180,7 @@ jobs:
       S3_URL_IMAGENET_DATASET_LABELS: ${{ secrets.S3_URL_IMAGENET_DATASET_LABELS }}
       S3_URL_COCO_DATASET: ${{ secrets.S3_URL_COCO_DATASET }}
       S3_URL_COCO_DATASET_ANNOTATIONS: ${{ secrets.S3_URL_COCO_DATASET_ANNOTATIONS }}
+      S3_URL_COVOST2_DATASET: ${{ secrets.S3_URL_COVOST2_DATASET }}
       HF_HUB_TOKEN: ${{ secrets.HF_HUB_TOKEN }}
     steps:
       - name: Install Ampere optimized PyTorch
@@ -221,6 +223,7 @@ jobs:
       S3_URL_IMAGENET_DATASET_LABELS: ${{ secrets.S3_URL_IMAGENET_DATASET_LABELS }}
       S3_URL_COCO_DATASET: ${{ secrets.S3_URL_COCO_DATASET }}
       S3_URL_COCO_DATASET_ANNOTATIONS: ${{ secrets.S3_URL_COCO_DATASET_ANNOTATIONS }}
+      S3_URL_COVOST2_DATASET: ${{ secrets.S3_URL_COVOST2_DATASET }}
       HF_HUB_TOKEN: ${{ secrets.HF_HUB_TOKEN }}
     steps:
       - name: Git checkout & pull submodules
```

setup_deb.sh

Lines changed: 6 additions & 0 deletions
```diff
@@ -174,6 +174,12 @@ pip3 install --no-deps --upgrade \
     streamlit-drawable-canvas==0.8.0 \
     safetensors>=0.3.1
 
+apt install -y autoconf autogen automake build-essential libasound2-dev \
+    libflac-dev libogg-dev libtool libvorbis-dev libopus-dev libmp3lame-dev \
+    libmpg123-dev pkg-config
+apt remove -y libsndfile1
+git clone https://github.com/libsndfile/libsndfile.git && cd libsndfile/ && autoreconf -vif && ./configure --enable-werror && make -j && make install && ldconfig && cd .. && rm -rf libsndfile
+
 if [ "$(PYTHONPATH=$SCRIPT_DIR python3 -c 'from cpuinfo import get_cpu_info; from benchmark import which_ampere_cpu; cpu = which_ampere_cpu(get_cpu_info()["flags"], 1); print("AmpereOne" in cpu)')" == "True" ]; then
     # Only on AmpereOne family
     pip3 install --upgrade --no-deps \
```
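The setup change above rebuilds libsndfile from source with the ALSA, FLAC, Ogg/Vorbis, Opus, LAME and mpg123 development packages, presumably so that the library can decode the MP3 clips shipped with Common Voice, which the distro's libsndfile1 package may not support. A minimal sanity check from Python, assuming the python-soundfile package is installed (it is not part of this commit):

```python
# hypothetical check, assuming `pip3 install soundfile` has been run
import soundfile as sf

print(sf.__libsndfile_version__)  # expect 1.1.0 or newer after the source build
print(sf.available_formats())     # the listed formats should now include an MPEG/MP3 entry
```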
speech_recognition/whisper_translate/README.md

Lines changed: 43 additions & 0 deletions
# Whisper translate

This folder contains the script to run Whisper on the speech-to-text translation task in the PyTorch framework.

The original paper on the architecture is available here: https://arxiv.org/pdf/2212.04356.pdf

### Dataset

Download the Common Voice Corpus for the Japanese language here: https://commonvoice.mozilla.org/en/datasets

Extract the dataset:
```
tar -xvf ja.tar
```

### Running instructions

Before running any code, you should first export the PYTHONPATH variable with the path pointing to the Ampere Model Library directory, as well as AIO_NUM_THREADS specifying the number of threads to be used.

```
export PYTHONPATH=/path/to/ampere_model_library
export AIO_NUM_THREADS=1
```

For the best experience, we also recommend setting the environment variable specified below.

```
export COMMONVOICE_PATH=/path/to/dataset
```

Now you are able to run the run.py script.

To get detailed information on the script's recognized arguments, run it with the -h flag.

For the PyTorch implementation, the size of the model has to be specified with the "-m" flag.

Example command for PyTorch:

```
python3 run.py -m medium --timeout 600
```
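Outside of run.py, the same speech-to-text translation task can be exercised directly through the openai-whisper API. A minimal sketch, assuming the whisper package is installed and `clip.mp3` is a hypothetical Japanese Common Voice recording:

```python
import whisper  # assumes the openai-whisper package is available

model = whisper.load_model("medium")
# task="translate" makes Whisper emit English text for the Japanese audio
result = model.transcribe("clip.mp3", task="translate", language="ja", verbose=None)
print(result["text"])
```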
speech_recognition/whisper_translate/run.py

Lines changed: 44 additions & 0 deletions
```python
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
import os
import sys
import torch


def run_pytorch_fp32(model_name, num_runs, timeout, dataset_path, **kwargs):
    batch_size = 1
    sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "whisper"))
    from utils.benchmark import run_model
    from utils.misc import print_warning_message
    from utils.pytorch import PyTorchRunnerV2
    from utils.speech_recognition.covost2 import Covost2
    from speech_recognition.whisper.whisper.whisper import load_model
    from speech_recognition.whisper.whisper.whisper.transcribe import transcribe
    model = load_model(model_name)
    model.eval()

    def single_pass_pytorch(_runner, _covost2):
        # translate a single CoVoST2 clip and store the English text
        # with leading whitespace and periods removed
        array = _covost2.get_input_array()
        audio = torch.from_numpy(array.astype("float32"))
        _covost2.submit_translation(
            _runner.run(batch_size * array.shape[0], audio)["text"].lstrip().replace(".", "")
        )

    def translate_wrapper(audio):
        return transcribe(model, audio, verbose=None, task="translate", language="ja")

    runner = PyTorchRunnerV2(translate_wrapper, throughput_only=True)
    covost2 = Covost2(dataset_path)
    print_warning_message("The sampling rate Whisper operates at is 16,000 Hz, therefore throughput values below "
                          "can be divided by 16,000 to derive 'seconds of processed audio per second'")
    return run_model(single_pass_pytorch, runner, covost2, batch_size, num_runs, timeout)


if __name__ == "__main__":
    from utils.helpers import DefaultArgParser
    whisper_variants = ["tiny", "base", "small", "medium", "large"]
    parser = DefaultArgParser(["pytorch"])
    parser.require_model_name(whisper_variants)
    parser.add_argument("--dataset_path", type=str, required=True,
                        help="path to the CommonVoice Japanese dataset directory")
    run_pytorch_fp32(**vars(parser.parse()))
```
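The warning printed by run_pytorch_fp32 implies a simple conversion from the reported throughput (audio samples per second) to a real-time factor. A small illustration with a made-up throughput value:

```python
SAMPLING_RATE = 16_000          # Hz, the rate Whisper operates at

reported_throughput = 48_000.0  # hypothetical samples-per-second figure from the benchmark
real_time_factor = reported_throughput / SAMPLING_RATE
print(real_time_factor)         # 3.0 seconds of processed audio per second
```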

tests/test_pytorch_models.py

Lines changed: 31 additions & 0 deletions
```diff
@@ -114,6 +114,37 @@ def test_whisper_large(self):
         self.assertTrue(wer_ref / acc["wer_score"] > 0.95)
 
 
+class WhisperTranslate(unittest.TestCase):
+    def setUp(self):
+        from speech_recognition.whisper_translate.run import run_pytorch_fp32
+
+        self.dataset_path = pathlib.Path(get_downloads_path(), "covost2_ja")
+        if not self.dataset_path.exists():
+            url = os.environ.get("S3_URL_COVOST2_DATASET")
+            assert url is not None
+            subprocess.run(f"mkdir {self.dataset_path}".split(),
+                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            subprocess.run(f"wget -P /tmp {url}".split(),
+                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            subprocess.run(f"tar -xf /tmp/covost2_ja.tar -C {self.dataset_path}".split(),
+                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+            subprocess.run("rm /tmp/covost2_ja.tar".split(),
+                           check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+
+        def wrapper(**kwargs):
+            kwargs["q"].put(run_pytorch_fp32(**kwargs)[0])
+
+        self.wrapper = wrapper
+
+    @unittest.skipIf(psutil.virtual_memory().available / 1024 ** 3 < 100, "too little memory")
+    @unittest.skipUnless('_aio_profiler_print' in dir(torch._C), "too slow to run with native")
+    def test_whisper_translate_medium(self):
+        bleu_ref = 0.475
+        acc = run_process(self.wrapper, {"model_name": "medium", "num_runs": 30, "timeout": None,
+                                         "dataset_path": self.dataset_path})
+        self.assertTrue(acc["bleu_score"] / bleu_ref > 0.95)
+
+
 class DLRM(unittest.TestCase):
     def setUp(self):
         self.dataset_path = pathlib.Path(get_downloads_path(), "criteo_preprocessed")
```
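The wrapper defined in setUp follows a queue-based pattern: the benchmark runs in a child process (via the repo's run_process helper) and hands its first return value back through a multiprocessing queue. A self-contained sketch of that pattern, with a stand-in function instead of run_pytorch_fp32:

```python
# illustrative only; the repo's run_process helper is assumed to work along these lines
import multiprocessing as mp


def wrapper(**kwargs):
    # stand-in for run_pytorch_fp32(**kwargs)[0]
    kwargs["q"].put(sum(kwargs["values"]))


if __name__ == "__main__":
    q = mp.Queue()
    p = mp.Process(target=wrapper, kwargs={"q": q, "values": [1, 2, 3]})
    p.start()
    p.join()
    print(q.get())  # 6
```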
utils/speech_recognition/covost2.py

Lines changed: 45 additions & 0 deletions
```python
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) 2024, Ampere Computing LLC
from evaluate import load
from datasets import load_dataset
import utils.misc as utils
from utils.misc import OutOfInstances
from utils.helpers import Dataset


class Covost2(Dataset):
    """CoVoST 2 Japanese -> English speech translation dataset (validation split)."""
    sampling_rate = 16000

    def __init__(self, dataset_path=None):
        if dataset_path is None:
            env_var = "COMMONVOICE_PATH"
            dataset_path = utils.get_env_variable(
                env_var, f"Path to the CommonVoice directory has not been specified with the {env_var} variable")

        self._covost2 = load_dataset("covost2", "ja_en", split="validation", data_dir=dataset_path)
        self.available_instances = len(self._covost2["audio"])
        self._idx = 0
        self._translations = []

    def get_input_array(self):
        try:
            return self._covost2["audio"][self._idx]["array"]
        except IndexError:
            raise OutOfInstances

    def submit_translation(self, text: str):
        self._translations.append(text)
        self._idx += 1

    def reset(self):
        self._idx = 0
        self._translations = []
        return True

    def _summarize_accuracy(self):
        # compare the submitted translations against the reference English translations seen so far
        assert len(self._translations) == len(self._covost2["translation"][:self._idx])
        bleu_score = load("bleu").compute(
            references=self._covost2["translation"][:self._idx], predictions=self._translations
        )
        return {"bleu_score": bleu_score["bleu"]}
```
