Commit 650ef69

Whisper HF (#241)
1 parent 67dc024 commit 650ef69

File tree

7 files changed: +97 -31 lines changed

.github/workflows/test.yml

Lines changed: 16 additions & 16 deletions
@@ -81,24 +81,24 @@ jobs:
         tar -xf aio_objdet_dataset.tar.gz > /dev/null

         wget $S3_URL_RESNET_50_V15_TF_FP32 > /dev/null 2>&1
-        python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60

-        python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

         wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt > /dev/null 2>&1
-        python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60

         python3 speech_recognition/whisper/run.py -m small.en

         wget $S3_URL_SSD_INCEPTION_V2_TF_FP32 > /dev/null 2>&1
-        python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60

         wget https://zenodo.org/records/4735647/files/resnet50_v1.onnx > /dev/null 2>&1
-        python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort

         wget https://s3.amazonaws.com/onnx-model-zoo/vgg/vgg16/vgg16.tar.gz > /dev/null 2>&1
         tar -xf vgg16.tar.gz > /dev/null
-        python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort

   test_arm64:
     runs-on: self-hosted
@@ -145,24 +145,24 @@ jobs:
         tar -xf aio_objdet_dataset.tar.gz > /dev/null

         wget $S3_URL_RESNET_50_V15_TF_FP32 > /dev/null 2>&1
-        python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v15/run.py -m resnet_50_v15_tf_fp32.pb -p fp32 -f tf --timeout=60

-        python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

         wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8n.pt > /dev/null 2>&1
-        python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8n.pt -f pytorch -p fp32 --timeout=60

         python3 speech_recognition/whisper/run.py -m small.en

         wget $S3_URL_SSD_INCEPTION_V2_TF_FP32 > /dev/null 2>&1
-        python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/object_detection/ssd_inception_v2/run.py -m ssd_inception_v2_tf_fp32.pb -p fp32 --timeout=60

         wget https://zenodo.org/records/4735647/files/resnet50_v1.onnx > /dev/null 2>&1
-        python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort

         wget https://s3.amazonaws.com/onnx-model-zoo/vgg/vgg16/vgg16.tar.gz > /dev/null 2>&1
         tar -xf vgg16.tar.gz > /dev/null
-        python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort

   test_pytorch_arm64_sh:
     runs-on: self-hosted
@@ -260,10 +260,10 @@ jobs:

         AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 speech_recognition/whisper/run.py -m tiny.en

-        python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60
+        IGNORE_DATASET_LIMITS=1 python3 computer_vision/classification/mobilenet_v2/run.py -p fp32 -f pytorch --timeout=60

         wget https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8l.pt > /dev/null 2>&1
-        AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch
+        IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/object_detection/yolo_v8/run.py -m yolov8l.pt -p fp32 -f pytorch

         wget -O bert_large_mlperf.pt https://zenodo.org/records/3733896/files/model.pytorch?download=1 > /dev/null 2>&1
         AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 natural_language_processing/extractive_question_answering/bert_large/run_mlperf.py -m bert_large_mlperf.pt -p fp32 -f pytorch
@@ -346,8 +346,8 @@ jobs:
         tar -xvf aio_objdet_dataset.tar.gz > /dev/null

         wget https://zenodo.org/records/4735647/files/resnet50_v1.onnx > /dev/null 2>&1
-        AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort
+        IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/resnet_50_v1/run.py -m resnet50_v1.onnx -p fp32 -f ort

         wget https://s3.amazonaws.com/onnx-model-zoo/vgg/vgg16/vgg16.tar.gz > /dev/null 2>&1
         tar -xf vgg16.tar.gz > /dev/null
-        AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
+        IGNORE_DATASET_LIMITS=1 AIO_IMPLICIT_FP16_TRANSFORM_FILTER=".*" python3 computer_vision/classification/vgg_16/run.py -m vgg16/vgg16.onnx -p fp32 -f ort
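Note: the only functional change in this workflow is the IGNORE_DATASET_LIMITS=1 prefix on the benchmark invocations. The variable is consumed by the dataset helpers (see utils/speech_recognition/libri_speech_v2.py below): when a run exhausts the dataset, the helper wraps back to the first sample instead of failing, so timeout-bounded CI runs are no longer capped by dataset size. A minimal sketch of the pattern, using an illustrative DatasetStub class rather than the repo's actual code:

    import os

    class DatasetStub:
        """Illustrative stand-in for the repo's dataset helpers."""
        def __init__(self, samples):
            self._samples = samples
            self._idx = 0

        def get_input_array(self):
            try:
                sample = self._samples[self._idx]
                self._idx += 1
                return sample
            except IndexError:
                if os.environ.get("IGNORE_DATASET_LIMITS") == "1":
                    self._idx = 0  # wrap around and keep feeding inputs
                    return self.get_input_array()
                raise  # default: the benchmark stops at the dataset boundary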

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -1,3 +1,5 @@
+utils/torch_jit_cache
+=*
 .DS_Store
 .idea/
 .setup_completed
speech_recognition/whisper/run_hf.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (c) 2024, Ampere Computing LLC
+
+TORCH_JIT_TRACE = False  # otherwise, run torch.compile()
+
+
+def run_pytorch_fp32(model_name, batch_size, num_runs, timeout, **kwargs):
+    from utils.benchmark import run_model
+    from utils.misc import print_warning_message
+    from utils.pytorch import PyTorchRunnerV2, apply_compile, apply_jit_trace_module
+    from utils.speech_recognition.libri_speech_v2 import LibriSpeech
+    from transformers import WhisperProcessor, WhisperForConditionalGeneration
+    processor = WhisperProcessor.from_pretrained(model_name)
+    model = WhisperForConditionalGeneration.from_pretrained(model_name, torchscript=TORCH_JIT_TRACE)
+    model.eval()
+    librispeech = LibriSpeech()
+    if TORCH_JIT_TRACE:
+        waveform = [librispeech.get_input_array() for _ in range(batch_size)]
+        input_features = processor(
+            waveform, sampling_rate=LibriSpeech.sampling_rate, return_tensors="pt").input_features
+        model = apply_jit_trace_module(model, {"generate": input_features})
+        librispeech = LibriSpeech()  # reset
+        model = model.generate
+    else:
+        model = apply_compile(model.generate)
+
+    def single_pass_pytorch(_runner, _librispeech):
+        waveform = [_librispeech.get_input_array() for _ in range(batch_size)]
+        input_features = processor(
+            waveform, sampling_rate=LibriSpeech.sampling_rate, return_tensors="pt").input_features
+        predicted_ids = _runner.run(sum([x.shape[0] for x in waveform]), input_features)
+        decoded_output = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        for i in range(batch_size):
+            _librispeech.submit_transcription(decoded_output[i].lstrip().replace(",", "").replace(".", "").upper())
+
+    runner = PyTorchRunnerV2(model, throughput_only=True)
+    print_warning_message("Sampling rate Whisper operates at is 16,000 Hz, therefore throughput values below can be "
+                          "divided by 16,000 to derive 'seconds of processed audio per second'")
+    return run_model(single_pass_pytorch, runner, librispeech, batch_size, num_runs, timeout)
+
+
+if __name__ == "__main__":
+    from utils.helpers import DefaultArgParser
+    whisper_variants = ["openai/whisper-tiny", "openai/whisper-base", "openai/whisper-small", "openai/whisper-medium",
+                        "openai/whisper-large", "openai/whisper-large-v2", "openai/whisper-large-v3"]
+    whisper_variants = whisper_variants + [f"{name}.en" for name in whisper_variants[:4]]
+    parser = DefaultArgParser(["pytorch"])
+    parser.require_model_name(whisper_variants)
+    parser.ask_for_batch_size(1)
+    run_pytorch_fp32(**vars(parser.parse()))
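The new runner can also be driven programmatically, mirroring how the test below calls it. A minimal sketch — the argument values are simply the ones the CI test uses, not requirements:

    # Hypothetical direct invocation of the new HF Whisper runner.
    from speech_recognition.whisper.run_hf import run_pytorch_fp32

    results = run_pytorch_fp32(
        model_name="openai/whisper-tiny.en",  # any variant accepted by the parser
        batch_size=4,
        num_runs=18,
        timeout=None,
    )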

tests/test_pytorch_models.py

Lines changed: 16 additions & 5 deletions
@@ -93,24 +93,35 @@ def wrapper(**kwargs):

 class Whisper(unittest.TestCase):
     def setUp(self):
-        from speech_recognition.whisper.run import run_pytorch_fp32
+        def wrapper_openai(**kwargs):
+            from speech_recognition.whisper.run import run_pytorch_fp32
+            kwargs["q"].put(run_pytorch_fp32(**kwargs)[0])

-        def wrapper(**kwargs):
+        def wrapper_hf(**kwargs):
+            from speech_recognition.whisper.run_hf import run_pytorch_fp32
             kwargs["q"].put(run_pytorch_fp32(**kwargs)[0])

-        self.wrapper = wrapper
+        self.wrapper_openai = wrapper_openai
+        self.wrapper_hf = wrapper_hf

     @unittest.skipIf(psutil.virtual_memory().available / 1024 ** 3 < 50, "too little memory")
     def test_whisper_tiny_en(self):
         wer_ref = 0.155
-        acc = run_process(self.wrapper, {"model_name": "tiny.en", "num_runs": 30, "timeout": None})
+        acc = run_process(self.wrapper_openai, {"model_name": "tiny.en", "num_runs": 30, "timeout": None})
+        self.assertTrue(wer_ref / acc["wer_score"] > 0.95)
+
+    @unittest.skipIf(psutil.virtual_memory().available / 1024 ** 3 < 50, "too little memory")
+    def test_whisper_hf_tiny_en(self):
+        wer_ref = 0.111
+        acc = run_process(self.wrapper_hf, {"model_name": "openai/whisper-tiny.en", "num_runs": 18,
+                                            "batch_size": 4, "timeout": None})
         self.assertTrue(wer_ref / acc["wer_score"] > 0.95)

     @unittest.skipIf(psutil.virtual_memory().available / 1024 ** 3 < 100, "too little memory")
     @unittest.skipUnless('_aio_profiler_print' in dir(torch._C), "too slow to run with native")
     def test_whisper_large(self):
         wer_ref = 0.124
-        acc = run_process(self.wrapper, {"model_name": "large", "num_runs": 30, "timeout": None})
+        acc = run_process(self.wrapper_openai, {"model_name": "large", "num_runs": 30, "timeout": None})
         self.assertTrue(wer_ref / acc["wer_score"] > 0.95)

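The accuracy gate wer_ref / acc["wer_score"] > 0.95 tolerates roughly a 5% relative regression: the measured WER must stay below wer_ref / 0.95. A worked check for the new HF test:

    wer_ref = 0.111               # reference WER for openai/whisper-tiny.en above
    max_allowed = wer_ref / 0.95  # = 0.1168...; a measured WER above this fails
    print(f"test passes while wer_score < {max_allowed:.4f}")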
utils/benchmark.py

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@

 warnings.filterwarnings("ignore")

-WARM_UP_RUNS = 3
+WARM_UP_RUNS = 9
 intra_op_parallelism_threads = None
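Tripling the warm-up count plausibly pairs with the torch.compile() path introduced in this commit: the first calls into a compiled generate() trigger compilation (and, with varying sequence lengths, recompilation), and timing them would skew throughput. A sketch of the idea, with hypothetical names (model, example_input), not the repo's actual harness:

    import time

    def benchmark(model, example_input, num_runs, warm_up_runs=9):
        for _ in range(warm_up_runs):
            model(example_input)   # compilation/autotuning happens here, untimed
        start = time.time()
        for _ in range(num_runs):
            model(example_input)   # only steady-state calls are timed
        return num_runs / (time.time() - start)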

utils/pytorch.py

Lines changed: 7 additions & 5 deletions
@@ -248,7 +248,12 @@ def apply_jit_trace(model, example_inputs):
     return load_from_cache_or_apply(model, lambda: torch.jit.trace(model, example_inputs))


+def apply_jit_trace_module(model, example_inputs):
+    return load_from_cache_or_apply(model, lambda: torch.jit.trace_module(model, example_inputs))
+
+
 def apply_compile(model):
+    torch._dynamo.config.cache_size_limit = 512
     if os.environ.get("TORCH_COMPILE") == "0":
         return model
     if version.parse(pkg_resources.get_distribution("torch").version) >= version.parse("1.14"):
@@ -264,11 +269,8 @@ def apply_compile(model):
264269
options = {}
265270
utils.print_warning_message(
266271
f"AIO unavailable or disabled, applying torch.compile() with \"{backend}\" backend.")
267-
return torch.compile(
268-
model,
269-
backend=backend,
270-
options=options
271-
)
272+
model = torch.compile(model, backend=backend, options=options)
273+
return model
272274
else:
273275
utils.print_goodbye_message_and_die(
274276
f"Installed PyTorch version is {pkg_resources.get_distribution('torch').version}. "

utils/speech_recognition/libri_speech_v2.py

Lines changed: 5 additions & 4 deletions
@@ -18,7 +18,9 @@ def __init__(self):

     def get_input_array(self):
         try:
-            return self._librispeech["audio"][self._idx]["array"]
+            array = self._librispeech["audio"][self._idx]["array"]
+            self._idx += 1
+            return array
         except IndexError:
             if os.environ.get("IGNORE_DATASET_LIMITS") == "1":
                 if self.reset():
@@ -28,9 +30,7 @@ def get_input_array(self):
     def submit_transcription(self, text: str):
         if self.do_skip():
             return
-
         self._transcriptions.append(text)
-        self._idx += 1

     def reset(self):
         self._idx = 0
@@ -41,10 +41,11 @@ def _summarize_accuracy(self):
         if self.do_skip():
             return

-        assert len(self._transcriptions) == len(self._librispeech["text"][:self._idx])
+        assert len(self._transcriptions) == self._idx
         wer_score = load("wer").compute(
             references=self._librispeech["text"][:self._idx], predictions=self._transcriptions
         )
+        assert wer_score <= 1.0
         # print("\n WER score = {:.3f}".format(wer_score))
         # print(f"\n Accuracy figures above calculated on the basis of {self._idx} sample(s).")
         return {"wer_score": wer_score}
