Commit 35d8759

alancucki authored and nv-kkudrynski committed
[wav2vec2/PyT] Initial release
1 parent c2bb3fe commit 35d8759

85 files changed (+12,548 additions, 0 deletions)

Large commits have some content hidden by default; only a subset of the changed files is rendered below.

PyTorch/SpeechRecognition/wav2vec2/.gitignore

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@

datasets/
results/
models/
pretrained_models/
tb_*/
*.pt

PyTorch/SpeechRecognition/wav2vec2/Dockerfile

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@

# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.11-py3
FROM ${FROM_IMAGE_NAME}

ENV PYTHONPATH /workspace/wav2vec2
WORKDIR /workspace/wav2vec2

COPY requirements.txt .
RUN pip install -r requirements.txt

COPY . .
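
For context (an assumption, not repo-provided instructions): the Dockerfile copies requirements.txt and installs dependencies before COPY . ., so Docker's layer cache skips the pip install step when only source files change. A typical build-and-run sequence would be docker build -t wav2vec2 . followed by docker run --gpus all -it --rm wav2vec2.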

PyTorch/SpeechRecognition/wav2vec2/README.md

Lines changed: 587 additions & 0 deletions
Large diffs are not rendered by default.

PyTorch/SpeechRecognition/wav2vec2/common/__init__.py

Whitespace-only changes.

PyTorch/SpeechRecognition/wav2vec2/common/dataset.py

Lines changed: 156 additions & 0 deletions
@@ -0,0 +1,156 @@

# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy

import numpy as np
from torch.utils.data import DataLoader

from common.fairseq.data import data_utils
from common.helpers import print_once
from common.sampler import DistributedIndicesSampler


def adjust_max_tokens(train_dataset, world_size, args):

    def get_steps_per_epoch(world_size, max_tokens, update_freq):
        train_loader, sampler = get_batch_iterator(
            train_dataset,
            True,
            max_tokens=max_tokens,
            max_sentences=args.batch_size,
            max_positions=(max_tokens, max_tokens),
            ignore_invalid_inputs=True,
            required_batch_size_multiple=args.required_batch_size_multiple,
            seed=args.seed,
            num_shards=world_size,
            shard_id=0,
            num_workers=args.num_workers)

        steps_per_epoch = len(train_loader) // update_freq
        return steps_per_epoch

    steps_ref = get_steps_per_epoch(args.ref_world_size, args.ref_max_tokens, 1)

    min_ = args.ref_max_tokens // 20
    max_ = args.ref_max_tokens * 20

    prev_max_tokens = 0
    align_to = 1000
    while min_ < max_:
        max_tokens = (max_ + min_) // 2 // align_to * align_to  # try to round
        if max_tokens == prev_max_tokens:
            break
        prev_max_tokens = max_tokens
        steps = get_steps_per_epoch(world_size, max_tokens, args.update_freq)
        print_once(f"max_tokens={max_tokens} yields {steps} steps "
                   f"(adjusting for {steps_ref}).")
        if steps == steps_ref:
            break
        elif steps > steps_ref:
            min_ = max_tokens
        else:
            max_ = max_tokens

    args.max_tokens = max_tokens
    args.max_tokens_valid = max_tokens

def filter_indices_by_size(
    indices, dataset, max_positions=None, ignore_invalid_inputs=False
):
    """
    Filter examples that are too large

    Args:
        indices (np.array): original array of sample indices
        dataset (~fairseq.data.FairseqDataset): dataset to batch
        max_positions (optional): max sentence length supported by the
            model (default: None).
        ignore_invalid_inputs (bool, optional): don't raise Exception for
            sentences that are too long (default: False).
    Returns:
        np.array: array of filtered sample indices
    """
    indices, ignored = dataset.filter_indices_by_size(indices, max_positions)
    # TODO: consider removing this function. If `len(ignored) > 0`,
    # an error is raised in fairseq dataset code, both in sup and unsup case
    if len(ignored) > 0:
        if not ignore_invalid_inputs:
            raise Exception(
                (
                    "Size of sample #{} is invalid (={}) since max_positions={}, "
                    "skip this example with --skip-invalid-size-inputs-valid-test"
                ).format(ignored[0], dataset.size(ignored[0]), max_positions)
            )
        print(
            (
                "WARNING: {:,} samples have invalid sizes and will be skipped, "
                "max_positions={}, first few sample ids={}"
            ).format(len(ignored), max_positions, ignored[:10])
        )
    return indices

def get_batch_iterator(
    dataset,
    training,
    max_tokens=None,
    max_sentences=None,
    max_positions=None,
    ignore_invalid_inputs=False,
    required_batch_size_multiple=1,
    seed=1,
    num_shards=1,
    shard_id=0,
    num_workers=0,
    num_concat_batches=1,
):
    # get indices ordered by example size
    with data_utils.numpy_seed(seed):
        indices = dataset.ordered_indices()

    # filter examples that are too large
    if max_positions is not None:
        indices = filter_indices_by_size(
            indices, dataset, max_positions, ignore_invalid_inputs)

    # create mini-batches with given size constraints
    batch_inds, non_grouped_batch_inds = dataset.batch_by_size(
        indices,
        max_tokens=max_tokens,
        max_sentences=max_sentences,
        required_batch_size_multiple=required_batch_size_multiple,
        num_concat_batches=num_concat_batches,
    )

    batch_ids = copy.deepcopy(non_grouped_batch_inds)
    [bi.fill(i) for i, bi in enumerate(batch_ids)]
    inds_ids = zip(np.concatenate(batch_inds), np.concatenate(batch_ids))
    dataset.batch_ids = {idx: batch_idx for idx, batch_idx in inds_ids}

    # Batches are already specified, now we just need to shuffle them
    batch_ind_sampler = DistributedIndicesSampler(batch_inds, shuffle=training,
                                                  num_replicas=num_shards,
                                                  rank=shard_id, seed=seed,
                                                  drop_last=training,
                                                  fillvalue=[])
    loader = DataLoader(
        dataset=dataset,
        collate_fn=dataset.collater,
        batch_sampler=batch_ind_sampler,
        num_workers=num_workers,
        pin_memory=True,
        persistent_workers=num_workers > 0,
    )
    return loader, batch_ind_sampler
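
An editor's sketch, not part of the commit: adjust_max_tokens above binary-searches for the max_tokens value that reproduces the reference configuration's steps per epoch on a different GPU count, rounding each candidate down to a multiple of align_to=1000 (e.g. with ref_max_tokens=1_400_000, min_ and max_ start at 70_000 and 28_000_000, so the first probe is (28_000_000 + 70_000) // 2 // 1000 * 1000 = 14_035_000). The loop below shows how get_batch_iterator's return values might be driven; all argument values are illustrative, and train_dataset is assumed to be a fairseq-style dataset exposing ordered_indices(), batch_by_size(), and collater:

loader, sampler = get_batch_iterator(
    train_dataset,
    True,                      # training: shuffle batches, drop the last ragged one
    max_tokens=1_400_000,      # illustrative cap on total waveform samples per batch
    max_sentences=None,        # no hard cap on utterances per batch
    max_positions=(1_400_000, 1_400_000),
    ignore_invalid_inputs=True,
    seed=1,
    num_shards=2,              # e.g. two DDP ranks
    shard_id=0,
    num_workers=4,
)
for epoch in range(3):
    sampler.set_epoch(epoch)   # assumed to reshuffle per epoch, as in torch's DistributedSampler
    for batch in loader:
        pass                   # forward/backward would go here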

PyTorch/SpeechRecognition/wav2vec2/common/fairseq/data/__init__.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""isort:skip_file"""

from .add_target_dataset import AddTargetDataset, BaseWrapperDataset
from .audio.raw_audio_dataset import FileAudioDataset
from .dictionary import Dictionary


__all__ = [
    "AddTargetDataset",
    "Dictionary",
    "FileAudioDataset",
]
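
A minimal import sketch (not from the diff), assuming the package path used by common/dataset.py; the Dictionary calls follow upstream fairseq's API (load(), pad(), eos()) and are an assumption, since only the class name is visible in this excerpt:

from common.fairseq.data import AddTargetDataset, Dictionary, FileAudioDataset

# Assumed fairseq-style API: load a letter dictionary and read the special
# indices that AddTargetDataset takes as its `pad` and `eos` arguments.
target_dict = Dictionary.load("data/dict.ltr.txt")  # path is illustrative
pad_idx, eos_idx = target_dict.pad(), target_dict.eos()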

PyTorch/SpeechRecognition/wav2vec2/common/fairseq/data/add_target_dataset.py

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch

from . import data_utils


class BaseWrapperDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        super().__init__()
        self.dataset = dataset

    def __getitem__(self, index):
        return self.dataset[index]

    def __len__(self):
        return len(self.dataset)

    @property
    def sizes(self):
        return self.dataset.sizes

    def num_tokens(self, index):
        return self.dataset.num_tokens(index)

    def size(self, index):
        return self.dataset.size(index)

    def ordered_indices(self):
        return self.dataset.ordered_indices()

    def batch_by_size(
        self,
        indices,
        max_tokens=None,
        max_sentences=None,
        required_batch_size_multiple=1,
        num_concat_batches=1,
    ):
        return self.dataset.batch_by_size(
            indices,
            max_tokens=max_tokens,
            max_sentences=max_sentences,
            required_batch_size_multiple=required_batch_size_multiple,
            num_concat_batches=num_concat_batches,
        )

    def filter_indices_by_size(self, indices, max_sizes):
        return self.dataset.filter_indices_by_size(indices, max_sizes)


class AddTargetDataset(BaseWrapperDataset):
    def __init__(
        self,
        dataset,
        labels,
        pad,
        eos,
        batch_targets,
        process_label=None,
        add_to_input=False,
    ):
        super().__init__(dataset)
        self.labels = labels
        self.batch_targets = batch_targets
        self.pad = pad
        self.eos = eos
        self.process_label = process_label
        self.add_to_input = add_to_input

    def get_label(self, index):
        return (
            self.labels[index]
            if self.process_label is None
            else self.process_label(self.labels[index])
        )

    def __getitem__(self, index):
        item = self.dataset[index]
        item["label"] = self.get_label(index)
        return item

    def size(self, index):
        sz = self.dataset.size(index)
        own_sz = len(self.get_label(index))
        return (sz, own_sz)

    def collater(self, samples):
        collated = self.dataset.collater(samples)
        if len(collated) == 0:
            return collated
        indices = set(collated["id"].tolist())
        target = [s["label"] for s in samples if s["id"] in indices]

        if self.batch_targets:
            collated["target_lengths"] = torch.LongTensor([len(t) for t in target])
            target = data_utils.collate_tokens(target, pad_idx=self.pad, left_pad=False)
            collated["ntokens"] = collated["target_lengths"].sum().item()
        else:
            collated["ntokens"] = sum([len(t) for t in target])

        collated["target"] = target

        if self.add_to_input:
            eos = target.new_full((target.size(0), 1), self.eos)
            collated["target"] = torch.cat([target, eos], dim=-1).long()
            collated["net_input"]["prev_output_tokens"] = torch.cat(
                [eos, target], dim=-1
            ).long()
            collated["ntokens"] += target.size(0)
        return collated

    def __setattr__(self, attr, val):
        if attr == "batch_ids":
            self.dataset.batch_ids = val
        else:
            super().__setattr__(attr, val)
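
A hedged, self-contained sketch (not part of the commit) of what AddTargetDataset adds: it pairs each audio item with a label sequence and, when batch_targets is set, pads the labels into one tensor via data_utils.collate_tokens. ToyAudioDataset below is hypothetical and only mimics the interface the wrapper relies on (a collater returning an "id" field, plus size() and sizes); running it assumes the repo's common.fairseq.data package is on PYTHONPATH:

import torch

from common.fairseq.data import AddTargetDataset


class ToyAudioDataset(torch.utils.data.Dataset):
    """Hypothetical stand-in for the commit's FileAudioDataset."""

    def __init__(self, waves):
        self.waves = waves
        self.sizes = [len(w) for w in waves]

    def __getitem__(self, index):
        return {"id": index, "source": self.waves[index]}

    def __len__(self):
        return len(self.waves)

    def size(self, index):
        return self.sizes[index]

    def collater(self, samples):
        return {
            "id": torch.LongTensor([s["id"] for s in samples]),
            "net_input": {
                "source": torch.nn.utils.rnn.pad_sequence(
                    [s["source"] for s in samples], batch_first=True),
            },
        }


waves = [torch.randn(16000), torch.randn(12000)]          # two toy utterances
labels = [torch.LongTensor([5, 6, 7]), torch.LongTensor([8, 9])]  # pre-encoded transcripts
ds = AddTargetDataset(ToyAudioDataset(waves), labels, pad=0, eos=2,
                      batch_targets=True)
batch = ds.collater([ds[0], ds[1]])
# target is right-padded to shape (2, 3); ntokens counts unpadded label tokens (5)
print(batch["target"].shape, batch["target_lengths"], batch["ntokens"])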
