
Commit 75052a4

[BART/PyT] Add pretraining feature support

Merge commit 75052a4 (2 parents: 0409902 + 305744e)


49 files changed: +2055 −1832 lines

PyTorch/LanguageModeling/BART/Dockerfile

Lines changed: 12 additions & 42 deletions
@@ -14,55 +14,25 @@
 # limitations under the License.
 # ==============================================================================
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:21.02-py3
-
-######
-# Tokenizers is only available pre-built on x86
-#
-FROM ${FROM_IMAGE_NAME} AS tokenizers_amd64
-WORKDIR /wheelhouse
-RUN pip download tokenizers==0.8.0
-
-FROM quay.io/pypa/manylinux2014_aarch64 as tokenizers_arm64
-ARG PYVER=38
-RUN yum install -y openssl-devel
-RUN curl https://sh.rustup.rs -sSf | sh -s -- --default-toolchain nightly-2020-05-14 -y
-ENV PATH="/root/.cargo/bin:$PATH"
-ENV PYBIN=/opt/python/cp${PYVER}-cp${PYVER}/bin
-ENV PYTHON_SYS_EXECUTABLE="$PYBIN/python"
-RUN git clone -b python-v0.8.0 https://github.com/huggingface/tokenizers.git /opt/tokenizers
-WORKDIR /opt/tokenizers/bindings/python
-RUN "${PYBIN}/pip" install setuptools-rust \
-    && "${PYBIN}/python" setup.py bdist_wheel \
-    && rm -rf build/* \
-    && for whl in dist/*.whl; do \
-        auditwheel repair "$whl" -w dist/; \
-    done \
-    && rm dist/*-linux_* \
-    && mkdir -p /wheelhouse \
-    && mv dist/*.whl /wheelhouse
-
-ARG TARGETARCH
-FROM tokenizers_${TARGETARCH} AS tokenizers
-#
-#####
-
-
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:22.08-py3
 FROM ${FROM_IMAGE_NAME}
-RUN apt-get update && apt-get install -y pbzip2
 
-RUN --mount=from=tokenizers,source=/wheelhouse,target=/tmp/wheelhouse \
-    pip install --no-cache-dir /tmp/wheelhouse/tokenizers*.whl
 
-RUN pip install --no-cache-dir dataclasses gitpython rouge-score pynvml==8.0.4 \
-    git+https://github.com/NVIDIA/dllogger pytorch-lightning==1.1.5 gdown sacrebleu
-
-RUN pip install tqdm --upgrade
+RUN apt-get update
+COPY requirements.txt .
+RUN pip install --upgrade --no-cache-dir pip \
+ && pip install --no-cache-dir -r requirements.txt
 
 WORKDIR /workspace
-RUN git clone https://github.com/artmatsak/cnn-dailymail.git
+RUN git clone https://github.com/abisee/cnn-dailymail.git
 RUN git clone https://github.com/gcunhase/AMICorpusXML.git
 
+# Re-build apex
+RUN git clone https://github.com/nv-joseli/apex.git
+RUN cd apex && \
+    git checkout bf16lamb && \
+    NVCC_APPEND_FLAGS='--threads 1' pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" .
+
 WORKDIR /workspace/bart
 
 COPY . .
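
The rebuilt Apex (nv-joseli/apex, bf16lamb branch, compiled with --cpp_ext and --cuda_ext) presumably supplies the fused LAMB optimizer for the new pretraining path; the branch name suggests BF16 LAMB support. Below is a small sanity-check sketch, not part of the commit, assuming a CUDA GPU is visible inside the container and that the branch keeps the standard apex.optimizers.FusedLAMB interface:

import torch
from apex.optimizers import FusedLAMB  # fused CUDA kernels come from the --cuda_ext build

model = torch.nn.Linear(16, 16).cuda()
optimizer = FusedLAMB(model.parameters(), lr=1e-3)

loss = model(torch.randn(4, 16, device="cuda")).sum()
loss.backward()
optimizer.step()
print("Apex FusedLAMB step OK")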

PyTorch/LanguageModeling/BART/README.md

Lines changed: 223 additions & 140 deletions
Large diffs are not rendered by default.

PyTorch/LanguageModeling/BART/bart/configuration/configuration_bart.py

Lines changed: 1 addition & 6 deletions
@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -88,8 +89,6 @@
         Google "layerdrop arxiv", as its not explainable in one line.
     decoder_layerdrop: (:obj:`float`, optional, defaults to 0.0):
         Google "layerdrop arxiv", as its not explainable in one line.
-    extra_pos_embeddings: (:obj:`int`, optional, defaults to 2):
-        How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart.
     num_labels: (:obj:`int`, optional, defaults to 2):
         for SequenceClassification
     is_encoder_decoder (:obj:`int`, optional, defaults to True):
@@ -109,7 +108,6 @@ class BartConfig(PretrainedConfig):
     def __init__(
         self,
         activation_dropout=0.0,
-        extra_pos_embeddings=2,  # FIXME(@sshleifer): delete?
         activation_function="gelu",
         vocab_size=50265,
         d_model=1024,
@@ -194,9 +192,6 @@ def __init__(
         # Classifier stuff
         self.classif_dropout = classifier_dropout
 
-        # pos embedding offset
-        self.extra_pos_embeddings = self.pad_token_id + 1
-
         self.force_bos_token_to_be_generated = force_bos_token_to_be_generated
         self.attention_bias = attention_bias
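
With the keyword argument and the pad_token_id+1 offset removed, a config built from the updated class simply carries no extra_pos_embeddings attribute. A tiny illustrative check (not part of the diff; the import path follows the repo layout shown above):

from bart.configuration.configuration_bart import BartConfig

config = BartConfig(vocab_size=50265, d_model=1024, activation_function="gelu")
print(hasattr(config, "extra_pos_embeddings"))  # expected: False after this change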

PyTorch/LanguageModeling/BART/bart/configuration/configuration_t5.py

Lines changed: 1 addition & 0 deletions
@@ -1,4 +1,5 @@
 # coding=utf-8
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2010, The T5 Authors and HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");

PyTorch/LanguageModeling/BART/bart/configuration/configuration_utils.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # coding=utf-8
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+class BertSelfAttention(nn.Module):
+
+    def __init__(
+        self,
+        embed_dim,
+        num_heads,
+        dropout=0.0,
+        bias=True,
+        encoder_decoder_attention=False,  # otherwise self_attention
+    ):
+    def __init__(self, config):
+        super(BertSelfAttention, self).__init__()
+        if config.hidden_size % num_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.hidden_size, num_heads))
+        self.num_heads = num_heads
+        self.attention_head_size = int(config.hidden_size / num_heads)
+        self.all_head_size = self.num_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_heads, self.attention_head_size)
+        x = torch.reshape(x, new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def transpose_key_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_heads, self.attention_head_size)
+        x = torch.reshape(x, new_x_shape)
+        return x.permute(0, 2, 3, 1)
+
+    def forward(self, hidden_states, attention_mask):
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_key_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer)
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+        attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = F.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = torch.reshape(context_layer, new_context_layer_shape)
+        return context_layer
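
As committed, the class above stacks two __init__ signatures (an embed_dim/num_heads one and a config-driven one), carries no imports, and references num_heads inside the config-driven body where it is never bound. The sketch below is a minimal, self-contained reading of the same computation, not the committed file: it keeps only the config-driven constructor, folds the key transpose into one helper, and assumes the standard BERT-style config attribute names (hidden_size, num_attention_heads, attention_probs_dropout_prob).

import math

import torch
import torch.nn as nn
import torch.nn.functional as F


class BertSelfAttentionSketch(nn.Module):
    """Config-driven reading of the committed BertSelfAttention (illustrative only)."""

    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0:
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads))
        self.num_heads = config.num_attention_heads
        self.attention_head_size = config.hidden_size // self.num_heads
        self.all_head_size = self.num_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def _split_heads(self, x):
        # (batch, seq, hidden) -> (batch, heads, seq, head_size)
        new_shape = x.size()[:-1] + (self.num_heads, self.attention_head_size)
        return torch.reshape(x, new_shape).permute(0, 2, 1, 3)

    def forward(self, hidden_states, attention_mask):
        query_layer = self._split_heads(self.query(hidden_states))
        key_layer = self._split_heads(self.key(hidden_states))
        value_layer = self._split_heads(self.value(hidden_states))

        # Scaled dot-product attention with an additive, broadcastable mask
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_scores = attention_scores + attention_mask

        attention_probs = self.dropout(F.softmax(attention_scores, dim=-1))

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_shape = context_layer.size()[:-2] + (self.all_head_size,)
        return torch.reshape(context_layer, new_shape)


# Throwaway smoke test with a hypothetical config object (not from the repo):
# cfg = type("Cfg", (), {"hidden_size": 64, "num_attention_heads": 4,
#                        "attention_probs_dropout_prob": 0.1})()
# attn = BertSelfAttentionSketch(cfg)
# out = attn(torch.randn(2, 10, 64), torch.zeros(2, 1, 1, 10))
# print(out.shape)  # torch.Size([2, 10, 64])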
