Commit d7fd1e1

Accuracy checker support for bloomz-560m (#3801)

* Support for attention mask in wikitext2raw annotation converter
* Support for bloomz-560m
* Added transformers and scipy to requirements
* Unified tokenizer preparation
* Changed scipy import location
* Returned to the original log_softmax implementation
* Cosmetic changes
* Returned to log_softmax from scipy: the original log_softmax implementation hit a division by zero on the whole dataset
* A few data type fixes
* Corrected meta shapes preparation
* Corrected fit_to_input to return the proper data type
* Aligned the scipy version with the other requirements
* Removed an unneeded change
* README update; conditional use of log_softmax from scipy
1 parent e13f63f

5 files changed: +47 −9 lines

tools/accuracy_checker/openvino/tools/accuracy_checker/annotation_converters/README.md

Lines changed: 5 additions & 0 deletions
```diff
@@ -370,6 +370,11 @@ The main difference between this converter and `super_resolution` in data organization
 * `squad_emb` - converts the Stanford Question Answering Dataset ([SQuAD](https://rajpurkar.github.io/SQuAD-explorer/)) to `Question Answering Embedding Annotation`. **Note: This converter not only converts data to metric specific format but also tokenize and encodes input for model.**
   * `testing_file` - path to testing file.
   * `vocab_file` - path to model co vocabulary file.
+  * `class_token_first` - Add [CLS] token to the begin of sequence. If False, will be added as the last token.
+  * `enable_padding` - pad input sequence to max length.
+  * `tokenizer_dir` - path to a directory containing vocabulary files required by the transformers tokenizer
+  * `model_id` - model id of a predefined tokenizer hosted inside a model repo on huggingface.co.
+  * `lower_case` - converts output to lower case.
   * `max_seq_length` - maximum total input sequence length after word-piece tokenization (Optional, default value is 128).
   * `max_query_length` - maximum number of tokens for the question (Optional, default value is 64).
   * `lower_case` - allows switching tokens to lower case register. It is useful for working with uncased models (Optional, default value is False)
```
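Taken together with the converter changes below, these options let a config swap the local vocab/merges files for a transformers tokenizer. A rough sketch of how the new keys could combine (a Python mapping standing in for the usual YAML; the file names are hypothetical, `model_id` taking precedence over the local BPE files is an assumption, and the model id is the one this PR targets):

```python
# Illustrative only, not a verified config: real Accuracy Checker configs are
# YAML, the file names here are made up, and model_id is assumed to take
# precedence over the local BPE files.
converter_config = {
    'converter': 'wikitext2raw',
    'testing_file': 'wiki.test.raw',
    'vocab_file': 'gpt2-vocab.json',        # BPE fallback inputs (hypothetical)
    'merges_file': 'gpt2-merges.txt',
    'model_id': 'bigscience/bloomz-560m',   # predefined tokenizer on huggingface.co
    'lower_case': False,
    'max_seq_length': 128,
}
```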

tools/accuracy_checker/openvino/tools/accuracy_checker/annotation_converters/wikitext2raw.py

Lines changed: 25 additions & 5 deletions
```diff
@@ -17,9 +17,10 @@
 import numpy as np
 
 from ..representation import LanguageModelingAnnotation
-from ..config import PathField, NumberField
+from ..config import PathField, NumberField, StringField, BoolField
 from ..utils import UnsupportedPackage
 from .format_converter import BaseFormatConverter, ConverterReturn
+from ._nlp_common import get_tokenizer
 
 try:
     from tokenizers import Tokenizer, pre_tokenizers, decoders
@@ -42,6 +43,19 @@ def parameters(cls):
             'testing_file': PathField(description="Path to testing file."),
             'merges_file': PathField(description="Path to merges file."),
             'vocab_file': PathField(description='Path to vocabulary file.'),
+            'class_token_first': BoolField(
+                optional=True, default=True,
+                description='Add [CLS] token to the begin of sequence. If False, will be added as the last token.'),
+            'enable_padding': BoolField(optional=True, default=True, description='pad input sequence to max length'),
+            'tokenizer_dir': PathField(
+                optional=True, is_directory=True,
+                description='A path to a directory containing vocabulary files required by the transformers tokenizer'
+            ),
+            'model_id': StringField(
+                optional=True,
+                description='The model id of a predefined tokenizer hosted inside a model repo on huggingface.co'
+            ),
+            'lower_case': BoolField(optional=True, default=False, description='converts output to lower case'),
             'max_seq_length': NumberField(
                 description='The maximum total input sequence length after tokenization.',
                 optional=True, default=128, value_type=int
@@ -57,29 +71,35 @@ def configure(self):
         self.vocab_file = self.get_value_from_config('vocab_file')
         self.merges_file = self.get_value_from_config('merges_file')
         self.max_seq_length = int(self.get_value_from_config('max_seq_length'))
-        self.tokenizer = Tokenizer(BPE.from_file(str(self.vocab_file), str(self.merges_file)))
+        self.model_id = self.get_value_from_config('model_id')
+        self.lower_case = self.get_value_from_config('lower_case')
+        self.tokenizer, self.external_tok = get_tokenizer(self.config, self.lower_case)
+        if not self.external_tok:
+            self.tokenizer = Tokenizer(BPE.from_file(str(self.vocab_file), str(self.merges_file)))
+            self.tokenizer.decoder = decoders.ByteLevel()
         self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
-        self.tokenizer.decoder = decoders.ByteLevel()
 
     def convert(self, check_content=False, progress_callback=None, progress_interval=100, **kwargs):
         with open(str(self.testing_file), encoding="utf-8") as f:
             text = f.read()
 
-        tokens = self.tokenizer.encode_batch([text])
+        tokens = self.tokenizer([text]) if self.external_tok else self.tokenizer.encode_batch([text])
 
         encoding = tokens[0]
         annotations = []
         unique_id = 1000000000
         for idx in range(0, len(encoding.ids) - self.max_seq_length + 1, self.max_seq_length):
             ids = encoding.ids[idx: idx + self.max_seq_length]
             tokens = encoding.tokens[idx:idx + self.max_seq_length]
-            identifier = ['input_ids_{}'.format(idx), 'labels_{}'.format(idx)]
+            attention_mask = encoding.attention_mask[idx:idx + self.max_seq_length]
+            identifier = ['input_ids_{}'.format(idx), 'input_mask_{}'.format(idx), 'labels_{}'.format(idx)]
             annotation = LanguageModelingAnnotation(
                 identifier,
                 np.array(unique_id),
                 np.array([ids]),
                 tokens,
                 labels=np.array(ids),
+                input_mask=np.array([attention_mask])
             )
             annotations.append(annotation)
             unique_id += 1
```
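`get_tokenizer` itself lives in `_nlp_common` and its body is not part of this commit, so the sketch below is an assumption about its contract, inferred from how `configure()` and `convert()` use the result:

```python
# A sketch only: get_tokenizer is imported from ._nlp_common and is not shown
# in this diff, so the signature details and fallback order are assumptions.
from transformers import AutoTokenizer

def get_tokenizer(config, lower_case):
    """Return (tokenizer, is_external).

    is_external is True when a transformers tokenizer could be built from
    'model_id' or 'tokenizer_dir'; otherwise (None, False) tells the caller
    to keep its own tokenizers-library BPE pipeline. (How lower_case is
    applied is not visible in this diff, so it is left unused here.)
    """
    source = config.get('model_id') or config.get('tokenizer_dir')
    if source is None:
        return None, False
    # Must be a fast tokenizer: convert() indexes the batch result with [0]
    # and reads .ids / .tokens / .attention_mask from the Encoding object,
    # which only fast tokenizers provide.
    tokenizer = AutoTokenizer.from_pretrained(str(source), use_fast=True)
    return tokenizer, True
```

With `model_id` set to `bigscience/bloomz-560m`, `self.tokenizer([text])[0]` in `convert` yields a fast-tokenizer `Encoding`, so the same `.ids`/`.tokens`/`.attention_mask` slicing serves both the external and the BPE path.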

tools/accuracy_checker/openvino/tools/accuracy_checker/launcher/openvino_launcher.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -806,7 +806,7 @@ def fit_to_input(self, data, layer_name, layout, precision, template=None):
         data = data.astype(precision)
         if layer_name in self.dyn_input_layers:
             self._do_reshape = not self.is_dynamic
-            return data, template
+            return data
         data_shape = np.shape(data)
         if data_shape != layer_shape:
             if self.allow_reshape_input:
@@ -836,7 +836,7 @@ def _data_to_blob_dyn(layer_rang, data, layout, template=None):
             template = [1] * (np.ndim(data) - len(template)) + template
         if len(template) > np.ndim(data):
             template = template[0]
-        if len(layout) == len(data_shape):
+        if layout and len(layout) == len(data_shape):
             if template is not None:
                 new_template = [template[l_dim] for l_dim in layout]
                 template = new_template
```
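Both hunks are defensive fixes ("Corrected fit_to_input to return the proper data type" in the commit message): dynamic input layers now get back just the data, and `_data_to_blob_dyn` no longer assumes a layout exists. A standalone illustration of the new guard, with assumed values, since a layout can plausibly be `None` or empty for a fully dynamic input:

```python
# Assumed values for illustration: a dynamic input for which no layout was
# resolved. With the old check, len(None) raises TypeError immediately.
layout = None
data_shape = (1, 128)

if layout and len(layout) == len(data_shape):   # short-circuits on falsy layout
    print('reorder template dimensions by layout')
else:
    print('leave template as-is')               # taken when layout is None or ()
```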

tools/accuracy_checker/openvino/tools/accuracy_checker/metrics/language_modeling.py

Lines changed: 13 additions & 1 deletion
```diff
@@ -15,6 +15,13 @@
 """
 
 import numpy as np
+from ..utils import UnsupportedPackage
+
+try:
+    from scipy.special import log_softmax as scipy_log_softmax
+except ImportError as import_error:
+    scipy_log_softmax = UnsupportedPackage('scipy', import_error.msg)
+
 
 from ..representation import LanguageModelingAnnotation, LanguageModelingPrediction
 from .metric import PerImageEvaluationMetric
@@ -33,7 +40,12 @@ def __init__(self, *args, **kwargs):
 
     def update(self, annotation, prediction):
         def cross_entropy(logits, target):
-            return nll_loss(log_softmax(logits, 1), target)
+            log_softmax_res = log_softmax(logits, 1)
+            if -np.inf in log_softmax_res:
+                if isinstance(scipy_log_softmax, UnsupportedPackage):
+                    scipy_log_softmax.raise_error(self.__provider__)
+                log_softmax_res = scipy_log_softmax(logits, 1)
+            return nll_loss(log_softmax_res, target)
 
         def log_softmax(x, dim):
             e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
```
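This is the conditional fallback the commit message describes: the hand-rolled `log_softmax` produced a division by zero on the whole dataset, so scipy's implementation takes over whenever the naive result contains `-inf`. A standalone demo with an assumed extreme logit gap reproduces the failure mode: `exp(x - max)` underflows to zero, NumPy reports the subsequent `log(0)` as a divide-by-zero and returns `-inf`, while `scipy.special.log_softmax` stays in log space and remains finite.

```python
import numpy as np
from scipy.special import log_softmax

# Assumed logits with an extreme gap; gaps of this size are what reportedly
# broke the original implementation on the full dataset.
logits = np.array([[0.0, -800.0]])

# Hand-rolled version, as defined in the metric: exp(-800) underflows to 0.0,
# so np.log warns about a divide by zero and emits -inf.
e_x = np.exp(logits - np.max(logits, axis=-1, keepdims=True))
naive = np.log(e_x / np.sum(e_x, axis=-1, keepdims=True))
print(naive)                   # [[  0. -inf]]  -> nll_loss turns this into inf

# scipy computes x - max - log(sum(exp(x - max))) and never takes log(0):
print(log_softmax(logits, 1))  # [[   0. -800.]] -- finite
```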

tools/accuracy_checker/openvino/tools/accuracy_checker/representation/nlp_representation.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -44,11 +44,12 @@ def __init__(self, identifier=''):
 
 
 class LanguageModelingAnnotation(LanguageModeling):
-    def __init__(self, identifier, unique_id, input_ids, tokens, labels=None):
+    def __init__(self, identifier, unique_id, input_ids, tokens, labels=None, input_mask=None):
         super().__init__(identifier)
         self.unique_id = unique_id
         self.tokens = tokens
         self.input_ids = input_ids
+        self.input_mask = input_mask
         self.labels = labels if labels is not None else []
 
 
```
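The new keyword argument defaults to `None`, so existing call sites are untouched while the converter above can attach the attention mask. A toy construction (made-up values; assumes the package is importable as `openvino.tools.accuracy_checker`, matching the file paths in this commit):

```python
import numpy as np
from openvino.tools.accuracy_checker.representation import LanguageModelingAnnotation

ids = [3, 17, 52]  # made-up token ids

# Old-style call keeps working: input_mask simply stays None.
legacy = LanguageModelingAnnotation(
    ['input_ids_0', 'labels_0'], np.array(1000000000),
    np.array([ids]), ['a', 'b', 'c'], labels=np.array(ids))

# New-style call mirrors what the wikitext2raw converter now produces.
masked = LanguageModelingAnnotation(
    ['input_ids_0', 'input_mask_0', 'labels_0'], np.array(1000000000),
    np.array([ids]), ['a', 'b', 'c'], labels=np.array(ids),
    input_mask=np.array([[1, 1, 1]]))
```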
