Completes OPEN-5122 Support latencyColumnName and numOfTokenColumnName

gustavocidornelas · whoseoyster · commit 4ac7687764b4 · 2023-10-17T11:09:10.000-07:00
diff --git a/openlayer/schemas.py b/openlayer/schemas.py
@@ -96,6 +96,11 @@ class BaseDatasetSchema(ma.Schema):
         load_default="en",
         validate=LANGUAGE_CODE_REGEX,
     )
+    latencyColumnName = ma.fields.Str(
+        validate=COLUMN_NAME_VALIDATION_LIST,
+        allow_none=True,
+        load_default=None,
+    )
     metadata = ma.fields.Dict(allow_none=True, load_default={})
     sep = ma.fields.Str(load_default=",")
     timestampColumnName = ma.fields.Str(
@@ -185,6 +190,11 @@ class LLMOutputSchema(BaseDatasetSchema):
     groundTruthColumnName = ma.fields.Str(
         validate=COLUMN_NAME_VALIDATION_LIST, allow_none=True, load_default=None
     )
+    numOfTokenColumnName = ma.fields.Str(
+        validate=COLUMN_NAME_VALIDATION_LIST,
+        allow_none=True,
+        load_default=None,
+    )
     outputColumnName = ma.fields.Str(
         validate=COLUMN_NAME_VALIDATION_LIST,
         allow_none=True,
diff --git a/openlayer/validators/dataset_validators.py b/openlayer/validators/dataset_validators.py
@@ -198,11 +198,13 @@ def _validate_dataset_and_config_consistency(self):
             # Dataset-wide validations
             self._validate_dataset_dtypes()
 
-            # Timestamps and id validations
+            # Timestamps, id, and latency validations
             if self.dataset_config.get("timestampColumnName"):
                 self._validate_timestamps()
             if self.dataset_config.get("inferenceIdColumnName"):
                 self._validate_inference_ids()
+            if self.dataset_config.get("latencyColumnName"):
+                self._validate_latencies()
 
             self._validate_inputs()
             self._validate_outputs()
@@ -297,6 +299,35 @@ def _validate_inference_ids(self):
                     "Please make sure that the inference ids are unique."
                 )
 
+    def _validate_latencies(self):
+        """Checks that the latency column exists and has a numeric dtype."""
+        column = self.dataset_config.get("latencyColumnName")
+        # Failures are recorded in `failed_validations` rather than raised.
+        if column not in self.dataset_df.columns:
+            self.failed_validations.append(
+                f"The latency column `{column}` specified as "
+                "`latencyColumnName` is not in the dataset."
+            )
+        elif not self._values_are_numbers(self.dataset_df, column):
+            # The dtype is only inspected once the column is known to exist.
+            self.failed_validations.append(
+                f"The latencies in the column `{column}` specified"
+                " as `latencyColumnName` are not in the correct format. "
+                "Please make sure that the dtype of the column with the latencies "
+                "is one of int32, int64, float32, or float64."
+            )
+
+    def _values_are_numbers(self, dataset_df: pd.DataFrame, column_name: str) -> bool:
+        """Tells whether a column holds numbers, judging by its dtype alone.
+
+        Returns True only when the dtype name of `dataset_df[column_name]` is
+        exactly one of int32, int64, float32, or float64.
+        """
+        # Membership is tested on the dtype name, so no individual value is
+        # inspected and no other dtype (e.g., int16 or object) is accepted.
+        accepted_dtype_names = ("int64", "int32", "float32", "float64")
+        return dataset_df[column_name].dtype.name in accepted_dtype_names
+
     @abstractmethod
     def _validate_inputs(self):
         """To be implemented by InputValidator child classes."""
@@ -717,6 +748,7 @@ def _validate_outputs(self):
         """Validates the LLM outputs (i.e., ground truth and output)."""
         self.ground_truth_column_name = self.dataset_config.get("groundTruthColumnName")
         self.output_column_name = self.dataset_config.get("outputColumnName")
+        self.num_of_token_column_name = self.dataset_config.get("numOfTokenColumnName")
 
         if self.ground_truth_column_name:
             self._validate_ground_truth()
@@ -727,6 +759,9 @@ def _validate_outputs(self):
         if self.ground_truth_column_name and self.output_column_name:
             self._validate_ground_truth_and_output_columns_different()
 
+        if self.num_of_token_column_name:
+            self._validate_num_of_token()
+
     def _validate_ground_truth(self):
         """Validations on the ground truth column."""
         if self.ground_truth_column_name not in self.dataset_df.columns:
@@ -773,6 +808,23 @@ def _validate_ground_truth_and_output_columns_different(self):
                 "Please specify different columns for the output and the ground truths."
             )
 
+    def _validate_num_of_token(self):
+        """Checks that the number of tokens column exists and is numeric."""
+        column = self.num_of_token_column_name
+        if column not in self.dataset_df.columns:
+            self.failed_validations.append(
+                f"The number of tokens column `{column}` "
+                "specified as `numOfTokenColumnName` is not in the dataset."
+            )
+        elif not self._values_are_numbers(self.dataset_df, column):
+            # The dtype is only inspected once the column is known to exist.
+            self.failed_validations.append(
+                f"The number of tokens in the column `{column}`"
+                " specified as `numOfTokenColumnName` are not in the correct format. "
+                "Please make sure that the dtype of the column with the number of"
+                " tokens is one of int32, int64, float32, or float64."
+            )
+
 
 class RegressionOutputValidator(BaseDatasetValidator):
     """Validates regression outputs.