Completes OPEN-5038 Reference datasets should have label 'reference'

gustavocidornelas · whoseoyster · commit e0bb27cb1ffe · 2023-10-09T20:34:37.000-07:00
diff --git a/openlayer/__init__.py b/openlayer/__init__.py
@@ -1723,11 +1723,13 @@ def upload_reference_dataset(
                 "Either `dataset_config` or `dataset_config_file_path` must be"
                 " provided."
             )
+        if dataset_config_file_path is not None:
+            dataset_config = utils.read_yaml(dataset_config_file_path)
+        dataset_config["label"] = "reference"
 
         # Validate dataset
         dataset_validator = dataset_validators.get_validator(
             task_type=task_type,
-            dataset_config_file_path=dataset_config_file_path,
             dataset_config=dataset_config,
             dataset_file_path=file_path,
         )
@@ -1740,8 +1742,6 @@ def upload_reference_dataset(
             ) from None
 
         # Load dataset config and augment with defaults
-        if dataset_config_file_path is not None:
-            dataset_config = utils.read_yaml(dataset_config_file_path)
         dataset_data = DatasetSchema().load(
             {"task_type": task_type.value, **dataset_config}
         )
diff --git a/openlayer/datasets.py b/openlayer/datasets.py
@@ -28,6 +28,8 @@ class DatasetType(Enum):
     Training = "training"
     #: For production data.
     Production = "production"
+    #: For reference datasets.
+    Reference = "reference"
 
 
 class Dataset:
diff --git a/openlayer/schemas.py b/openlayer/schemas.py
@@ -210,7 +210,7 @@ class RegressionOutputSchema(BaseDatasetSchema):
 class LLMDatasetSchema(LLMInputSchema, LLMOutputSchema):
     """LLM dataset schema."""
 
-    # Override the label to allow for a 'fine-tuning' label instead
+    # Overwrite the label to allow for a 'fine-tuning' label instead
     # of the 'training' label
     label = ma.fields.Str(
         validate=ma.validate.OneOf(
@@ -490,3 +490,79 @@ class ProjectSchema(ma.Schema):
             + " https://reference.openlayer.com/reference/api/openlayer.TaskType.html.\n ",
         ),
     )
+
+
+# ---------------------------- Reference datasets ---------------------------- #
+class LLMReferenceDatasetSchema(LLMDatasetSchema):
+    """LLM dataset schema that only accepts the 'reference' label."""
+
+    # Override the inherited `label` field: 'reference' is the only valid value.
+    label = ma.fields.Str(
+        validate=ma.validate.OneOf(
+            [DatasetType.Reference.value],
+            error="`label` not supported. The supported `labels` are 'reference'.",
+        ),
+        required=True,
+    )
+
+
+class TabularClassificationReferenceDatasetSchema(TabularClassificationDatasetSchema):
+    """Tabular classification dataset schema that only accepts the 'reference' label."""
+
+    # Override the `label` field: 'reference' is the only valid value.
+    label = ma.fields.Str(
+        validate=ma.validate.OneOf(
+            [DatasetType.Reference.value],
+            error="`label` not supported. The supported `labels` are 'reference'.",
+        ),
+        required=True,
+    )
+
+
+class TabularRegressionReferenceDatasetSchema(TabularRegressionDatasetSchema):
+    """Tabular regression dataset schema that only accepts the 'reference' label."""
+
+    # Override the `label` field: 'reference' is the only valid value.
+    label = ma.fields.Str(
+        validate=ma.validate.OneOf(
+            [DatasetType.Reference.value],
+            error="`label` not supported. The supported `labels` are 'reference'.",
+        ),
+        required=True,
+    )
+
+
+class TextClassificationReferenceDatasetSchema(TextClassificationDatasetSchema):
+    """Text classification dataset schema that only accepts the 'reference' label."""
+
+    # Override the `label` field: 'reference' is the only valid value.
+    label = ma.fields.Str(
+        validate=ma.validate.OneOf(
+            [DatasetType.Reference.value],
+            error="`label` not supported. The supported `labels` are 'reference'.",
+        ),
+        required=True,
+    )
+
+
+class ReferenceDatasetSchema(maos.OneOfSchema):
+    """One-of schema for reference datasets.
+    Selects the reference schema that corresponds to the `task_type` field."""
+
+    type_field = "task_type"
+    # pylint: disable=line-too-long
+    type_schemas = {
+        TaskType.TabularClassification.value: TabularClassificationReferenceDatasetSchema,
+        TaskType.TabularRegression.value: TabularRegressionReferenceDatasetSchema,
+        TaskType.TextClassification.value: TextClassificationReferenceDatasetSchema,
+        TaskType.LLM.value: LLMReferenceDatasetSchema,
+        TaskType.LLMNER.value: LLMReferenceDatasetSchema,
+        TaskType.LLMQuestionAnswering.value: LLMReferenceDatasetSchema,
+        TaskType.LLMSummarization.value: LLMReferenceDatasetSchema,
+        TaskType.LLMTranslation.value: LLMReferenceDatasetSchema,
+    }
+
+    def get_obj_type(self, obj):
+        if obj not in [task_type.value for task_type in TaskType]:
+            raise ma.ValidationError(f"Unknown object type: {obj.__class__.__name__}")
+        return obj
diff --git a/openlayer/validators/dataset_validators.py b/openlayer/validators/dataset_validators.py
@@ -102,28 +102,66 @@ def _validate_dataset_config(self):
 
         Beware of the order of the validations, as it is important.
         """
+        self._validate_file_existence()
+        self._load_dataset_config()
+        self._validate_dataset_label()
+        self._validate_dataset_schema()
+
+    def _validate_file_existence(self):
+        """Flags a set `dataset_config_file_path` that is not an existing file."""
         # File existence check
         if self.dataset_config_file_path:
             if not os.path.isfile(os.path.expanduser(self.dataset_config_file_path)):
                 self.failed_validations.append(
                     f"File `{self.dataset_config_file_path}` does not exist."
                 )
-            else:
+
+    def _load_dataset_config(self):
+        """Loads the YAML at `dataset_config_file_path` into `self.dataset_config`;
+        a file that cannot be read or parsed is recorded as a failed validation."""
+        if self.dataset_config_file_path:
+            try:
                 with open(
                     self.dataset_config_file_path, "r", encoding="UTF-8"
                 ) as stream:
                     self.dataset_config = yaml.safe_load(stream)
+            except (OSError, UnicodeError, yaml.YAMLError):
+                self.failed_validations.append(
+                    f"File `{self.dataset_config_file_path}` is not a valid .yaml file."
+                )
 
+    def _validate_dataset_label(self):
+        """Checks that the dataset config has a `label` and that it is a string."""
         if self.dataset_config:
-            dataset_schema = schemas.DatasetSchema()
-            try:
-                dataset_schema.load(
-                    {"task_type": self.task_type.value, **self.dataset_config}
+            if self.dataset_config.get("label") is None:
+                self.failed_validations.append(
+                    "Missing value for required property `label` in the dataset config."
                 )
-            except ma.ValidationError as err:
-                self.failed_validations.extend(
-                    self._format_marshmallow_error_message(err)
+            else:
+                label = self.dataset_config["label"]
+                if not isinstance(label, str):
+                    self.failed_validations.append(
+                        "The value of `label` in the dataset config must be a string."
+                    )
+
+    def _validate_dataset_schema(self):
+        """Validates the config with the reference or default schema, per `label`."""
+        if self.dataset_config:
+            label = self.dataset_config.get("label")
+            if label:
+                dataset_schema = (
+                    schemas.ReferenceDatasetSchema()
+                    if label == "reference"
+                    else schemas.DatasetSchema()
                 )
+                try:
+                    dataset_schema.load(
+                        {"task_type": self.task_type.value, **self.dataset_config}
+                    )
+                except ma.ValidationError as err:
+                    self.failed_validations.extend(
+                        self._format_marshmallow_error_message(err)
+                    )
 
     def _validate_dataset_file(self):
         """Checks whether the dataset file exists and is valid.