
Commit 71edc0f

gustavocidornelas authored and whoseoyster committed
Refactor marshmallow schemas and add prompt to LLM production data config
1 parent 2fa8af8 commit 71edc0f

16 files changed, with 745 additions and 640 deletions.

openlayer/__init__.py

Lines changed: 15 additions & 15 deletions
@@ -39,7 +39,7 @@
 from .inference_pipelines import InferencePipeline
 from .project_versions import ProjectVersion
 from .projects import Project
-from .schemas import BaselineModelSchema, DatasetSchema, ModelSchema
+from .schemas import dataset_schemas, model_schemas
 from .tasks import TaskType
 from .validators import (
     baseline_model_validators,
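
The import swap is the core of the refactor: `openlayer.schemas` appears to go from a single module exporting schema classes to a package with per-resource modules. A minimal sketch of the assumed layout follows; only the class names visible in this diff are confirmed, the file split itself is inferred:

    # Assumed package layout (inferred from the new import; paths are hypothetical):
    #   openlayer/schemas/dataset_schemas.py -> DatasetSchema, ReferenceDatasetSchema, ProductionDataSchema
    #   openlayer/schemas/model_schemas.py   -> ModelSchema, BaselineModelSchema
    from openlayer.schemas import dataset_schemas, model_schemas

    schema = model_schemas.ModelSchema()  # classes now referenced through the module namespace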
@@ -334,7 +334,9 @@ def add_model(
         # Load model config and augment with defaults
         if model_config_file_path is not None:
             model_config = utils.read_yaml(model_config_file_path)
-            model_data = ModelSchema().load({"task_type": task_type.value, **model_config})
+            model_data = model_schemas.ModelSchema().load(
+                {"task_type": task_type.value, **model_config}
+            )

         # Copy relevant resources to temp directory
         with tempfile.TemporaryDirectory() as temp_dir:
@@ -432,7 +434,7 @@ def add_baseline_model(
         if model_config_file_path is not None:
             model_config = utils.read_yaml(model_config_file_path)
             model_config["modelType"] = "baseline"
-            model_data = BaselineModelSchema().load(
+            model_data = model_schemas.BaselineModelSchema().load(
                 {"task_type": task_type.value, **model_config}
             )

@@ -481,7 +483,7 @@ def add_dataset(
         # Load dataset config and augment with defaults
         if dataset_config_file_path is not None:
             dataset_config = utils.read_yaml(dataset_config_file_path)
-            dataset_data = DatasetSchema().load(
+            dataset_data = dataset_schemas.DatasetSchema().load(
                 {"task_type": task_type.value, **dataset_config}
             )
             if dataset_data.get("columnNames") is None:
@@ -930,7 +932,7 @@ def create_inference_pipeline(
                 " upload.",
             ) from None

-        reference_dataset_data = DatasetSchema().load(
+        reference_dataset_data = dataset_schemas.ReferenceDatasetSchema().load(
             {"task_type": task_type.value, **reference_dataset_config}
         )

@@ -1034,7 +1036,7 @@ def upload_reference_dataset(
             ) from None

         # Load dataset config and augment with defaults
-        dataset_data = DatasetSchema().load(
+        dataset_data = dataset_schemas.ReferenceDatasetSchema().load(
             {"task_type": task_type.value, **dataset_config}
         )

@@ -1116,7 +1118,10 @@ def stream_data(
         stream_config, stream_df = self._add_default_columns(
             config=stream_config, df=stream_df
         )
-        stream_config = self._strip_read_only_fields(stream_config)
+
+        # Remove the `label` for the upload
+        stream_config.pop("label", None)
+
         body = {
             "config": stream_config,
             "rows": stream_df.to_dict(orient="records"),
@@ -1129,13 +1134,6 @@ def stream_data(
         if self.verbose:
             print("Stream published!")

-    def _strip_read_only_fields(self, config: Dict[str, any]) -> Dict[str, any]:
-        """Strips read-only fields from the config."""
-        stripped_config = copy.deepcopy(config)
-        for field in ["columnNames", "label"]:
-            stripped_config.pop(field, None)
-        return stripped_config
-
     def publish_batch_data(
         self,
         inference_pipeline_id: str,
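
Taken together with the previous hunk, this changes behavior as well as structure: the removed helper worked on a deep copy and stripped both `columnNames` and `label`, while the inline replacement mutates `stream_config` and pops only `label`, so `columnNames` now survives into the request body. A standalone sketch of the difference, using a made-up config:

    import copy

    config = {"columnNames": ["text"], "label": "production"}

    # Old: deep-copy, then strip both read-only fields.
    old = copy.deepcopy(config)
    for field in ["columnNames", "label"]:
        old.pop(field, None)
    assert old == {}

    # New: mutate in place, dropping only `label`.
    config.pop("label", None)
    assert config == {"columnNames": ["text"]}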
@@ -1245,7 +1243,9 @@ def _validate_production_data_and_load_config(
             "Make sure to fix all of the issues listed above before the upload.",
         ) from None

-        config = DatasetSchema().load({"task_type": task_type.value, **config})
+        config = dataset_schemas.ProductionDataSchema().load(
+            {"task_type": task_type.value, **config}
+        )

         return config
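
The pattern across the hunks in this file: call sites that all funneled through `DatasetSchema` now load through a schema matched to the data's role (reference vs. production vs. training/validation). A hedged sketch of the resulting dispatch; this helper is illustrative, not part of the codebase:

    from openlayer.schemas import dataset_schemas

    # Mapping inferred from the call sites above (hypothetical helper).
    _SCHEMAS = {
        "dataset": dataset_schemas.DatasetSchema,             # add_dataset
        "reference": dataset_schemas.ReferenceDatasetSchema,  # reference uploads
        "production": dataset_schemas.ProductionDataSchema,   # streamed/published data
    }

    def load_config(role: str, task_type_value: str, config: dict) -> dict:
        # marshmallow's load() validates and deserializes, raising a
        # ValidationError when the config doesn't match the schema.
        return _SCHEMAS[role]().load({"task_type": task_type_value, **config})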

openlayer/constants.py

Lines changed: 22 additions & 0 deletions
@@ -2,6 +2,8 @@
 """
 import os

+import marshmallow as ma
+
 # ---------------------------- Commit/staging flow --------------------------- #
 VALID_RESOURCE_NAMES = {"model", "training", "validation", "fine-tuning"}
 OPENLAYER_DIR = os.path.join(os.path.expanduser("~"), ".openlayer")
@@ -12,3 +14,23 @@

 # ----------------------------------- APIs ----------------------------------- #
 REQUESTS_TIMEOUT = 60 * 60 * 3  # 3 hours
+
+# ---------------------------- Validation patterns --------------------------- #
+COLUMN_NAME_REGEX = validate = ma.validate.Regexp(
+    r"^(?!openlayer)[a-zA-Z0-9_-]+$",
+    error="strings that are not alphanumeric with underscores or hyphens."
+    + " Spaces and special characters are not allowed."
+    + " The string cannot start with `openlayer`.",
+)
+LANGUAGE_CODE_REGEX = ma.validate.Regexp(
+    r"^[a-z]{2}(-[A-Z]{2})?$",
+    error="`language` of the dataset is not in the ISO 639-1 (alpha-2 code) format.",
+)
+
+COLUMN_NAME_VALIDATION_LIST = [
+    ma.validate.Length(
+        min=1,
+        max=60,
+    ),
+    COLUMN_NAME_REGEX,
+]
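
Note the stray chained assignment in `COLUMN_NAME_REGEX = validate = ma.validate.Regexp(...)`, which also binds a module-level `validate` name and looks unintentional. These constants are presumably consumed by the new schema modules; a self-contained sketch of how `COLUMN_NAME_VALIDATION_LIST` would attach to a marshmallow field (the schema and field names here are hypothetical):

    import marshmallow as ma

    from openlayer import constants

    class ColumnsSchema(ma.Schema):
        # marshmallow accepts a list of validators per field.
        timestampColumnName = ma.fields.Str(
            validate=constants.COLUMN_NAME_VALIDATION_LIST,
        )

    ColumnsSchema().load({"timestampColumnName": "openlayer_ts"})
    # Raises ma.ValidationError: the regex rejects names starting with `openlayer`.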

openlayer/datasets.py

Lines changed: 6 additions & 4 deletions
@@ -22,14 +22,16 @@ class DatasetType(Enum):
     Used by the ``dataset_type`` argument of the :meth:`openlayer.OpenlayerClient.add_dataset` and
     :meth:`openlayer.OpenlayerClient.add_dataframe` methods."""

-    #: For validation sets.
-    Validation = "validation"
-    #: For training sets.
-    Training = "training"
+    #: For fine-tuning data.
+    FineTuning = "fine-tuning"
     #: For production data.
     Production = "production"
     #: For reference datasets.
     Reference = "reference"
+    #: For training sets.
+    Training = "training"
+    #: For validation sets.
+    Validation = "validation"


 class Dataset:
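
This hunk adds a `FineTuning` member and reorders the members alphabetically; since only declaration order changes and the string values stay the same, value lookups behave as before. A quick usage sketch:

    from openlayer.datasets import DatasetType

    assert DatasetType.FineTuning.value == "fine-tuning"
    assert DatasetType("validation") is DatasetType.Validation  # lookups unchanged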
