Skip to content

Commit c7ef11f

Browse files
Brush up in error messages and validations
1 parent 38475dc commit c7ef11f

File tree

4 files changed

+159
-117
lines changed

4 files changed

+159
-117
lines changed

unboxapi/__init__.py

Lines changed: 78 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@
22
import os
33
import pandas as pd
44
import shutil
5+
import sys
56
import tarfile
67
import tempfile
78
import traceback
89
import uuid
10+
import warnings
911

1012
from enum import Enum
1113
from typing import Callable, Dict, List, Optional
@@ -35,6 +37,25 @@
3537
from marshmallow import ValidationError
3638

3739

40+
class HidePrints:
41+
"""Class that suppresses the prints and warnings to stdout and Jupyter's stdout. Used
42+
to hide the print / warning statements that can be inside the uploaded function while
43+
we test it.
44+
"""
45+
46+
def __enter__(self):
47+
self._original_stdout = sys.stdout
48+
sys.stdout = open(os.devnull, "w")
49+
sys._jupyter_stdout = sys.stdout
50+
warnings.filterwarnings("ignore")
51+
52+
def __exit__(self, exc_type, exc_val, exc_tb):
53+
sys.stdout.close()
54+
sys.stdout = self._original_stdout
55+
sys._jupyter_stdout = sys.stdout
56+
warnings.filterwarnings("default")
57+
58+
3859
class DeploymentType(Enum):
3960
"""Specify the storage medium being used by your Unbox deployment."""
4061

@@ -95,6 +116,10 @@ def create_project(
95116
description=description,
96117
)
97118
project_data = self.api.post_request(endpoint, body=payload)
119+
120+
print(
121+
f"Creating project on Unbox! Check out https://unbox.ai/projects to have a look!"
122+
)
98123
return Project(project_data, self.upload, self.subscription_plan, self)
99124

100125
def load_project(self, name: str):
@@ -355,11 +380,11 @@ def add_model(
355380
]:
356381
raise UnboxValidationError(
357382
"`task_type` must be either TaskType.TabularClassification or TaskType.TextClassification. \n"
358-
)
383+
) from None
359384
if model_type not in [model_framework for model_framework in ModelType]:
360385
raise UnboxValidationError(
361386
"`model_type` must be one of the supported ModelTypes. Check out our API reference for a full list https://reference.unbox.ai/reference/api/unboxapi.ModelType.html. \n"
362-
)
387+
) from None
363388
model_schema = ModelSchema()
364389
try:
365390
model_schema.load(
@@ -379,7 +404,7 @@ def add_model(
379404
}
380405
)
381406
except ValidationError as err:
382-
raise UnboxValidationError(self._format_error_message(err))
407+
raise UnboxValidationError(self._format_error_message(err)) from None
383408

384409
# --------------------------- Resource validations --------------------------- #
385410
# Requirements check
@@ -389,21 +414,21 @@ def add_model(
389414
raise UnboxResourceError(
390415
f"The file path `{requirements_txt_file}` specified on `requirements_txt_file` does not"
391416
" contain a file with the requirements. \n"
392-
)
417+
) from None
393418

394419
# Setup script
395420
if setup_script and not os.path.isfile(os.path.expanduser(setup_script)):
396421
raise UnboxResourceError(
397422
f"The file path `{setup_script}` specified on `setup_script` does not"
398423
" contain a file with the bash script with commands required before model loading. \n"
399-
)
424+
) from None
400425

401426
# Dependent dir
402427
if dependent_dir and dependent_dir == os.getcwd():
403428
raise UnboxResourceError(
404429
"`dependent_dir` cannot be the working directory. \n",
405-
mitigation=f"Make sure that the specified `dependent_dir` is different than {os.getcwd()}",
406-
)
430+
mitigation=f"Make sure that the specified `dependent_dir` is different than `{os.getcwd()}`.",
431+
) from None
407432

408433
# Training set
409434
if task_type in [TaskType.TabularClassification, TaskType.TabularRegression]:
@@ -412,15 +437,15 @@ def add_model(
412437
context="There is an issue with the specified `train_sample_df`. \n",
413438
message=f"The `train_sample_df` is too small, with only {len(train_sample_df.index)} rows. \n",
414439
mitigation="Make sure to upload a training set sample with at least 100 rows.",
415-
)
440+
) from None
416441
if train_sample_df.isnull().values.any():
417442
raise UnboxResourceError(
418443
context="There is an issue with the specified `train_sample_df`. \n",
419444
message=f"The `train_sample_df` contains missing values. \n",
420445
mitigation="Currently, Unbox does not support datasets with missing values."
421446
+ " Make sure to upload a training set sample without missing values by applying the same"
422447
+ " preprocessing steps expected by your model.",
423-
)
448+
) from None
424449

425450
train_sample_df = train_sample_df.sample(
426451
min(3000, len(train_sample_df.index))
@@ -430,7 +455,7 @@ def add_model(
430455
if not isinstance(function, Callable):
431456
raise UnboxValidationError(
432457
f"- The argument `{function}` specified as `function` is not callable. \n"
433-
)
458+
) from None
434459

435460
user_args = function.__code__.co_varnames[: function.__code__.co_argcount][2:]
436461
kwarg_keys = tuple(kwargs)
@@ -439,26 +464,32 @@ def add_model(
439464
context="There is an issue with the specified `function`. \n",
440465
message=f"Your function's additional args {user_args} do not match the kwargs you specified {kwarg_keys}. \n",
441466
mitigation=f"Make sure to include all of the required kwargs to run inference with your `function`.",
442-
)
467+
) from None
443468
try:
444469
if task_type in [
445470
TaskType.TabularClassification,
446471
TaskType.TabularRegression,
447472
]:
448473
test_input = train_sample_df[:3][feature_names].to_numpy()
449-
function(model, test_input, **kwargs)
474+
with HidePrints():
475+
function(model, test_input, **kwargs)
450476
else:
451-
test_input = ["Test predict function.", "Unbox is great!"]
452-
function(model, test_input, **kwargs)
477+
test_input = [
478+
"Unbox is great!",
479+
"Let's see if this function is ready for some error analysis",
480+
]
481+
with HidePrints():
482+
function(model, test_input, **kwargs)
453483
except Exception as e:
454484
exception_stack = "".join(
455485
traceback.format_exception(type(e), e, e.__traceback__)
456486
)
457487
raise UnboxResourceError(
458-
context="There is n issue with the specified `function`. \n",
459-
message=f"It is failing with the following error: \n{exception_stack} \n",
460-
mitigation="Make sure your function receives the model and the input as arguments, plus the additional kwargs.",
461-
)
488+
context="There is an issue with the specified `function`. \n",
489+
message=f"It is failing with the following error: \n{exception_stack}",
490+
mitigation="Make sure your function receives the model and the input as arguments, plus the additional kwargs. Additionally,"
491+
+ " you may find it useful to debug it on the Jupyter notebook, to ensure it is working correctly before uploading it.",
492+
) from None
462493

463494
# Transformers resources
464495
if model_type is ModelType.transformers:
@@ -467,7 +498,7 @@ def add_model(
467498
context="There is a missing keyword argument for the specified model type. \n",
468499
message="The `tokenizer` must be specified in kwargs when using a transformers model. \n",
469500
mitigation="Make sure to specify the additional kwargs needed for the model type.",
470-
)
501+
) from None
471502

472503
# ------------------ Resource-schema consistency validations ----------------- #
473504
# Feature validations
@@ -486,7 +517,7 @@ def add_model(
486517
]
487518
raise UnboxDatasetInconsistencyError(
488519
f"The features {features_not_in_dataset} specified in `feature_names` are not on the dataset. \n"
489-
)
520+
) from None
490521

491522
required_fields = [
492523
(feature_names, "feature_names"),
@@ -498,7 +529,7 @@ def add_model(
498529
raise UnboxDatasetInconsistencyError(
499530
message=f"TabularClassification task with `{field}` missing. \n",
500531
mitigation=f"Make sure to specify `{field}` for tabular classification tasks.",
501-
)
532+
) from None
502533

503534
with TempDirectory() as dir:
504535
bento_service = create_template_model(
@@ -568,14 +599,18 @@ def add_model(
568599
categoricalFeatureNames=categorical_feature_names,
569600
trainSampleLabelColumnName=train_sample_label_column_name,
570601
)
571-
print("Uploading model to Unbox...")
602+
572603
modeldata = self.upload(
573604
endpoint=endpoint,
574605
file_path=tarfile_path,
575606
object_name="tarfile",
576607
body=payload,
577608
)
578609
os.remove("template_model.py")
610+
611+
print(
612+
f"Uploading model to Unbox! Check out https://unbox.ai/models to have a look!"
613+
)
579614
return Model(modeldata)
580615

581616
def add_dataset(
@@ -730,7 +765,7 @@ def add_dataset(
730765
]:
731766
raise UnboxValidationError(
732767
"`task_type` must be either TaskType.TabularClassification or TaskType.TextClassification. \n"
733-
)
768+
) from None
734769
dataset_schema = DatasetSchema()
735770
try:
736771
dataset_schema.load(
@@ -750,15 +785,15 @@ def add_dataset(
750785
}
751786
)
752787
except ValidationError as err:
753-
raise UnboxValidationError(self._format_error_message(err))
788+
raise UnboxValidationError(self._format_error_message(err)) from None
754789

755790
# --------------------------- Resource validations --------------------------- #
756791
exp_file_path = os.path.expanduser(file_path)
757792
object_name = "original.csv"
758793
if not os.path.isfile(exp_file_path):
759794
raise UnboxResourceError(
760-
f"The file path `{file_path}` specified on `file_path` does not contain a file with the dataset."
761-
)
795+
f"The file path `{file_path}` specified on `file_path` does not contain a file with the dataset. \n"
796+
) from None
762797

763798
with open(exp_file_path, "rt") as f:
764799
reader = csv.reader(f, delimiter=sep)
@@ -774,7 +809,7 @@ def add_dataset(
774809
mitigation="Currently, Unbox does not support datasets with missing values."
775810
+ "Make sure to upload a training set sample without missing values by applying the same"
776811
+ " preprocessing steps expected by your model.",
777-
)
812+
) from None
778813

779814
# ------------------ Resource-schema consistency validations ----------------- #
780815
# Label column validations
@@ -783,15 +818,15 @@ def add_dataset(
783818
except ValueError:
784819
raise UnboxDatasetInconsistencyError(
785820
f"The column `{label_column_name}` specified as `label_column_name` is not on the dataset. \n"
786-
)
821+
) from None
787822

788823
dataset_classes = list(df[label_column_name].unique())
789824
if len(dataset_classes) > len(class_names):
790825
raise UnboxDatasetInconsistencyError(
791826
f"There are {len(dataset_classes)} classes represented on the dataset, but there are only"
792827
f" {len(class_names)} items on the `class_names` list. \n",
793828
mitigation=f"Make sure that there are at most {len(class_names)} classes in your dataset.",
794-
)
829+
) from None
795830

796831
# Feature validations
797832
try:
@@ -803,14 +838,14 @@ def add_dataset(
803838
if text_column_name:
804839
raise UnboxDatasetInconsistencyError(
805840
f"The column `{text_column_name}` specified as `text_column_name` is not on the dataset. \n"
806-
)
841+
) from None
807842
else:
808843
features_not_in_dataset = [
809844
feature for feature in feature_names if feature not in headers
810845
]
811846
raise UnboxDatasetInconsistencyError(
812847
f"The features {features_not_in_dataset} specified in `feature_names` are not on the dataset. \n"
813-
)
848+
) from None
814849

815850
# Tag column validation
816851
try:
@@ -819,22 +854,22 @@ def add_dataset(
819854
except ValueError:
820855
raise UnboxDatasetInconsistencyError(
821856
f"The column `{tag_column_name}` specified as `tag_column_name` is not on the dataset. \n"
822-
)
857+
) from None
823858

824859
# ----------------------- Subscription plan validations ---------------------- #
825860
if row_count > self.subscription_plan["datasetSize"]:
826861
raise UnboxSubscriptionPlanException(
827862
f"The dataset you are trying to upload contains {row_count} rows, which exceeds your plan's"
828863
f" limit of {self.subscription_plan['datasetSize']}. \n"
829-
)
864+
) from None
830865
if task_type == TaskType.TextClassification:
831-
max_text_size = df.text_column.str.len().max()
866+
max_text_size = df[text_column_name].str.len().max()
832867
# TODO: set limit per subscription plan
833868
if max_text_size > 100000:
834869
raise UnboxSubscriptionPlanException(
835870
f"The dataset you are trying to upload contains texts with {max_text_size} characters,"
836871
" which exceeds your plan's limit of 100,000 characters."
837-
)
872+
) from None
838873

839874
endpoint = "datasets"
840875
payload = dict(
@@ -850,6 +885,9 @@ def add_dataset(
850885
featureNames=feature_names,
851886
categoricalFeatureNames=categorical_feature_names,
852887
)
888+
print(
889+
f"Uploading dataset to Unbox! Check out https://unbox.ai/datasets to have a look!"
890+
)
853891
return Dataset(
854892
self.upload(
855893
endpoint=endpoint,
@@ -999,6 +1037,11 @@ def add_dataframe(
9991037
... )
10001038
>>> dataset.to_dict()
10011039
"""
1040+
# --------------------------- Resource validations --------------------------- #
1041+
if not isinstance(df, pd.DataFrame):
1042+
raise UnboxValidationError(
1043+
f"- `df` is a {type(df)}, but it must be a pandas dataframe (pd.DataFrame). \n"
1044+
)
10021045
with tempfile.TemporaryDirectory() as tmp_dir:
10031046
file_path = os.path.join(tmp_dir, str(uuid.uuid1()))
10041047
df.to_csv(file_path, index=False)
@@ -1032,14 +1075,3 @@ def _format_error_message(err) -> str:
10321075
temp_msg = list(msg.values())[0][0].lower()
10331076
error_msg += f"- `{input}` contains items that are {temp_msg} \n"
10341077
return error_msg
1035-
1036-
@staticmethod
1037-
def _validate_categorical_features(
1038-
df: pd.DataFrame, categorical_features_map: Dict[str, List[str]]
1039-
):
1040-
for feature, options in categorical_features_map.items():
1041-
if len(df[feature].unique()) > len(options):
1042-
raise UnboxInvalidRequest(
1043-
f"Feature '{feature}' contains more options in the df than provided "
1044-
"for it in `categorical_features_map`"
1045-
)

unboxapi/api.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def __init__(self, api_key: str):
3333
raise UnboxException(
3434
"There is an issue instantiating the UnboxClient. \n"
3535
"No valid API key is being provided. \n"
36-
"Make sure to provide a valid API key, as in `UnboxClient('YOUR_API_KEY _HERE')`. You can find your API keys in the Profile page on the Unbox platform."
36+
"Make sure to provide a valid API key using the syntax `UnboxClient('YOUR_API_KEY_HERE')`. You can find your API keys in your Profile page on the Unbox platform."
3737
)
3838

3939
self.api_key = api_key

unboxapi/exceptions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def __init__(self, message, context=None, mitigation=None):
3434
class UnboxValidationError(UnboxException):
3535
def __init__(self, message, context=None, mitigation=None):
3636
if not context:
37-
context = "There are issues with the data being passed as argument. \n"
37+
context = "There are issues with some of the arguments: \n"
3838
if not mitigation:
3939
mitigation = (
4040
"Make sure to respect the datatypes and constraints specified above."
@@ -45,7 +45,7 @@ def __init__(self, message, context=None, mitigation=None):
4545
class UnboxDatasetInconsistencyError(UnboxException):
4646
def __init__(self, message, context=None, mitigation=None):
4747
if not context:
48-
context = "There are inconsistencies between the dataset and the data being passed as argument. \n"
48+
context = "There are inconsistencies between the dataset and some of the arguments: \n"
4949
if not mitigation:
5050
mitigation = "Make sure that the value specified in the argument is a column header in the dataframe or csv being uploaded."
5151
super().__init__(context + message + mitigation)

0 commit comments

Comments
 (0)