@@ -28,6 +28,18 @@ def __init__(self, spec: AnomalyOperatorSpec):
         super().__init__(spec=spec, name="test_data")
 
 
+class ValidationData(AbstractData):
+    def __init__(self, spec: AnomalyOperatorSpec):
+        super().__init__(spec=spec, name="validation_data")
+
+    def _ingest_data(self, spec):
+        self.X_valid_dict = dict()
+        self.y_valid_dict = dict()
+        for s_id, df in self.get_dict_by_series().items():
+            self.X_valid_dict[s_id] = df.drop([OutputColumns.ANOMALY_COL], axis=1)
+            self.y_valid_dict[s_id] = df[OutputColumns.ANOMALY_COL]
+
+
 class AnomalyDatasets:
     def __init__(self, spec: AnomalyOperatorSpec):
         """Instantiates the DataIO instance.
@@ -39,63 +51,23 @@ def __init__(self, spec: AnomalyOperatorSpec):
         """
         self._data = AnomalyData(spec)
         self.data = self._data.get_data_long()
-        # self.test_data = None
-        # self.target_columns = None
         self.full_data_dict = self._data.get_dict_by_series()
-        # self._load_data(spec)
-
-    # def _load_data(self, spec):
-    #     """Loads anomaly input data."""
-    #     try:
-    #         self.data = load_data(
-    #             filename=spec.input_data.url,
-    #             format=spec.input_data.format,
-    #             columns=spec.input_data.columns,
-    #         )
-    #     except InvalidParameterError as e:
-    #         e.args = e.args + ("Invalid Parameter: input_data",)
-    #         raise e
-    #     date_col = spec.datetime_column.name
-    #     self.data[date_col] = pd.to_datetime(self.data[date_col])
-    #     try:
-    #         spec.freq = get_frequency_of_datetime(self.data, spec)
-    #     except TypeError as e:
-    #         logger.warn(
-    #             f"Error determining frequency: {e.args}. Setting Frequency to None"
-    #         )
-    #         logger.debug(f"Full traceback: {e}")
-    #         spec.freq = None
-
-    #     if spec.target_category_columns is None:
-    #         if spec.target_column is None:
-    #             target_col = [
-    #                 col
-    #                 for col in self.data.columns
-    #                 if col not in [spec.datetime_column.name]
-    #             ]
-    #             spec.target_column = target_col[0]
-    #         self.full_data_dict = {spec.target_column: self.data}
-    #     else:
-    #         # Merge target category columns
-
-    #         self.data[OutputColumns.Series] = merge_category_columns(
-    #             self.data, spec.target_category_columns
-    #         )
-    #         unique_categories = self.data[OutputColumns.Series].unique()
-    #         self.full_data_dict = dict()
-
-    #         for cat in unique_categories:
-    #             data_by_cat = self.data[self.data[OutputColumns.Series] == cat].drop(
-    #                 spec.target_category_columns + [OutputColumns.Series], axis=1
-    #             )
-    #             self.full_data_dict[cat] = data_by_cat
+        if spec.validation_data is not None:
+            self.valid_data = ValidationData(spec)
+            self.X_valid_dict = self.valid_data.X_valid_dict
+            self.y_valid_dict = self.valid_data.y_valid_dict
 
 
 class AnomalyOutput:
     def __init__(self, date_column):
         self.category_map = dict()
         self.date_column = date_column
 
+    def list_categories(self):
+        categories = list(self.category_map.keys())
+        categories.sort()
+        return categories
+
     def add_output(self, category: str, anomalies: pd.DataFrame, scores: pd.DataFrame):
         self.category_map[category] = (anomalies, scores)
 
@@ -126,7 +98,7 @@ def get_outliers_by_cat(self, category: str, data: pd.DataFrame):
     def get_inliers(self, data):
         inliers = pd.DataFrame()
 
-        for category in self.category_map.keys():
+        for category in self.list_categories():
             inliers = pd.concat(
                 [
                     inliers,
@@ -145,7 +117,7 @@ def get_inliers(self, data):
     def get_outliers(self, data):
         outliers = pd.DataFrame()
 
-        for category in self.category_map.keys():
+        for category in self.list_categories():
             outliers = pd.concat(
                 [
                     outliers,
@@ -163,10 +135,10 @@ def get_outliers(self, data):
 
     def get_scores(self, target_category_columns):
         if target_category_columns is None:
-            return self.get_scores_by_cat(list(self.category_map.keys())[0])
+            return self.get_scores_by_cat(self.list_categories()[0])
 
         scores = pd.DataFrame()
-        for category in self.category_map.keys():
+        for category in self.list_categories():
             score = self.get_scores_by_cat(category)
             score[target_category_columns[0]] = category
             scores = pd.concat([scores, score], axis=0, ignore_index=True)
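
The switch from iterating self.category_map.keys() directly to the sorted list_categories() makes per-series iteration order, and therefore the row order of the concatenated inlier/outlier/score frames, deterministic. Below is a minimal standalone sketch of that pattern in plain pandas; the toy data and the free-standing helper are illustrative assumptions, not the operator's API.

# Sketch: concatenate per-series results in sorted-key order so the combined
# frame is deterministic regardless of insertion order.
import pandas as pd

# Hypothetical per-series score frames, inserted out of order.
category_map = {
    "series_b": pd.DataFrame({"score": [0.2, 0.9]}),
    "series_a": pd.DataFrame({"score": [0.1, 0.4]}),
}

def list_categories(category_map):
    # Mirrors AnomalyOutput.list_categories above: a sorted copy of the keys.
    return sorted(category_map.keys())

# Iteration follows sorted order ("series_a", "series_b"), not insertion order.
scores = pd.concat(
    [category_map[cat].assign(series=cat) for cat in list_categories(category_map)],
    axis=0,
    ignore_index=True,
)
print(scores)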