99from ads .opctl .operator .lowcode .common .utils import (
1010 find_output_dirname ,
1111)
12- from . const import ForecastOutputColumns
12+ from ads . opctl . operator . lowcode . common . const import DataColumns
1313from .model .forecast_datasets import ForecastDatasets
1414from .operator_config import ForecastOperatorConfig
15-
15+ from pathlib import Path
16+ import pandas as pd
1617
class ModelEvaluator:
    """Compare candidate forecasting models by backtesting them.

    Args:
        models: Iterable of model names to evaluate; each one is run on
            every backtest window.
        k: Maximum number of backtest windows (cut-off dates) to create.
        subsample_ratio: Fraction of the series that is sampled when the
            backtest datasets are built.
        minimum_sample_count: Lower bound on the number of sampled series,
            applied when ``subsample_ratio`` would select fewer. Defaults
            to 5, the value that used to be hard-coded.
    """

    def __init__(self, models, k=5, subsample_ratio=0.20, minimum_sample_count=5):
        self.models = models
        self.k = k
        self.subsample_ratio = subsample_ratio
        self.minimum_sample_count = minimum_sample_count
24+
def generate_cutoffs(self, unique_dates, horizon):
    """Return the cut-off dates that end each backtest training window.

    The i-th cut-off (i = 0 .. k-1) leaves ``(i + 1) * horizon`` dates
    after it, so its training window holds
    ``len(unique_dates) - (i + 1) * horizon`` dates. A window is only used
    when it contains at least ``3 * horizon`` dates.

    Args:
        unique_dates: Array-like of distinct dates; need not be sorted.
        horizon: Number of periods forecast in each backtest.

    Returns:
        numpy array of cut-off dates, most recent first, with at most
        ``self.k`` entries.
    """
    sorted_dates = np.sort(unique_dates)
    total = len(sorted_dates)
    train_window_sizes = [total - (i + 1) * horizon for i in range(self.k)]
    valid_window_sizes = [ws for ws in train_window_sizes if ws >= horizon * 3]
    if len(valid_window_sizes) < self.k:
        # Report the number of backtests, not the list of window sizes.
        logger.warning(f"Only {len(valid_window_sizes)} backtests can be created")
    # A training window of size ws ends at position ws - 1. Indexing
    # explicitly avoids the exclusive-stop slice that dropped the last
    # cut-off when horizon == 1.
    cut_off_positions = np.asarray([ws - 1 for ws in valid_window_sizes], dtype=int)
    return sorted_dates[cut_off_positions]
2233
2334 def generate_k_fold_data (self , datasets : ForecastDatasets , date_col : str , horizon : int ):
2435 historical_data = datasets .historical_data .data .reset_index ()
25- series_col = ForecastOutputColumns . SERIES
36+ series_col = DataColumns . Series
2637 group_counts = historical_data [series_col ].value_counts ()
2738
28- sample_count = max (5 , int (len (group_counts ) * self .subsample_ratio ))
39+ sample_count = max (self . minimum_sample_count , int (len (group_counts ) * self .subsample_ratio ))
2940 sampled_groups = group_counts .head (sample_count )
3041 sampled_historical_data = historical_data [historical_data [series_col ].isin (sampled_groups .index )]
3142
3243 min_group = group_counts .idxmin ()
3344 min_series_data = historical_data [historical_data [series_col ] == min_group ]
3445 unique_dates = min_series_data [date_col ].unique ()
3546
36- sorted_dates = np .sort (unique_dates )
37- train_window_size = [len (sorted_dates ) - (i + 1 ) * horizon for i in range (self .k )]
38- valid_train_window_size = [ws for ws in train_window_size if ws >= horizon * 3 ]
39- if len (valid_train_window_size ) < self .k :
40- logger .warn (f"Only ${ valid_train_window_size } backtests can be created" )
41-
42- cut_offs = sorted_dates [- horizon - 1 :- horizon * (self .k + 1 ):- horizon ][:len (valid_train_window_size )]
47+ cut_offs = self .generate_cutoffs (unique_dates , horizon )
4348 training_datasets = [sampled_historical_data [sampled_historical_data [date_col ] <= cut_off_date ] for cut_off_date
4449 in cut_offs ]
4550 test_datasets = [sampled_historical_data [sampled_historical_data [date_col ] > cut_offs [0 ]]]
@@ -54,35 +59,55 @@ def remove_none_values(self, obj):
5459 else :
5560 return obj
5661
def create_operator_config(self, operator_config, backtest, model, historical_data, test_data):
    """Build the operator config for one (model, backtest) run.

    Writes ``historical_data`` and ``test_data`` as CSV files under
    ``<output dir>/back_testing/<model>/<backtest>`` and returns a copy of
    ``operator_config`` that points at those files, uses ``model`` and
    writes its own output into the same directory.

    Args:
        operator_config: Base config; serialized with ``to_dict()``.
        backtest: Identifier of the backtest, used as a directory name.
        model: Name of the model to run, also used as a directory name.
        historical_data: Training frame for this backtest (``to_csv`` is
            called on it).
        test_data: Test frame for this backtest (``to_csv`` is called on it).

    Returns:
        ForecastOperatorConfig built from the adjusted spec.
    """
    import os

    output_dir = find_output_dirname(operator_config.spec.output_directory)
    # os.path.join inserts the separator itself, so this no longer depends
    # on output_dir ending with "/".
    output_file_path = os.path.join(str(output_dir), "back_testing", str(model), str(backtest))
    Path(output_file_path).mkdir(parents=True, exist_ok=True)

    historical_data_url = os.path.join(output_file_path, "historical.csv")
    test_data_url = os.path.join(output_file_path, "test.csv")
    historical_data.to_csv(historical_data_url, index=False)
    test_data.to_csv(test_data_url, index=False)

    backtest_op_config_draft = operator_config.to_dict()
    backtest_spec = backtest_op_config_draft["spec"]
    # A section may be missing or None in the serialized spec (e.g. when no
    # test data was configured); create it instead of failing on None["url"].
    for section_name, url in (
        ("historical_data", historical_data_url),
        ("test_data", test_data_url),
        ("output_directory", output_file_path),
    ):
        section = backtest_spec.get(section_name) or {}
        section["url"] = url
        backtest_spec[section_name] = section
    backtest_spec["model"] = model
    backtest_spec["target_category_columns"] = [DataColumns.Series]
    backtest_spec.pop("additional_data", None)  # TODO: create additional data
    cleaned_config = self.remove_none_values(backtest_op_config_draft)

    return ForecastOperatorConfig.from_dict(obj_dict=cleaned_config)
83+
def run_all_models(self, datasets: "ForecastDatasets", operator_config: "ForecastOperatorConfig"):
    """Run every model in ``self.models`` on every backtest window.

    For each (model, backtest) pair a dedicated operator config is created,
    the model's report is generated, and the ``metrics.csv`` it writes is
    read back. The configured metric is averaged across all series columns.

    Args:
        datasets: Full datasets from which the backtest splits are derived.
        operator_config: Base operator config; supplies the datetime column
            name, the horizon and the metric to compare on.

    Returns:
        dict mapping model name -> {backtest index -> averaged metric}.

    Raises:
        KeyError: If the configured metric is not present in a
            ``metrics.csv`` file.
    """
    # Kept as a function-level import as in the original code (presumably to
    # avoid a circular import - TODO confirm), but executed only once.
    from .model.factory import ForecastOperatorModelFactory

    date_col = operator_config.spec.datetime_column.name
    horizon = operator_config.spec.horizon
    cut_offs, train_sets, test_sets = self.generate_k_fold_data(datasets, date_col, horizon)
    # Metric names from the CSV are lower-cased below, so normalize the
    # lookup key the same way.
    metric_name = str(operator_config.spec.metric).lower()

    metrics = {}
    for model in self.models:
        metrics[model] = {}
        for i, (_, backtest_historical_data, backtest_test_data) in enumerate(
            zip(cut_offs, train_sets, test_sets)
        ):
            backtest_operator_config = self.create_operator_config(
                operator_config, i, model, backtest_historical_data, backtest_test_data
            )
            # Separate name: do not overwrite the ``datasets`` argument.
            backtest_datasets = ForecastDatasets(backtest_operator_config)
            ForecastOperatorModelFactory.get_model(
                backtest_operator_config, backtest_datasets
            ).generate_report()

            metrics_df = pd.read_csv(
                f"{backtest_operator_config.spec.output_directory.url}/metrics.csv"
            )
            # One row per metric; every column except 'metrics' is averaged.
            average_across_series = metrics_df.drop("metrics", axis=1).mean(axis=1)
            metrics_average_dict = dict(
                zip(metrics_df["metrics"].str.lower(), average_across_series)
            )
            metrics[model][i] = metrics_average_dict[metric_name]
    return metrics
107+
def find_best_model(self, datasets: "ForecastDatasets", operator_config: "ForecastOperatorConfig"):
    """Return the model with the lowest average backtest metric.

    Args:
        datasets: Passed through to ``run_all_models``.
        operator_config: Passed through to ``run_all_models``.

    Returns:
        The key of the best-scoring model.

    Raises:
        ValueError: If no model produced any backtest result.
    """
    metrics = self.run_all_models(datasets, operator_config)
    # Models without any backtest result cannot be averaged; skip them
    # instead of failing with ZeroDivisionError.
    avg_backtests_metrics = {
        model: sum(scores.values()) / len(scores)
        for model, scores in metrics.items()
        if scores
    }
    if not avg_backtests_metrics:
        raise ValueError("No backtest metrics were produced; cannot select the best model.")
    # NOTE(review): min() treats lower as better; confirm this holds for
    # every metric that can be configured.
    best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
    logger.info(f"Among models {self.models}, {best_model} model shows better performance during backtesting.")
    return best_model
0 commit comments