55
66
77import numpy as np
8+ import pandas as pd
9+ from pathlib import Path
10+
811from ads .opctl import logger
9- from ads .opctl .operator .lowcode .common .utils import (
10- find_output_dirname ,
11- )
1212from ads .opctl .operator .lowcode .common .const import DataColumns
1313from .model .forecast_datasets import ForecastDatasets
1414from .operator_config import ForecastOperatorConfig
15- from pathlib import Path
16- import pandas as pd
15+
1716
1817class ModelEvaluator :
1918 def __init__ (self , models , k = 5 , subsample_ratio = 0.20 ):
@@ -31,7 +30,9 @@ def generate_cutoffs(self, unique_dates, horizon):
3130 cut_offs = sorted_dates [- horizon - 1 :- horizon * (self .k + 1 ):- horizon ][:len (valid_train_window_size )]
3231 return cut_offs
3332
34- def generate_k_fold_data (self , datasets : ForecastDatasets , date_col : str , horizon : int ):
33+ def generate_k_fold_data (self , datasets : ForecastDatasets , operator_config : ForecastOperatorConfig ):
34+ date_col = operator_config .spec .datetime_column .name
35+ horizon = operator_config .spec .horizon
3536 historical_data = datasets .historical_data .data .reset_index ()
3637 series_col = DataColumns .Series
3738 group_counts = historical_data [series_col ].value_counts ()
@@ -51,63 +52,80 @@ def generate_k_fold_data(self, datasets: ForecastDatasets, date_col: str, horizo
5152 for i , current in enumerate (cut_offs [1 :]):
5253 test_datasets .append (sampled_historical_data [(current < sampled_historical_data [date_col ]) & (
5354 sampled_historical_data [date_col ] <= cut_offs [i ])])
54- return cut_offs , training_datasets , test_datasets
55+ all_additional = datasets .additional_data .data .reset_index ()
56+ sampled_additional_data = all_additional [all_additional [series_col ].isin (sampled_groups .index )]
57+ max_historical_date = sampled_historical_data [date_col ].max ()
58+ additional_data = [sampled_additional_data [sampled_additional_data [date_col ] <= max_historical_date ]]
59+ for cut_off in cut_offs [:- 1 ]:
60+ trimmed_additional_data = sampled_additional_data [sampled_additional_data [date_col ] <= cut_off ]
61+ additional_data .append (trimmed_additional_data )
62+ return cut_offs , training_datasets , additional_data , test_datasets
5563
def remove_none_values(self, obj):
    """Return ``obj`` with ``None`` keys and ``None`` values pruned from dicts.

    Dictionaries are rebuilt recursively: an entry is dropped when its key
    or its value is ``None``, and every surviving value is cleaned in turn.
    Anything that is not a ``dict`` (lists included) is returned unchanged,
    so dictionaries nested inside a list are not visited.
    """
    if not isinstance(obj, dict):
        return obj

    pruned = {}
    for key, value in obj.items():
        if key is None or value is None:
            continue
        pruned[key] = self.remove_none_values(value)
    return pruned
6169
def create_operator_config(self, operator_config, backtest, model, historical_data, additional_data, test_data):
    """Build the operator config used to run one backtest of one model.

    The fold's data frames are written as CSV files under
    ``<output_directory.url>/back_testing/<model>/<backtest>`` and the dict
    form of ``operator_config`` is re-pointed at those files before being
    turned back into a ``ForecastOperatorConfig``.

    Parameters
    ----------
    operator_config : ForecastOperatorConfig
        Base configuration; read through ``spec.output_directory.url`` and
        ``to_dict()``.
    backtest :
        Identifier of the backtest (the caller passes the fold index); used
        as the last path component of the output directory.
    model :
        Model name; stored in ``spec["model"]`` and used in the output path.
    historical_data, test_data :
        Objects exposing ``to_csv`` (presumably pandas DataFrames) holding
        the training and test rows of this fold.
    additional_data :
        Object exposing ``to_csv`` with the additional data of this fold,
        or ``None`` when there is none; in that case no file is written and
        the ``additional_data`` section is removed from the config.

    Returns
    -------
    ForecastOperatorConfig
        Config built by ``ForecastOperatorConfig.from_dict`` from the
        adjusted dict after ``None`` entries were pruned.
    """
    output_dir = operator_config.spec.output_directory.url
    output_file_path = f'{output_dir}/back_testing/{model}/{backtest}'
    Path(output_file_path).mkdir(parents=True, exist_ok=True)

    historical_data_url = f'{output_file_path}/historical.csv'
    additional_data_url = f'{output_file_path}/additional.csv'
    test_data_url = f'{output_file_path}/test.csv'

    historical_data.to_csv(historical_data_url, index=False)
    if additional_data is not None:
        additional_data.to_csv(additional_data_url, index=False)
    test_data.to_csv(test_data_url, index=False)

    backtest_op_config_draft = operator_config.to_dict()
    backtest_spec = backtest_op_config_draft["spec"]
    backtest_spec["historical_data"]["url"] = historical_data_url

    if additional_data is None:
        # Nothing to point at: drop the section so the backtest does not
        # pick up the un-trimmed additional data of the parent run.
        backtest_spec.pop("additional_data", None)
    else:
        # The section may be None/absent in the dict form of the config;
        # create it instead of failing on ``None["url"]``.
        if not isinstance(backtest_spec.get("additional_data"), dict):
            backtest_spec["additional_data"] = {}
        backtest_spec["additional_data"]["url"] = additional_data_url

    # test_data is optional in the parent config, so the section may be
    # None/absent here as well.
    if not isinstance(backtest_spec.get("test_data"), dict):
        backtest_spec["test_data"] = {}
    backtest_spec["test_data"]["url"] = test_data_url

    backtest_spec["model"] = model
    backtest_spec["output_directory"] = {"url": output_file_path}
    backtest_spec["target_category_columns"] = [DataColumns.Series]
    # Explanations are switched off for backtest runs.
    backtest_spec['generate_explanations'] = False

    cleaned_config = self.remove_none_values(backtest_op_config_draft)
    return ForecastOperatorConfig.from_dict(obj_dict=cleaned_config)
8394
def run_all_models(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
    """Backtest every model in ``self.models`` on every generated fold.

    Folds come from ``self.generate_k_fold_data``. For each model and fold
    a dedicated operator config is created, the model's report is
    generated, and the test-metrics CSV written to that run's output
    directory is read back. The per-metric values are averaged across all
    columns other than ``metrics`` and the value for the configured metric
    is kept.

    Parameters
    ----------
    datasets : ForecastDatasets
        Datasets of the parent run, used only to generate the folds.
    operator_config : ForecastOperatorConfig
        Parent configuration; ``spec.metric`` selects the reported metric.

    Returns
    -------
    dict
        ``{model: {fold_index: metric_value}}``.

    Raises
    ------
    KeyError
        If the configured metric is not present in a metrics file.
    """
    # Kept as a function-level import, as in the original code (presumably
    # to avoid a circular import -- confirm); hoisted out of the model loop.
    from .model.factory import ForecastOperatorModelFactory

    cut_offs, train_sets, additional_data, test_sets = self.generate_k_fold_data(datasets, operator_config)
    # Keys of the metrics mapping are lower-cased below, so normalise the
    # requested metric the same way.
    target_metric = str(operator_config.spec.metric).lower()

    metrics = {}
    for model in self.models:
        metrics[model] = {}
        folds = zip(cut_offs, train_sets, additional_data, test_sets)
        for i, (_, backtest_historical_data, backtest_additional_data, backtest_test_data) in enumerate(folds):
            backtest_operator_config = self.create_operator_config(operator_config, i, model,
                                                                   backtest_historical_data,
                                                                   backtest_additional_data,
                                                                   backtest_test_data)
            # Use a separate name so the ``datasets`` argument is not
            # overwritten by the per-fold datasets.
            backtest_datasets = ForecastDatasets(backtest_operator_config)
            ForecastOperatorModelFactory.get_model(
                backtest_operator_config, backtest_datasets
            ).generate_report()

            test_metrics_filename = backtest_operator_config.spec.test_metrics_filename
            metrics_df = pd.read_csv(
                f"{backtest_operator_config.spec.output_directory.url}/{test_metrics_filename}")
            metrics_df["average_across_series"] = metrics_df.drop('metrics', axis=1).mean(axis=1)
            metrics_average_dict = dict(zip(metrics_df['metrics'].str.lower(),
                                            metrics_df['average_across_series']))
            metrics[model][i] = metrics_average_dict[target_metric]
    return metrics
107120
def find_best_model(self, datasets: ForecastDatasets, operator_config: ForecastOperatorConfig):
    """Pick the model with the lowest average backtest metric.

    Runs ``self.run_all_models``, averages each model's metric over its
    backtests, selects the model with the smallest average, and writes the
    per-backtest values to ``backtest_stats.csv`` in the parent run's
    output directory.

    Parameters
    ----------
    datasets : ForecastDatasets
        Datasets of the parent run.
    operator_config : ForecastOperatorConfig
        Parent configuration; ``spec.output_directory.url`` is where the
        statistics file is written.

    Returns
    -------
    The key of ``self.models`` whose average metric is smallest.

    Raises
    ------
    ValueError
        If no model produced any backtest metric.
    """
    metrics = self.run_all_models(datasets, operator_config)

    # Models without any backtest result are left out of the comparison
    # instead of triggering a division by zero.
    avg_backtests_metrics = {
        model: sum(scores.values()) / len(scores)
        for model, scores in metrics.items()
        if scores
    }
    if not avg_backtests_metrics:
        raise ValueError("No backtest metrics were produced; unable to select the best model.")

    # NOTE(review): ``min`` treats a lower value as better for every
    # metric -- confirm this holds for all supported metrics.
    best_model = min(avg_backtests_metrics, key=avg_backtests_metrics.get)
    logger.info(f"Among models {self.models}, {best_model} model shows better performance during backtesting.")

    backtest_stats = pd.DataFrame(metrics).rename_axis('backtest').reset_index()
    output_dir = operator_config.spec.output_directory.url
    backtest_report_name = "backtest_stats.csv"
    backtest_stats.to_csv(f"{output_dir}/{backtest_report_name}", index=False)
    return best_model
0 commit comments