-
Notifications
You must be signed in to change notification settings - Fork 59
odsc-65115 : Transform columns names to make them compatible across models #1046
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
7f29766
53dc395
7c08cdc
efe3e35
a20b90f
043fe73
178d922
8947e3f
9b457c7
8f21350
fa5f53d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,6 +33,7 @@ def __init__(self, dataset_info, name="historical_data"): | |
| self.dataset_info = dataset_info | ||
| self.target_category_columns = dataset_info.target_category_columns | ||
| self.target_column_name = dataset_info.target_column | ||
| self.raw_column_names = None | ||
| self.dt_column_name = ( | ||
| dataset_info.datetime_column.name if dataset_info.datetime_column else None | ||
| ) | ||
|
|
@@ -59,7 +60,8 @@ def run(self, data): | |
|
|
||
| """ | ||
| clean_df = self._remove_trailing_whitespace(data) | ||
| # clean_df = self._normalize_column_names(clean_df) | ||
| if hasattr(self.dataset_info, 'horizon'): | ||
| clean_df = self._clean_column_names(clean_df) | ||
| if self.name == "historical_data": | ||
| self._check_historical_dataset(clean_df) | ||
| clean_df = self._set_series_id_column(clean_df) | ||
|
|
@@ -97,8 +99,34 @@ def run(self, data): | |
| def _remove_trailing_whitespace(self, df): | ||
| return df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) | ||
|
|
||
| # def _normalize_column_names(self, df): | ||
| # return df.rename(columns=lambda x: re.sub("[^A-Za-z0-9_]+", "", x)) | ||
| def _clean_column_names(self, df): | ||
| """ | ||
| Remove all whitespaces from column names in a DataFrame and store the original names. | ||
|
|
||
| Parameters: | ||
| df (pd.DataFrame): The DataFrame whose column names need to be cleaned. | ||
|
|
||
| Returns: | ||
| pd.DataFrame: The DataFrame with cleaned column names. | ||
| """ | ||
| self.raw_column_names = { | ||
| col: col.replace(" ", "") for col in df.columns if " " in col | ||
| } | ||
|
|
||
| if self.target_column_name: | ||
| self.target_column_name = self.raw_column_names.get( | ||
| self.target_column_name, self.target_column_name | ||
| ) | ||
| self.dt_column_name = self.raw_column_names.get( | ||
| self.dt_column_name, self.dt_column_name | ||
| ) | ||
|
|
||
| if self.target_category_columns: | ||
| self.target_category_columns = [ | ||
| self.raw_column_names.get(col, col) for col in self.target_category_columns | ||
| ] | ||
| df.columns = df.columns.str.replace(" ", "") | ||
|
||
| return df | ||
|
|
||
| def _set_series_id_column(self, df): | ||
| self._target_category_columns_map = {} | ||
|
|
@@ -226,6 +254,10 @@ def _check_historical_dataset(self, df): | |
| expected_names = [self.target_column_name, self.dt_column_name] + ( | ||
| self.target_category_columns if self.target_category_columns else [] | ||
| ) | ||
|
|
||
| if self.raw_column_names: | ||
| expected_names.extend(list(self.raw_column_names.values())) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are these guaranteed to be in historical data?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, as we are making this transformation to the historical data columns, expected columns has to be extended using the same post transformed columns |
||
|
|
||
| if set(df.columns) != set(expected_names): | ||
| raise DataMismatchError( | ||
| f"Expected {self.name} to have columns: {expected_names}, but instead found column names: {df.columns}. Is the {self.name} path correct?" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this a proxy for "if it's forecasting"?
If so it may be better to use kind/type for future compatibility
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the transformations we use OperatorSpec, incase of the forecasting it is ForecastOperatorSpec, type is available in the OperatorConfig which is not required here and adding type to spec will make it redundant, so I have doing that. Updated this to us operatorspec instance type check for better readability .