|
11 | 11 | from ads.common.decorator.runtime_dependency import OptionalDependency |
12 | 12 | from ads.feature_store.common.utils.feature_schema_mapper import ( |
13 | 13 | map_spark_type_to_feature_type, |
14 | | - map_pandas_type_to_feature_type, |
| 14 | + map_feature_type_to_pandas, |
15 | 15 | ) |
16 | 16 | from ads.feature_store.feature import Feature, DatasetFeature |
17 | 17 | from ads.feature_store.feature_group_expectation import Rule, Expectation |
18 | 18 | from ads.feature_store.input_feature_detail import FeatureDetail |
| 19 | +from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton |
19 | 20 |
|
20 | 21 | try: |
21 | 22 | from pyspark.pandas import DataFrame |
@@ -154,18 +155,9 @@ def get_features( |
154 | 155 |
|
155 | 156 |
|
156 | 157 | def get_schema_from_pandas_df(df: pd.DataFrame): |
157 | | - schema_details = [] |
158 | | - |
159 | | - for order_number, field in enumerate(df.columns, start=1): |
160 | | - details = { |
161 | | - "name": field, |
162 | | - "feature_type": map_pandas_type_to_feature_type(field, df[field]), |
163 | | - "order_number": order_number, |
164 | | - } |
165 | | - |
166 | | - schema_details.append(details) |
167 | | - |
168 | | - return schema_details |
| 158 | + spark = SparkSessionSingleton().get_spark_session() |
| 159 | + converted_df = spark.createDataFrame(df) |
| 160 | + return get_schema_from_spark_df(converted_df) |
169 | 161 |
|
170 | 162 |
|
171 | 163 | def get_schema_from_spark_df(df: DataFrame): |
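
Note on this hunk: schema inference for pandas input no longer maps dtypes column by column. get_schema_from_pandas_df now converts the frame through the shared Spark session and delegates to get_schema_from_spark_df, so pandas and Spark inputs share one code path. A minimal usage sketch follows; the sample DataFrame and the commented output shape are illustrative assumptions (the shape mirrors what the removed pandas-specific version produced), not part of this commit:

    import pandas as pd

    # Any pandas DataFrame whose dtypes Spark can infer works here.
    df = pd.DataFrame({"user_id": [1, 2], "score": [0.5, 0.9]})

    # Internally: SparkSessionSingleton().get_spark_session().createDataFrame(df),
    # then get_schema_from_spark_df on the converted frame.
    schema_details = get_schema_from_pandas_df(df)

    # Assumed result shape, by analogy with the removed implementation:
    # [{"name": "user_id", "feature_type": ..., "order_number": 1}, ...]
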
@@ -268,3 +260,45 @@ def largest_matching_subset_of_primary_keys(left_feature_group, right_feature_gr |
268 | 260 | common_keys = left_primary_keys.intersection(right_primary_keys) |
269 | 261 |
|
270 | 262 | return common_keys |
| 263 | + |
| 264 | + |
| 265 | +def convert_pandas_datatype_with_schema( |
| 266 | + raw_feature_details: List[dict], input_df: pd.DataFrame |
| 267 | +): |
| 268 | + feature_detail_map = {} |
| 269 | + columns_to_remove = [] |
| 270 | + for feature_details in raw_feature_details: |
| 271 | + feature_detail_map[feature_details.get("name")] = feature_details |
| 272 | + for column in input_df.columns: |
| 273 | + if column in feature_detail_map:
| 274 | + feature_details = feature_detail_map[column] |
| 275 | + feature_type = feature_details.get("featureType") |
| 276 | + pandas_type = map_feature_type_to_pandas(feature_type) |
| 277 | + input_df[column] = ( |
| 278 | + input_df[column] |
| 279 | + .astype(pandas_type) |
| 280 | + .where(pd.notnull(input_df[column]), None) |
| 281 | + ) |
| 282 | + else: |
| 283 | + logger.warning(f"Column {column} does not exist in the input feature details")
| 284 | + columns_to_remove.append(column) |
| 285 | + return input_df.drop(columns=columns_to_remove)
| 286 | + |
| 287 | + |
| 288 | +def validate_spark_dataframe_schema(raw_feature_details: List[dict], input_df: DataFrame): |
| 289 | + feature_detail_map = {} |
| 290 | + columns_to_remove = [] |
| 291 | + for feature_details in raw_feature_details: |
| 292 | + feature_detail_map[feature_details.get("name")] = feature_details |
| 293 | + for column in input_df.columns: |
| 294 | + if column not in feature_detail_map:
| 295 | + logger.warning(f"Column {column} does not exist in the input feature details")
| 296 | + columns_to_remove.append(column) |
| 297 | + |
| 298 | + return input_df.drop(*columns_to_remove) |
| 299 | + |
| 300 | + |
| 301 | +def validate_input_feature_details(input_feature_details, data_frame): |
| 302 | + if isinstance(data_frame, pd.DataFrame): |
| 303 | + return convert_pandas_datatype_with_schema(input_feature_details, data_frame) |
| 304 | + return validate_spark_dataframe_schema(input_feature_details, data_frame) |
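
Note on the new helpers: validate_input_feature_details dispatches on the input type. The pandas path (convert_pandas_datatype_with_schema) casts each known column to the dtype returned by map_feature_type_to_pandas while preserving nulls via .where(pd.notnull(...), None); the Spark path (validate_spark_dataframe_schema) only prunes. Both drop columns missing from the feature details after logging a warning. A hedged sketch of the call pattern; the "featureType" strings and the payload fields below are guesses at the expected format, not taken from this commit:

    import pandas as pd

    # Assumed feature-detail payload; "featureType" values must be ones that
    # map_feature_type_to_pandas understands.
    raw_feature_details = [
        {"name": "user_id", "featureType": "INTEGER"},
        {"name": "score", "featureType": "FLOAT"},
    ]

    pdf = pd.DataFrame({"user_id": [1, 2], "score": [0.5, 0.9], "extra": ["a", "b"]})

    # pandas input: user_id and score are cast to their mapped dtypes; "extra"
    # is dropped after a warning. A Spark DataFrame would instead go through
    # validate_spark_dataframe_schema and only have "extra" pruned.
    validated = validate_input_feature_details(raw_feature_details, pdf)
    print(list(validated.columns))  # ['user_id', 'score']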