
Commit 14f3cc6

Merge branch 'feature/feature-store' of github.com:oracle/accelerated-data-science into ODSC-44773/fs_doc_update

2 parents: 9e2ab5b + ca61db6

20 files changed: +370 −421 lines

ads/feature_store/common/spark_session_singleton.py

Lines changed: 29 additions & 4 deletions
@@ -9,7 +9,6 @@

 from ads.common.decorator.runtime_dependency import OptionalDependency
 import os
-
 from ads.common.oci_client import OCIClientFactory

 try:
@@ -32,8 +31,33 @@
     raise


+def get_env_bool(env_var: str, default: bool = False) -> bool:
+    """
+    :param env_var: Environment variable name
+    :param default: Default environment variable value
+    :return: Value of the boolean env variable
+    """
+    env_val = os.getenv(env_var)
+    if env_val is None:
+        env_val = default
+    else:
+        env_val = env_val.lower()
+        if env_val == "true":
+            env_val = True
+        elif env_val == "false":
+            env_val = False
+        else:
+            raise ValueError(
+                "For environment variable: {0} only string values T/true or F/false are allowed but: \
+                {1} was provided.".format(
+                    env_var, env_val
+                )
+            )
+    return env_val
+
+
 def developer_enabled():
-    return os.getenv("DEVELOPER_MODE")
+    return get_env_bool("DEVELOPER_MODE", False)


 class SingletonMeta(type):
@@ -75,8 +99,9 @@ def __init__(self, metastore_id: str = None):
                 "spark.hadoop.oracle.dcat.metastore.id", metastore_id
             ).config(
                 "spark.sql.warehouse.dir", metastore.default_managed_table_location
-            )\
-                .config("spark.driver.memory", "16G")
+            ).config(
+                "spark.driver.memory", "16G"
+            )

         if developer_enabled():
             # Configure spark session with delta jars only in developer mode. In other cases,
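For context, a minimal sketch of how the new get_env_bool helper behaves (the values shown are illustrative, not from the diff):

    import os

    os.environ["DEVELOPER_MODE"] = "True"
    developer_enabled()   # True: the value is lower-cased, so "True" parses as true

    os.environ["DEVELOPER_MODE"] = "yes"
    developer_enabled()   # raises ValueError: only "true"/"false" (any case) parse

    del os.environ["DEVELOPER_MODE"]
    developer_enabled()   # False: an unset variable falls back to the caller's default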

ads/feature_store/common/utils/feature_schema_mapper.py

Lines changed: 0 additions & 1 deletion
@@ -241,7 +241,6 @@ def map_feature_type_to_pandas(feature_type):
     raise TypeError(f"Feature Type {feature_type} is not supported for pandas")


-
 def map_spark_type_to_stats_data_type(spark_type):
     """Maps the spark data types to MLM library data types
     args:

ads/feature_store/common/utils/transformation_query_validator.py

Lines changed: 0 additions & 96 deletions
This file was deleted.

ads/feature_store/common/utils/utility.py

Lines changed: 108 additions & 11 deletions
@@ -1,5 +1,7 @@
 #!/usr/bin/env python
 # -*- coding: utf-8; -*-
+import copy
+import os

 # Copyright (c) 2023 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
@@ -41,6 +43,7 @@
 from ads.feature_engineering.feature_type import datetime

 logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)


 def get_execution_engine_type(
@@ -117,6 +120,92 @@ def validate_delta_format_parameters(
         raise Exception(f"version number cannot be negative")


+def show_ingestion_summary(
+    entity_id: str,
+    entity_type: EntityType = EntityType.FEATURE_GROUP,
+    error_details: str = None,
+):
+    """
+    Displays an ingestion summary table with the given entity type and error details.
+
+    Args:
+        entity_id: str
+        entity_type (EntityType, optional): The type of entity being ingested. Defaults to EntityType.FEATURE_GROUP.
+        error_details (str, optional): Details of any errors that occurred during ingestion. Defaults to None.
+    """
+    from tabulate import tabulate
+
+    table_headers = ["entity_id", "entity_type", "ingestion_status", "error_details"]
+    ingestion_status = "Failed" if error_details else "Succeeded"
+
+    table_values = [
+        entity_id,
+        entity_type.value,
+        ingestion_status,
+        error_details if error_details else "None",
+    ]
+
+    logger.info(
+        "Ingestion Summary \n"
+        + tabulate(
+            [table_values],
+            headers=table_headers,
+            tablefmt="fancy_grid",
+            numalign="center",
+            stralign="center",
+        )
+    )
+
+
+def show_validation_summary(ingestion_status: str, validation_output, expectation_type):
+    from tabulate import tabulate
+
+    statistics = validation_output["statistics"]
+
+    table_headers = (
+        ["expectation_type"] + list(statistics.keys()) + ["ingestion_status"]
+    )
+
+    table_values = [expectation_type] + list(statistics.values()) + [ingestion_status]
+
+    logger.info(
+        "Validation Summary \n"
+        + tabulate(
+            [table_values],
+            headers=table_headers,
+            tablefmt="fancy_grid",
+            numalign="center",
+            stralign="center",
+        )
+    )
+
+    rule_table_headers = ["rule_type", "arguments", "status"]
+
+    rule_table_values = [
+        [
+            rule_output["expectation_config"].get("expectation_type"),
+            {
+                key: value
+                for key, value in rule_output["expectation_config"]["kwargs"].items()
+                if key != "batch_id"
+            },
+            rule_output.get("success"),
+        ]
+        for rule_output in validation_output["results"]
+    ]
+
+    logger.info(
+        "Validations Rules Summary \n"
+        + tabulate(
+            rule_table_values,
+            headers=rule_table_headers,
+            tablefmt="fancy_grid",
+            numalign="center",
+            stralign="center",
+        )
+    )
+
+
 def get_features(
     output_columns: List[dict],
     parent_id: str,
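Aside: a rough illustration of what the new summary helper logs (the entity id below is a made-up placeholder; the table rendering comes from tabulate's fancy_grid format):

    from ads.feature_store.common.utils.utility import show_ingestion_summary

    show_ingestion_summary(entity_id="ocid1...example")
    # logs a one-row table with columns
    # entity_id | entity_type | ingestion_status | error_details,
    # where ingestion_status is "Succeeded" because no error_details were passed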
@@ -154,8 +243,10 @@ def get_features(
     return features


-def get_schema_from_pandas_df(df: pd.DataFrame):
-    spark = SparkSessionSingleton().get_spark_session()
+def get_schema_from_pandas_df(df: pd.DataFrame, feature_store_id: str):
+    spark = SparkSessionSingleton(
+        get_metastore_id(feature_store_id)
+    ).get_spark_session()
     converted_df = spark.createDataFrame(df)
     return get_schema_from_spark_df(converted_df)
@@ -174,27 +265,29 @@ def get_schema_from_spark_df(df: DataFrame):
     return schema_details


-def get_schema_from_df(data_frame: Union[DataFrame, pd.DataFrame]) -> List[dict]:
+def get_schema_from_df(
+    data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str
+) -> List[dict]:
     """
     Given a DataFrame, returns a list of dictionaries that describe its schema.
     If the DataFrame is a pandas DataFrame, it uses pandas methods to get the schema.
     If it's a PySpark DataFrame, it uses PySpark methods to get the schema.
     """
     if isinstance(data_frame, pd.DataFrame):
-        return get_schema_from_pandas_df(data_frame)
+        return get_schema_from_pandas_df(data_frame, feature_store_id)
     else:
         return get_schema_from_spark_df(data_frame)


 def get_input_features_from_df(
-    data_frame: Union[DataFrame, pd.DataFrame]
+    data_frame: Union[DataFrame, pd.DataFrame], feature_store_id: str
 ) -> List[FeatureDetail]:
     """
     Given a DataFrame, returns a list of FeatureDetail objects that represent its input features.
     Each FeatureDetail object contains information about a single input feature, such as its name, data type, and
     whether it's categorical or numerical.
     """
-    schema_details = get_schema_from_df(data_frame)
+    schema_details = get_schema_from_df(data_frame, feature_store_id)
     feature_details = []

     for schema_detail in schema_details:
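A hedged sketch of the updated call pattern (the feature store OCID is a placeholder):

    import pandas as pd
    from ads.feature_store.common.utils.utility import get_input_features_from_df

    df = pd.DataFrame({"user_id": [1, 2], "score": [0.5, 0.9]})
    # feature_store_id is now required so that pandas schema inference can build
    # the Spark session against the feature store's metastore
    features = get_input_features_from_df(df, "ocid1.featurestore.oc1..example")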
@@ -263,7 +356,7 @@ def largest_matching_subset_of_primary_keys(left_feature_group, right_feature_gr


 def convert_pandas_datatype_with_schema(
-        raw_feature_details: List[dict], input_df: pd.DataFrame
+    raw_feature_details: List[dict], input_df: pd.DataFrame
 ) -> pd.DataFrame:
     feature_detail_map = {}
     columns_to_remove = []
@@ -280,21 +373,25 @@
                 .where(pd.notnull(input_df[column]), None)
             )
         else:
-            logger.warning("column" + column + "doesn't exist in the input feature details")
+            logger.warning(
+                "column" + column + "doesn't exist in the input feature details"
+            )
             columns_to_remove.append(column)
-    return input_df.drop(columns = columns_to_remove)
+    return input_df.drop(columns=columns_to_remove)


 def convert_spark_dataframe_with_schema(
-        raw_feature_details: List[dict], input_df: DataFrame
+    raw_feature_details: List[dict], input_df: DataFrame
 ) -> DataFrame:
     feature_detail_map = {}
     columns_to_remove = []
     for feature_details in raw_feature_details:
         feature_detail_map[feature_details.get("name")] = feature_details
     for column in input_df.columns:
         if column not in feature_detail_map.keys():
-            logger.warning("column" + column + "doesn't exist in the input feature details")
+            logger.warning(
+                "column" + column + "doesn't exist in the input feature details"
+            )
             columns_to_remove.append(column)

     return input_df.drop(*columns_to_remove)
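And a small sketch of the drop behavior these reformatted warnings accompany (the feature-detail dict shape is assumed for illustration):

    import pandas as pd

    raw_feature_details = [{"name": "user_id", "featureType": "integer"}]  # shape assumed
    df = pd.DataFrame({"user_id": [1], "stray": ["x"]})
    cleaned = convert_pandas_datatype_with_schema(raw_feature_details, df)
    # "stray" is absent from the feature details, so it is warned about and dropped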

ads/feature_store/data_validation/great_expectation.py

Lines changed: 2 additions & 12 deletions
@@ -115,7 +115,7 @@ def apply_validations(expectation_details, expectation_suite_name, dataframe):
         str
             A string representation of the validation result.
     """
-    validation_output = None
+    expectation_response = None
     if (
         expectation_details
         and expectation_details.get("expectationType")
@@ -126,14 +126,4 @@ def apply_validations(expectation_details, expectation_suite_name, dataframe):
             expectation_details, expectation_suite_name, dataframe
        )

-        validation_output = str(expectation_response)
-
-        if expectation_details["expectationType"] == ExpectationType.STRICT.value:
-            if not expectation_response["success"]:
-                raise Exception(
-                    "Expectation failed with statistics: {0} ... Aborting ingestion.".format(
-                        expectation_response["statistics"]
-                    )
-                )
-
-    return validation_output
+    return expectation_response
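With this change apply_validations returns the raw expectation response instead of its string form, and the STRICT-mode abort no longer happens inside the helper. A hedged sketch of how a caller could reinstate that check (the import path is assumed):

    from ads.feature_store.data_validation.great_expectation import apply_validations

    response = apply_validations(expectation_details, suite_name, dataframe)
    if response is not None and not response["success"]:
        raise Exception(
            "Expectation failed with statistics: {0} ... Aborting ingestion.".format(
                response["statistics"]
            )
        )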
3 binary files changed (330 KB, 289 KB, 227 KB); previews not shown.
