oracle
diff --git a/ads/common/oci_mixin.py b/ads/common/oci_mixin.py
Lines changed: 1 addition & 1 deletion
diff --git a/ads/feature_store/common/enums.py b/ads/feature_store/common/enums.py
Lines changed: 26 additions & 3 deletions
diff --git a/ads/feature_store/common/utils/feature_schema_mapper.py b/ads/feature_store/common/utils/feature_schema_mapper.py
Lines changed: 171 additions & 41 deletions
diff --git a/ads/feature_store/common/utils/utility.py b/ads/feature_store/common/utils/utility.py
Lines changed: 2 additions & 4 deletions
diff --git a/ads/feature_store/execution_strategy/engine/spark_engine.py b/ads/feature_store/execution_strategy/engine/spark_engine.py
Lines changed: 1 addition & 1 deletion
diff --git a/ads/feature_store/execution_strategy/spark/spark_execution.py b/ads/feature_store/execution_strategy/spark/spark_execution.py
Lines changed: 5 additions & 12 deletions
@@ -726,7 +726,7 @@ def update_from_oci_model(
         for attr in self.swagger_types.keys():
             if (
                 hasattr(oci_model_instance, attr)
-                and getattr(oci_model_instance, attr)
+                and getattr(oci_model_instance, attr) is not None
                 and (
                     not hasattr(self, attr)
                     or not getattr(self, attr)
 
@@ -261,17 +261,40 @@ class FeatureType(Enum):
     """
 
     STRING = "STRING"
+    SHORT = "SHORT"
     INTEGER = "INTEGER"
+    LONG = "LONG"
     FLOAT = "FLOAT"
     DOUBLE = "DOUBLE"
     BOOLEAN = "BOOLEAN"
     DATE = "DATE"
     TIMESTAMP = "TIMESTAMP"
     DECIMAL = "DECIMAL"
     BINARY = "BINARY"
-    ARRAY = "ARRAY"
-    MAP = "MAP"
-    STRUCT = "STRUCT"
+    BYTE = "BYTE"
+    STRING_ARRAY = "STRING_ARRAY"
+    INTEGER_ARRAY = "INTEGER_ARRAY"
+    SHORT_ARRAY = "SHORT_ARRAY"
+    LONG_ARRAY = "LONG_ARRAY"
+    FLOAT_ARRAY = "FLOAT_ARRAY"
+    DOUBLE_ARRAY = "DOUBLE_ARRAY"
+    BINARY_ARRAY = "BINARY_ARRAY"
+    DATE_ARRAY = "DATE_ARRAY"
+    TIMESTAMP_ARRAY = "TIMESTAMP_ARRAY"
+    BYTE_ARRAY = "BYTE_ARRAY"
+    BOOLEAN_ARRAY = "BOOLEAN_ARRAY"
+    STRING_STRING_MAP = "STRING_STRING_MAP"
+    STRING_INTEGER_MAP = "STRING_INTEGER_MAP"
+    STRING_SHORT_MAP = "STRING_SHORT_MAP"
+    STRING_LONG_MAP = "STRING_LONG_MAP"
+    STRING_FLOAT_MAP = "STRING_FLOAT_MAP"
+    STRING_DOUBLE_MAP = "STRING_DOUBLE_MAP"
+    STRING_TIMESTAMP_MAP = "STRING_TIMESTAMP_MAP"
+    STRING_DATE_MAP = "STRING_DATE_MAP"
+    STRING_BYTE_MAP = "STRING_BYTE_MAP"
+    STRING_BINARY_MAP = "STRING_BINARY_MAP"
+    STRING_BOOLEAN_MAP = "STRING_BOOLEAN_MAP"
+    UNKNOWN = "UNKNOWN"
 
 
 class EntityType(Enum):
 
@@ -6,7 +6,11 @@
 
 from typing import List
 
+import numpy as np
+import pandas as pd
+
 from ads.common.decorator.runtime_dependency import OptionalDependency
+from ads.feature_store.common.enums import FeatureType
 
 try:
     from pyspark.sql.types import *
@@ -25,65 +29,153 @@ def map_spark_type_to_feature_type(spark_type):
     :return:
     """
     spark_type_to_feature_type = {
-        StringType(): "string",
-        IntegerType(): "integer",
-        FloatType(): "float",
-        DoubleType(): "double",
-        BooleanType(): "boolean",
-        DateType(): "date",
-        TimestampType(): "timestamp",
-        DecimalType(): "decimal",
-        BinaryType(): "binary",
-        ArrayType(StringType()): "array",
-        MapType(StringType(), StringType()): "map",
-        StructType(): "struct",
-        ByteType(): "byte",
-        ShortType(): "short",
-        LongType(): "long",
+        StringType(): FeatureType.STRING,
+        IntegerType(): FeatureType.INTEGER,
+        ShortType(): FeatureType.SHORT,
+        LongType(): FeatureType.LONG,
+        FloatType(): FeatureType.FLOAT,
+        DoubleType(): FeatureType.DOUBLE,
+        BooleanType(): FeatureType.BOOLEAN,
+        DateType(): FeatureType.DATE,
+        TimestampType(): FeatureType.TIMESTAMP,
+        BinaryType(): FeatureType.BINARY,
+        ByteType(): FeatureType.BYTE,
+        ArrayType(StringType()): FeatureType.STRING_ARRAY,
+        ArrayType(IntegerType()): FeatureType.INTEGER_ARRAY,
+        ArrayType(LongType()): FeatureType.LONG_ARRAY,
+        ArrayType(FloatType()): FeatureType.FLOAT_ARRAY,
+        ArrayType(DoubleType()): FeatureType.DOUBLE_ARRAY,
+        ArrayType(BinaryType()): FeatureType.BINARY_ARRAY,
+        ArrayType(DateType()): FeatureType.DATE_ARRAY,
+        ArrayType(TimestampType()): FeatureType.TIMESTAMP_ARRAY,
+        ArrayType(ByteType()): FeatureType.BYTE_ARRAY,
+        ArrayType(BooleanType()): FeatureType.BOOLEAN_ARRAY,
+        ArrayType(ShortType()): FeatureType.SHORT_ARRAY,
+        MapType(StringType(), StringType()): FeatureType.STRING_STRING_MAP,
+        MapType(StringType(), IntegerType()): FeatureType.STRING_INTEGER_MAP,
+        MapType(StringType(), ShortType()): FeatureType.STRING_SHORT_MAP,
+        MapType(StringType(), LongType()): FeatureType.STRING_LONG_MAP,
+        MapType(StringType(), FloatType()): FeatureType.STRING_FLOAT_MAP,
+        MapType(StringType(), DoubleType()): FeatureType.STRING_DOUBLE_MAP,
+        MapType(StringType(), TimestampType()): FeatureType.STRING_TIMESTAMP_MAP,
+        MapType(StringType(), DateType()): FeatureType.STRING_DATE_MAP,
+        MapType(StringType(), BinaryType()): FeatureType.STRING_BINARY_MAP,
+        MapType(StringType(), ByteType()): FeatureType.STRING_BYTE_MAP,
+        MapType(StringType(), BooleanType()): FeatureType.STRING_BOOLEAN_MAP,
     }
-
-    return spark_type_to_feature_type.get(spark_type).upper()
+    if spark_type in spark_type_to_feature_type:
+        return spark_type_to_feature_type.get(spark_type)
+    else:
+        return FeatureType.UNKNOWN
+
+
+def map_pandas_type_to_feature_type(feature_name, values):
+    """Infer the feature type of one pandas column.
+
+    A column whose dtype string is ``"object"`` is inspected value by value:
+    the Python type name of each value is mapped through
+    ``map_pandas_basic_type_to_feature_type`` and every recognised value must
+    map to the same feature type. Values whose type maps to
+    ``FeatureType.UNKNOWN`` are skipped rather than rejected. Any other dtype
+    is mapped directly from its string name.
+
+    :param feature_name: column name; only used in error messages
+    :param values: the column; must expose ``dtype`` and be iterable
+    :return: the inferred ``FeatureType`` member
+    :raises TypeError: if a value is a ``list`` or ``numpy.ndarray``, if the
+        column mixes recognised types, or if no supported type is found
+    """
+    pandas_type = str(values.dtype)
+    inferred_dtype = FeatureType.UNKNOWN
+    # Compare by value, not identity: ``str(dtype)`` builds a new string
+    # object, so ``is "object"`` is not guaranteed to be true and would send
+    # object columns down the "unsupported dtype" path below.
+    if pandas_type == "object":
+        for row in values:
+            if isinstance(row, (list, np.ndarray)):
+                raise TypeError(f"object of type {type(row)} not supported")
+            current_dtype = map_pandas_basic_type_to_feature_type(
+                type(row).__name__
+            )
+            if inferred_dtype is FeatureType.UNKNOWN:
+                # Nothing recognised so far: adopt whatever this value maps to.
+                inferred_dtype = current_dtype
+            elif (
+                current_dtype != inferred_dtype
+                and current_dtype is not FeatureType.UNKNOWN
+            ):
+                raise TypeError(
+                    f"Input feature '{feature_name}' has mixed types, {current_dtype} and {inferred_dtype}. "
+                    f"That is not allowed. "
+                )
+    else:
+        inferred_dtype = map_pandas_basic_type_to_feature_type(pandas_type)
+    if inferred_dtype is FeatureType.UNKNOWN:
+        raise TypeError(
+            f"Input feature '{feature_name}' has type {str(pandas_type)} which is not supported"
+        )
+    return inferred_dtype
 
 
-def map_pandas_type_to_feature_type(pandas_type):
+def map_pandas_basic_type_to_feature_type(pandas_type):
     """Returns the feature type corresponding to pandas_type
     :param pandas_type:
     :return:
     """
+    # TODO uint64 with bigger number cant be mapped to LongType
     pandas_type_to_feature_type = {
-        "object": "string",
-        "int64": "integer",
-        "float64": "float",
-        "bool": "boolean",
+        "str": FeatureType.STRING,
+        "string": FeatureType.STRING,
+        "int": FeatureType.INTEGER,
+        "int8": FeatureType.INTEGER,
+        "int16": FeatureType.INTEGER,
+        "int32": FeatureType.LONG,
+        "int64": FeatureType.LONG,
+        "uint8": FeatureType.INTEGER,
+        "uint16": FeatureType.INTEGER,
+        "uint32": FeatureType.LONG,
+        "uint64": FeatureType.LONG,
+        "float": FeatureType.FLOAT,
+        "float16": FeatureType.FLOAT,
+        "float32": FeatureType.DOUBLE,
+        "float64": FeatureType.DOUBLE,
+        "datetime64[ns]": FeatureType.TIMESTAMP,
+        "datetime64[ns, UTC]": FeatureType.TIMESTAMP,
+        "timedelta64[ns]": FeatureType.LONG,
+        "bool": FeatureType.BOOLEAN,
+        "Decimal": FeatureType.DECIMAL,
+        "date": FeatureType.DATE,
     }
-
-    return pandas_type_to_feature_type.get(pandas_type).upper()
+    if pandas_type in pandas_type_to_feature_type:
+        return pandas_type_to_feature_type.get(pandas_type)
+    return FeatureType.UNKNOWN
 
 
 def map_feature_type_to_spark_type(feature_type):
     """Returns the Spark Type for a particular feature type.
     :param feature_type:
     :return: Spark Type
     """
+    # ``feature_type`` is normalised through the FeatureType constructor, so a
+    # FeatureType member or its string value is accepted; any other value
+    # makes the Enum constructor raise ValueError.
+    feature_type_in = FeatureType(feature_type)
     spark_types = {
-        "string": StringType(),
-        "integer": IntegerType(),
-        "float": FloatType(),
-        "double": DoubleType(),
-        "boolean": BooleanType(),
-        "date": DateType(),
-        "timestamp": TimestampType(),
-        "decimal": DecimalType(),
-        "binary": BinaryType(),
-        "array": ArrayType(StringType()),
-        "map": MapType(StringType(), StringType()),
-        "struct": StructType(),
-        "byte": ByteType(),
-        "short": ShortType(),
-        "long": LongType(),
+        FeatureType.STRING: StringType(),
+        FeatureType.SHORT: ShortType(),
+        FeatureType.INTEGER: IntegerType(),
+        FeatureType.LONG: LongType(),
+        FeatureType.FLOAT: FloatType(),
+        FeatureType.DOUBLE: DoubleType(),
+        FeatureType.BOOLEAN: BooleanType(),
+        FeatureType.DATE: DateType(),
+        FeatureType.TIMESTAMP: TimestampType(),
+        FeatureType.DECIMAL: DecimalType(),
+        FeatureType.BINARY: BinaryType(),
+        FeatureType.STRING_ARRAY: ArrayType(StringType()),
+        FeatureType.INTEGER_ARRAY: ArrayType(IntegerType()),
+        FeatureType.SHORT_ARRAY: ArrayType(ShortType()),
+        FeatureType.LONG_ARRAY: ArrayType(LongType()),
+        FeatureType.FLOAT_ARRAY: ArrayType(FloatType()),
+        FeatureType.DOUBLE_ARRAY: ArrayType(DoubleType()),
+        FeatureType.BINARY_ARRAY: ArrayType(BinaryType()),
+        FeatureType.DATE_ARRAY: ArrayType(DateType()),
+        FeatureType.BOOLEAN_ARRAY: ArrayType(BooleanType()),
+        FeatureType.TIMESTAMP_ARRAY: ArrayType(TimestampType()),
+        # Kept in step with map_spark_type_to_feature_type, which emits
+        # BYTE_ARRAY for ArrayType(ByteType()).
+        FeatureType.BYTE_ARRAY: ArrayType(ByteType()),
+        FeatureType.STRING_STRING_MAP: MapType(StringType(), StringType()),
+        FeatureType.STRING_INTEGER_MAP: MapType(StringType(), IntegerType()),
+        FeatureType.STRING_SHORT_MAP: MapType(StringType(), ShortType()),
+        FeatureType.STRING_LONG_MAP: MapType(StringType(), LongType()),
+        FeatureType.STRING_FLOAT_MAP: MapType(StringType(), FloatType()),
+        FeatureType.STRING_DOUBLE_MAP: MapType(StringType(), DoubleType()),
+        FeatureType.STRING_DATE_MAP: MapType(StringType(), DateType()),
+        FeatureType.STRING_TIMESTAMP_MAP: MapType(StringType(), TimestampType()),
+        FeatureType.STRING_BOOLEAN_MAP: MapType(StringType(), BooleanType()),
+        # Likewise emitted by map_spark_type_to_feature_type for byte/binary
+        # valued maps, so they must be convertible back to a Spark type.
+        FeatureType.STRING_BYTE_MAP: MapType(StringType(), ByteType()),
+        FeatureType.STRING_BINARY_MAP: MapType(StringType(), BinaryType()),
+        FeatureType.BYTE: ByteType(),
     }
-
-    return spark_types.get(feature_type.lower(), None)
+    # NOTE(review): the fallback is the plain string "UNKNOWN", not a Spark
+    # type (the previous implementation returned None) - confirm callers such
+    # as get_raw_data_source_schema handle it.
+    if feature_type_in in spark_types:
+        return spark_types.get(feature_type_in)
+    else:
+        return "UNKNOWN"
 
 
 def get_raw_data_source_schema(raw_feature_details: List[dict]):
@@ -94,6 +186,7 @@ def get_raw_data_source_schema(raw_feature_details: List[dict]):
 
     Returns:
       StructType: Spark schema.
+      :param raw_feature_details:
     """
     # Initialize the schema
     features_schema = StructType()
@@ -113,3 +206,40 @@ def get_raw_data_source_schema(raw_feature_details: List[dict]):
         features_schema.add(feature_name, feature_type, is_nullable)
 
     return features_schema
+
+
+def map_feature_type_to_pandas(feature_type):
+    """Return the pandas dtype used to store values of ``feature_type``.
+
+    :param feature_type: a ``FeatureType`` member or its string value
+    :return: ``str`` for STRING, otherwise a pandas dtype name such as
+        ``"int64"`` or ``"object"``
+    :raises TypeError: if the feature type has no pandas equivalent here
+    """
+    pandas_dtype_by_feature_type = {
+        FeatureType.STRING: str,
+        FeatureType.LONG: "int64",
+        FeatureType.DOUBLE: "float64",
+        FeatureType.TIMESTAMP: "datetime64[ns]",
+        FeatureType.BOOLEAN: "bool",
+        FeatureType.FLOAT: "float32",
+        FeatureType.INTEGER: "int32",
+        FeatureType.DECIMAL: "object",
+        FeatureType.DATE: "object",
+    }
+    resolved_type = FeatureType(feature_type)
+    # Guard clause: reject anything outside the supported table up front.
+    if resolved_type not in pandas_dtype_by_feature_type:
+        raise TypeError(f"Feature Type {feature_type} is not supported for pandas")
+    return pandas_dtype_by_feature_type[resolved_type]
+
+
+def convert_pandas_datatype_with_schema(
+    raw_feature_details: List[dict], input_df: pd.DataFrame
+):
+    """Cast ``input_df`` columns in place to their declared feature types.
+
+    Each entry of ``raw_feature_details`` is read for its ``"name"`` and
+    ``"featureType"`` keys. Columns of ``input_df`` without a matching entry
+    are left untouched. Nothing is returned; the frame itself is modified.
+
+    :param raw_feature_details: declared features, one dict per feature
+    :param input_df: frame whose matching columns are re-typed
+    """
+    details_by_name = {
+        detail.get("name"): detail for detail in raw_feature_details
+    }
+    for column in input_df.columns:
+        if column not in details_by_name:
+            continue
+        declared_type = details_by_name[column].get("featureType")
+        target_dtype = map_feature_type_to_pandas(declared_type)
+        casted = input_df[column].astype(target_dtype)
+        # Positions that were null in the original column are masked with
+        # ``where(..., None)`` after the cast.
+        input_df[column] = casted.where(pd.notnull(input_df[column]), None)
@@ -159,9 +159,7 @@ def get_schema_from_pandas_df(df: pd.DataFrame):
     for order_number, field in enumerate(df.columns, start=1):
         details = {
             "name": field,
-            "feature_type": FeatureType(
-                map_pandas_type_to_feature_type(str(df[field].dtype))
-            ),
+            "feature_type": map_pandas_type_to_feature_type(field, df[field]),
             "order_number": order_number,
         }
 
@@ -176,7 +174,7 @@ def get_schema_from_spark_df(df: DataFrame):
     for order_number, field in enumerate(df.schema.fields, start=1):
         details = {
             "name": field.name,
-            "feature_type": FeatureType(map_spark_type_to_feature_type(field.dataType)),
+            "feature_type": map_spark_type_to_feature_type(field.dataType),
             "order_number": order_number,
         }
         schema_details.append(details)
 
@@ -138,7 +138,7 @@ def get_columns_from_table(self, table_name: str):
             target_table_columns.append(
                 {
                     "name": field.name,
-                    "featureType": map_spark_type_to_feature_type(field.dataType),
+                    "featureType": map_spark_type_to_feature_type(field.dataType).value,
                 }
             )
         return target_table_columns
 
@@ -28,7 +28,7 @@
 )
 from ads.feature_store.common.spark_session_singleton import SparkSessionSingleton
 from ads.feature_store.common.utils.feature_schema_mapper import (
-    get_raw_data_source_schema,
+    convert_pandas_datatype_with_schema,
 )
 from ads.feature_store.common.utils.transformation_utils import TransformationUtils
 from ads.feature_store.data_validation.great_expectation import ExpectationService
@@ -177,16 +177,9 @@ def _save_offline_dataframe(
             database = feature_group.entity_id
             self.spark_engine.create_database(database)
 
-            if data_frame is None:
-                raw_schema = get_raw_data_source_schema(
-                    feature_group.input_feature_details
-                )
-            elif isinstance(data_frame, pd.DataFrame):
-                raw_schema = self.spark_engine.convert_from_pandas_to_spark_dataframe(
-                    data_frame
-                ).schema
-            else:
-                raw_schema = data_frame.schema
+            if isinstance(data_frame, pd.DataFrame):
+                if not feature_group.is_infer_schema:
+                    convert_pandas_datatype_with_schema(feature_group.input_feature_details, data_frame)
 
             # TODO: Get event timestamp column and apply filtering basis from and to timestamp
 
@@ -223,7 +216,7 @@ def _save_offline_dataframe(
                 target_table,
                 feature_group.primary_keys,
                 feature_group_job.ingestion_mode,
-                raw_schema,
+                featured_data.schema,
                 feature_group_job.feature_option_details,
             )
Original file line number	Diff line number	Diff line change
`@@ -138,7 +138,7 @@ def get_columns_from_table(self, table_name: str):`
`138`	`138`	`target_table_columns.append(`
`139`	`139`	`{`
`140`	`140`	`"name": field.name,`
`141`		`- "featureType": map_spark_type_to_feature_type(field.dataType),`
	`141`	`+ "featureType": map_spark_type_to_feature_type(field.dataType).value,`
`142`	`142`	`}`
`143`	`143`	`)`
`144`	`144`	`return target_table_columns`