11#!/usr/bin/env python
22# -*- coding: utf-8; -*-
33
4- # Copyright (c) 2020, 2022 Oracle and/or its affiliates.
4+ # Copyright (c) 2020, 2023 Oracle and/or its affiliates.
55# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
66
77from __future__ import absolute_import , print_function
1010import importlib
1111from collections import defaultdict
1212from numbers import Number
13- from typing import Union
13+ from typing import Tuple , Union
1414
1515import pandas as pd
1616from ads .common import utils , logger
2323from ads .dataset .dataset import ADSDataset
2424from ads .dataset .feature_engineering_transformer import FeatureEngineeringTransformer
2525from ads .dataset .feature_selection import FeatureImportance
26- from ads .dataset .helper import deprecate_default_value , deprecate_variable
26+ from ads .dataset .helper import (
27+ DatasetDefaults ,
28+ deprecate_default_value ,
29+ deprecate_variable ,
30+ generate_sample ,
31+ get_target_type ,
32+ is_text_data ,
33+ )
2734from ads .dataset .label_encoder import DataFrameLabelEncoder
2835from ads .dataset .pipeline import TransformerPipeline
2936from ads .dataset .progress import DummyProgressBar
3037from ads .dataset .recommendation import Recommendation
3138from ads .dataset .recommendation_transformer import RecommendationTransformer
3239from ads .dataset .target import TargetVariable
33- from ads .type_discovery .typed_feature import DateTimeTypedFeature
40+ from ads .type_discovery .typed_feature import (
41+ CategoricalTypedFeature ,
42+ ContinuousTypedFeature ,
43+ DocumentTypedFeature ,
44+ GISTypedFeature ,
45+ OrdinalTypedFeature ,
46+ TypedFeature ,
47+ DateTimeTypedFeature ,
48+ TypedFeature
49+ )
3450from sklearn .model_selection import train_test_split
3551from pandas .io .formats .printing import pprint_thing
3652from sklearn .preprocessing import FunctionTransformer
@@ -45,10 +61,10 @@ class ADSDatasetWithTarget(ADSDataset, metaclass=ABCMeta):
4561 def __init__ (
4662 self ,
4763 df ,
48- sampled_df ,
4964 target ,
50- target_type ,
51- shape ,
65+ sampled_df = None ,
66+ shape = None ,
67+ target_type = None ,
5268 sample_max_rows = - 1 ,
5369 type_discovery = True ,
5470 types = {},
@@ -61,6 +77,16 @@ def __init__(
6177 ** kwargs ,
6278 ):
6379 self .recommendation_transformer = None
80+ if shape is None :
81+ shape = df .shape
82+ if sampled_df is None :
83+ sampled_df = generate_sample (
84+ df ,
85+ shape [0 ],
86+ DatasetDefaults .sampling_confidence_level ,
87+ DatasetDefaults .sampling_confidence_interval ,
88+ ** kwargs ,
89+ )
6490
6591 if parent is None :
6692 cols = sampled_df .columns .tolist ()
@@ -135,6 +161,8 @@ def __init__(
135161 cols .insert (0 , cols .pop (cols .index (target )))
136162 self .sampled_df = self .sampled_df [[* cols ]]
137163
164+ if target_type is None :
165+ target_type = get_target_type (target , sampled_df , ** kwargs )
138166 self .target = TargetVariable (self , target , target_type )
139167
140168 # remove target from type discovery conversion
@@ -145,6 +173,141 @@ def __init__(
145173 ):
146174 step [1 ].kw_args ["dtypes" ].pop (self .target .name )
147175
@staticmethod
def from_dataframe(
    df: pd.DataFrame,
    target: str,
    sampled_df: pd.DataFrame = None,
    shape: Tuple[int, int] = None,
    target_type: TypedFeature = None,
    positive_class=None,
    **init_kwargs,
):
    """
    Build the dataset subclass that matches the type of the target column.

    Parameters
    ----------
    df: pd.DataFrame
        The full dataframe; must contain the ``target`` column.
    target: str
        Name of the target column.
    sampled_df: pd.DataFrame, optional
        A sample of ``df``. When None, one is produced with ``generate_sample``
        using the row count taken from ``shape`` (or ``df.shape``) and the
        ``DatasetDefaults`` sampling confidence level/interval.
    shape: Tuple[int, int], optional
        Shape of ``df``. Forwarded unchanged to the dataset constructor.
    target_type: TypedFeature, optional
        Typed feature describing the target. When None, it is resolved with
        ``get_target_type(target, sampled_df, **init_kwargs)``.
    positive_class: optional
        Forwarded only to the binary classification dataset constructors.
    init_kwargs:
        Extra keyword arguments forwarded to ``generate_sample``,
        ``get_target_type`` and the dataset constructor.

    Returns
    -------
    A ``RegressionDataset`` for a continuous target; a ``ForecastingDataset``
    for a datetime target or a dataframe with a datetime index; a binary or
    multi-class classification dataset (the text variant when
    ``is_text_data`` is true for the target) for a categorical/ordinal
    target; a ``BinaryClassificationDataset`` for a target whose
    ``low_level_type`` is ``"bool"``.

    Raises
    ------
    ValueError
        If the target column holds no non-null values, if the target is a
        document/text or GIS/coordinate column, or if no problem type can be
        identified.
    """
    # Imported here rather than at module level; presumably to avoid a
    # circular import with the dataset subclasses -- TODO confirm.
    from ads.dataset.classification_dataset import (
        BinaryClassificationDataset,
        BinaryTextClassificationDataset,
        MultiClassClassificationDataset,
        MultiClassTextClassificationDataset,
    )
    from ads.dataset.forecasting_dataset import ForecastingDataset
    from ads.dataset.regression_dataset import RegressionDataset

    if sampled_df is None:
        sampled_df = generate_sample(
            df,
            (shape or df.shape)[0],
            DatasetDefaults.sampling_confidence_level,
            DatasetDefaults.sampling_confidence_interval,
            **init_kwargs,
        )

    if target_type is None:
        target_type = get_target_type(target, sampled_df, **init_kwargs)

    if len(df[target].dropna()) == 0:
        logger.warning(
            "It is not recommended to use an empty column as the target variable."
        )
        raise ValueError(
            "We do not support using empty columns as the chosen target"
        )

    # Keyword arguments shared by every dataset constructor below.
    common_kwargs = dict(
        df=df,
        sampled_df=sampled_df,
        target=target,
        target_type=target_type,
        shape=shape,
        **init_kwargs,
    )

    if utils.is_same_class(target_type, ContinuousTypedFeature):
        return RegressionDataset(**common_kwargs)

    if utils.is_same_class(
        target_type, DateTimeTypedFeature
    ) or df.index.dtype.name.startswith("datetime"):
        return ForecastingDataset(**common_kwargs)

    # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
    if utils.is_same_class(
        target_type, CategoricalTypedFeature
    ) or utils.is_same_class(target_type, OrdinalTypedFeature):
        text_target = is_text_data(sampled_df, target)
        if target_type.meta_data["internal"]["unique"] == 2:
            binary_cls = (
                BinaryTextClassificationDataset
                if text_target
                else BinaryClassificationDataset
            )
            return binary_cls(positive_class=positive_class, **common_kwargs)
        multi_cls = (
            MultiClassTextClassificationDataset
            if text_target
            else MultiClassClassificationDataset
        )
        return multi_cls(**common_kwargs)

    # Fix: compare the resolved target_type (not the column-name string) with
    # DocumentTypedFeature, in line with every other type check in this method.
    if (
        utils.is_same_class(target_type, DocumentTypedFeature)
        or "text" in target_type["type"]
        or "text" in target
    ):
        raise ValueError(
            f"The column {target} cannot be used as the target column."
        )

    if (
        utils.is_same_class(target_type, GISTypedFeature)
        or "coord" in target_type["type"]
        or "coord" in target
    ):
        raise ValueError(
            f"The column {target} cannot be used as the target column."
        )

    # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
    # binary target, but only data on one instance
    if target_type and target_type["low_level_type"] == "bool":
        return BinaryClassificationDataset(
            positive_class=positive_class, **common_kwargs
        )

    raise ValueError(
        f"Unable to identify problem type. Specify the data type of {target} using 'types'. "
        f"For example, types = {{{target}: 'category'}}"
    )
310+
148311 def rename_columns (self , columns ):
149312 """
150313 Returns a dataset with columns renamed.
0 commit comments