@@ -314,7 +314,6 @@ def _get_dtype_from_error(e):
314314 error_string = str (e )
315315
316316 if "mismatched dtypes" in error_string .lower ():
317-
318317 # For the mismatched dtypes error, dask either returns a error message containing the dtype argument
319318 # to specify, or the found and expected dtypes in a table format, depending on what stage
320319 # the type inferencing fails. The below logic supports building the dtype dictionary for both cases
@@ -732,8 +731,8 @@ def down_sample(df, target):
732731 """
733732 dfs = []
734733 target_value_counts = df [target ].value_counts ()
735- min_key = min (target_value_counts .iteritems (), key = lambda k : k [1 ])
736- for key , value in target_value_counts .iteritems ():
734+ min_key = min (target_value_counts .items (), key = lambda k : k [1 ])
735+ for key , value in target_value_counts .items ():
737736 if key != min_key [0 ]:
738737 dfs .append (
739738 df [df [target ] == key ].sample (frac = 1 - ((value - min_key [1 ]) / value ))
@@ -835,6 +834,7 @@ def _log_yscale_not_set():
835834 "`yscale` parameter is not set. Valid values are `'linear'`, `'log'`, `'symlog'`."
836835 )
837836
837+
838838def infer_target_type (target , target_series , discover_target_type = True ):
839839 # if type discovery is turned off, infer type from pandas dtype
840840 if discover_target_type :
@@ -845,13 +845,15 @@ def infer_target_type(target, target_series, discover_target_type=True):
845845 target_type = get_feature_type (target , target_series )
846846 return target_type
847847
848+
def get_target_type(target, sampled_df, **init_kwargs):
    """Determine the feature type of the ``target`` column of ``sampled_df``.

    Parameters
    ----------
    target:
        Name of the target column in ``sampled_df``.
    sampled_df:
        Sampled frame holding the target column. When an explicit type is
        supplied for the target, the column is cast and written back to
        this frame.
    **init_kwargs:
        Options read here: ``"types"`` (mapping of column name to type,
        treated as empty when absent) and ``"type_discovery"`` (defaults
        to ``True`` when absent).

    Returns
    -------
    Whatever ``infer_target_type`` returns for the target column.
    """
    type_overrides = init_kwargs.get("types", {})

    # No explicit type for the target: honour the caller's discovery setting.
    if target not in type_overrides:
        return infer_target_type(
            target, sampled_df[target], init_kwargs.get("type_discovery", True)
        )

    # An explicit type wins: cast the column and switch type discovery off.
    sampled_df[target] = sampled_df[target].astype(init_kwargs.get("types")[target])
    return infer_target_type(target, sampled_df[target], False)
854855
856+
855857def get_dataset (
856858 df : pd .DataFrame ,
857859 sampled_df : pd .DataFrame ,
@@ -860,12 +862,12 @@ def get_dataset(
860862 shape : Tuple [int , int ],
861863 positive_class = None ,
862864 ** init_kwargs ,
863- ):
865+ ):
864866 from ads .dataset .classification_dataset import (
865- BinaryClassificationDataset ,
866- BinaryTextClassificationDataset ,
867- MultiClassClassificationDataset ,
868- MultiClassTextClassificationDataset
867+ BinaryClassificationDataset ,
868+ BinaryTextClassificationDataset ,
869+ MultiClassClassificationDataset ,
870+ MultiClassTextClassificationDataset ,
869871 )
870872 from ads .dataset .forecasting_dataset import ForecastingDataset
871873 from ads .dataset .regression_dataset import RegressionDataset
@@ -874,9 +876,7 @@ def get_dataset(
874876 logger .warning (
875877 "It is not recommended to use an empty column as the target variable."
876878 )
877- raise ValueError (
878- f"We do not support using empty columns as the chosen target"
879- )
879+ raise ValueError (f"We do not support using empty columns as the chosen target" )
880880 if utils .is_same_class (target_type , ContinuousTypedFeature ):
881881 return RegressionDataset (
882882 df = df ,
@@ -899,9 +899,9 @@ def get_dataset(
899899 )
900900
901901 # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
902- elif utils .is_same_class (target_type , CategoricalTypedFeature ) or utils . is_same_class (
903- target_type , OrdinalTypedFeature
904- ):
902+ elif utils .is_same_class (
903+ target_type , CategoricalTypedFeature
904+ ) or utils . is_same_class ( target_type , OrdinalTypedFeature ) :
905905 if target_type .meta_data ["internal" ]["unique" ] == 2 :
906906 if is_text_data (sampled_df , target ):
907907 return BinaryTextClassificationDataset (
@@ -946,17 +946,13 @@ def get_dataset(
946946 or "text" in target_type ["type" ]
947947 or "text" in target
948948 ):
949- raise ValueError (
950- f"The column { target } cannot be used as the target column."
951- )
949+ raise ValueError (f"The column { target } cannot be used as the target column." )
952950 elif (
953951 utils .is_same_class (target_type , GISTypedFeature )
954952 or "coord" in target_type ["type" ]
955953 or "coord" in target
956954 ):
957- raise ValueError (
958- f"The column { target } cannot be used as the target column."
959- )
955+ raise ValueError (f"The column { target } cannot be used as the target column." )
960956 # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
961957 # binary target, but only data on one instance
962958 elif target_type ["low_level_type" ] == "bool" :
@@ -974,6 +970,7 @@ def get_dataset(
974970 f"For example, types = {{{ target } : 'category'}}"
975971 )
976972
973+
977974def open (
978975 source ,
979976 target = None ,
@@ -1074,9 +1071,7 @@ def open(
10741071 progress .update ("Opening data" )
10751072 path = ElaboratedPath (source , format = format , ** kwargs )
10761073 reader_fn = (
1077- get_format_reader (path = path , ** kwargs )
1078- if reader_fn is None
1079- else reader_fn
1074+ get_format_reader (path = path , ** kwargs ) if reader_fn is None else reader_fn
10801075 )
10811076 df = load_dataset (path = path , reader_fn = reader_fn , ** kwargs )
10821077 name = path .name
@@ -1108,6 +1103,7 @@ def open(
11081103 ),
11091104 )
11101105
1106+
11111107def build_dataset (
11121108 df : pd .DataFrame ,
11131109 shape : Tuple [int , int ],
@@ -1149,9 +1145,7 @@ def build_dataset(
11491145 discover_target_type = False
11501146
11511147 # if type discovery is turned off, infer type from pandas dtype
1152- target_type = infer_target_type (
1153- target , sampled_df [target ], discover_target_type
1154- )
1148+ target_type = infer_target_type (target , sampled_df [target ], discover_target_type )
11551149
11561150 result = get_dataset (
11571151 df = df ,
@@ -1168,6 +1162,7 @@ def build_dataset(
11681162 )
11691163 return result
11701164
1165+
11711166class CustomFormatReaders :
11721167 @staticmethod
11731168 def read_tsv (path : str , ** kwargs ) -> pd .DataFrame :
@@ -1352,7 +1347,6 @@ def read_xml(path: str, **kwargs) -> pd.DataFrame:
13521347 import xml .etree .cElementTree as et
13531348
13541349 def get_children (df , node , parent , i ):
1355-
13561350 for name in node .attrib .keys ():
13571351 df .at [i , parent + name ] = node .attrib [name ]
13581352 for child in list (node ):
@@ -1374,6 +1368,7 @@ def get_children(df, node, parent, i):
13741368 last_i = i
13751369 return ret_df
13761370
1371+
13771372reader_fns = {
13781373 "csv" : pd .read_csv ,
13791374 "tsv" : CustomFormatReaders .read_tsv ,
@@ -1399,13 +1394,15 @@ def get_children(df, node, parent, i):
13991394 "xml" : CustomFormatReaders .read_xml ,
14001395}
14011396
1397+
def validate_kwargs(func: Callable, kwargs):
    """Return the keyword arguments from ``kwargs`` that ``func`` can accept.

    Parameters
    ----------
    func:
        Callable whose signature is inspected.
    kwargs:
        Mapping of candidate keyword arguments.

    Returns
    -------
    dict
        ``kwargs`` itself (same object) when ``func`` declares a ``**``
        catch-all parameter; otherwise a new dict restricted to the names
        ``func`` accepts by keyword.
    """
    valid_params = inspect.signature(func).parameters

    # Detect a ``**`` parameter by its kind rather than by the name "kwargs":
    # ``**options`` accepts everything, while a regular parameter that merely
    # happens to be called ``kwargs`` does not.
    if any(
        param.kind is inspect.Parameter.VAR_KEYWORD
        for param in valid_params.values()
    ):
        return kwargs

    # Only these kinds can be supplied by keyword; positional-only and
    # ``*args`` names would raise TypeError if forwarded as keywords.
    keyword_kinds = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    return {
        key: value
        for key, value in kwargs.items()
        if key in valid_params and valid_params[key].kind in keyword_kinds
    }
14081404
1405+
14091406def get_format_reader (path : ElaboratedPath , ** kwargs ) -> Callable :
14101407 format_key = path .format
14111408 try :
@@ -1420,6 +1417,7 @@ def get_format_reader(path: ElaboratedPath, **kwargs) -> Callable:
14201417
14211418 return reader_fn
14221419
1420+
14231421def load_dataset (path : ElaboratedPath , reader_fn : Callable , ** kwargs ) -> pd .DataFrame :
14241422 dfs = []
14251423 for filename in path .paths :
0 commit comments