@@ -28,9 +28,12 @@ def gen_series_fixed_str(data_num, data_length, input_data, data_width):
2828 return results
2929
3030
def gen_arr_from_input(data_length, input_data, random=True, repeat=True, seed=None):
    """
    data_length: result length of array,
    input_data: sequence of values the result is built from,
    random: if True values are sampled from input_data with np.random.choice,
            otherwise input_data is handed to multiply_oneds_data,
    repeat: forwarded to np.random.choice as `replace` (only used when random is True),
    seed: seed to initialize random state (global numpy random state is seeded)
    """
    # seeding is done up front, before any branch is taken
    if seed is not None:
        np.random.seed(seed)

    if not random:
        # NOTE(review): presumably multiply_oneds_data expands input_data to
        # data_length elements - confirm against its definition
        expanded = multiply_oneds_data(input_data, data_length)
        return np.asarray(expanded)

    return np.random.choice(input_data, size=data_length, replace=repeat)
3639
@@ -50,7 +53,7 @@ def gen_arr_of_dtype(data_length, dtype='float', random=True, limits=None, nuniq
5053
5154 # prefer generation based on input data if it's provided
5255 if input_data is not None :
53- return gen_arr_from_input (input_data , data_length , random = random )
56+ return gen_arr_from_input (data_length , input_data , random = random )
5457
5558 if dtype == 'float' :
5659 return np .random .ranf (data_length )
@@ -67,6 +70,21 @@ def gen_arr_of_dtype(data_length, dtype='float', random=True, limits=None, nuniq
6770 return None
6871
6972
def gen_unique_values(data_length, dtype='int', seed=None):
    """
    data_length: result length of array of unique values,
    dtype: dtype of generated array, one of 'int', 'float' or 'str',
    seed: seed to initialize random state

    Raises ValueError if dtype is not one of the supported values.
    """

    if dtype in ('float', 'int'):
        values = np.arange(data_length, dtype=dtype)
    elif dtype == 'str':
        # NOTE(review): uniqueness of the result relies on gen_strlist returning
        # distinct strings - confirm against its definition
        values = gen_strlist(data_length)
    else:
        # without this branch `values` stayed unbound and the call below failed
        # with an UnboundLocalError that did not mention the offending dtype
        raise ValueError("Unsupported dtype for unique values generation: {!r}".format(dtype))

    # repeat=False samples without replacement, so no candidate value is picked twice
    return gen_arr_from_input(data_length, values, repeat=False, seed=seed)
86+
87+
7088def gen_series (data_length , dtype = 'float' , random = True , limits = None , nunique = 1000 , input_data = None , seed = None ):
7189 """
7290 data_length: result series length,
@@ -82,7 +100,7 @@ def gen_series(data_length, dtype='float', random=True, limits=None, nunique=100
82100
83101 # prefer generation based on input data if it's provided
84102 if input_data is not None :
85- series_data = gen_arr_from_input (input_data , data_length , random = random )
103+ series_data = gen_arr_from_input (data_length , input_data , random = random )
86104 else :
87105 series_data = gen_arr_of_dtype (data_length , dtype = dtype , limits = limits , nunique = nunique )
88106
@@ -98,13 +116,15 @@ def gen_df(data_length,
98116 limits = None ,
99117 nunique = 1000 ,
100118 input_data = None ,
119+ index_gen = None ,
101120 seed = None ):
102121 """
103122 data_length: result series length,
104123 dtype: dtype of generated series,
105124 limits: a tuple of (min, max) limits for numeric series,
106125 nunique: number of unique values in generated series,
107126 input_data: 2D sequence of values used for generation of dataframe columns,
127+ index_gen: callable that will generate index of needed size,
108128 seed: seed to initialize random state
109129 """
110130
@@ -116,10 +136,10 @@ def gen_df(data_length,
116136 for i in range (columns ):
117137 # prefer generation based on input data if it's provided
118138 if (input_data is not None and i < len (input_data )):
119- col_data = gen_arr_from_input (input_data [i ], data_length , random = random )
139+ col_data = gen_arr_from_input (data_length , input_data [i ], random = random )
120140 else :
121141 col_data = gen_arr_of_dtype (data_length , dtype = dtype , limits = limits , nunique = nunique )
122142 all_data .append (col_data )
123143
124- # TODO: support index generation
125- return pd .DataFrame (dict (zip (col_names , all_data )))
144+ index_data = index_gen ( data_length ) if index_gen is not None else None
145+ return pd .DataFrame (dict (zip (col_names , all_data )), index = index_data )
0 commit comments