@@ -295,3 +295,43 @@ def airline_regression(dataset_dir: Path) -> bool:
295295 np .save (os .path .join (dataset_dir , filename ), data )
296296 logging .info (f'dataset { dataset_name } is ready.' )
297297 return True
298+
299+
def higgs_10500K(dataset_dir: Path) -> bool:
    """
    Higgs dataset from UCI machine learning repository
    https://archive.ics.uci.edu/ml/datasets/HIGGS

    Classification task. n_classes = 2.
    higgs_10500K X train dataset (10500000, 28)
    higgs_10500K y train dataset (10500000, 1)
    higgs_10500K X test dataset  (500000,   28)
    higgs_10500K y test dataset  (500000,   1)

    The gzipped CSV is downloaded into ``dataset_dir`` unless a file with the
    same name is already present there. The first
    ``nrows_train + nrows_test`` rows are parsed as float32; column 0 is used
    as the target and the remaining columns as features. The rows are split
    in file order (no shuffling) and written as four ``.npy`` files named
    ``higgs_10500K_{x_train,x_test,y_train,y_test}.npy``.

    Args:
        dataset_dir: Directory that receives the downloaded archive and the
            generated ``.npy`` files. Created if it does not exist.

    Returns:
        True once all four files have been written. Errors raised by the
        download, the CSV parsing or the split are not caught here and
        propagate to the caller.
    """
    dataset_name = 'higgs_10500K'
    os.makedirs(dataset_dir, exist_ok=True)

    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00280/HIGGS.csv.gz'
    local_url = os.path.join(dataset_dir, os.path.basename(url))
    # NOTE(review): an existing file is trusted as-is, so a truncated archive
    # left behind by an interrupted download would be reused. Confirm whether
    # `retrieve` cleans up after a failure.
    if not os.path.isfile(local_url):
        logging.info(f'Started loading {dataset_name}')
        retrieve(url, local_url)
    logging.info(f'{dataset_name} is loaded, started parsing...')

    nrows_train, nrows_test, dtype = 10500000, 500000, np.float32
    data: Any = pd.read_csv(local_url, delimiter=",", header=None,
                            compression="gzip", dtype=dtype,
                            nrows=nrows_train + nrows_test)

    # Column 0 holds the class label; every other column is a feature.
    X = data[data.columns[1:]]
    y = data[data.columns[0:1]]
    # The full frame is not needed past this point; dropping the reference
    # lets it be reclaimed before the split allocates its own outputs
    # (presumably X and y are copies - harmless if they are views).
    del data

    # shuffle=False keeps file order: the leading rows form the train set and
    # the rows that follow form the test set.
    x_train, x_test, y_train, y_test = train_test_split(
        X, y, train_size=nrows_train, test_size=nrows_test, shuffle=False)

    splits = {
        'x_train': x_train,
        'x_test': x_test,
        'y_train': y_train,
        'y_test': y_test,
    }
    for name, split in splits.items():
        # No whitespace inside the name: the file must be exactly
        # '<dataset_name>_<split>.npy'.
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), split)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
0 commit comments