@@ -334,6 +334,40 @@ def epsilon(dataset_dir: Path) -> bool:
334334 return True
335335
336336
def epsilon_30K(dataset_dir: Path) -> bool:
    """
    Epsilon dataset, restricted to its first 30000 training samples
    https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html

    Classification task. n_classes = 2.
    epsilon_30K x train dataset (30000, 2000)
    epsilon_30K y train dataset (30000,)

    The archive is downloaded into ``dataset_dir`` only when it is not
    already present there. The parsed features and labels are truncated
    to the first 30000 rows and written to ``dataset_dir`` as
    ``epsilon_30K_x_train.npy`` and ``epsilon_30K_y_train.npy``.

    Parameters
    ----------
    dataset_dir : Path
        Directory used both as download cache and as output location;
        created if it does not exist.

    Returns
    -------
    bool
        Always True once both arrays have been saved.
    """
    dataset_name = 'epsilon_30K'
    os.makedirs(dataset_dir, exist_ok=True)

    url_train = 'https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary' \
                '/epsilon_normalized.bz2'
    local_url_train = os.path.join(dataset_dir, os.path.basename(url_train))

    num_train, dtype = 30000, np.float32
    if not os.path.isfile(local_url_train):
        logging.info(f'Started loading {dataset_name}, train')
        retrieve(url_train, local_url_train)
    logging.info(f'{dataset_name} is loaded, started parsing...')
    X_train, y_train = load_svmlight_file(local_url_train, dtype=dtype)

    # Cut the sparse matrix down to the rows we keep *before* densifying:
    # converting the whole parsed matrix to a dense array first would
    # allocate memory for every row only to discard most of them.
    X_train = X_train[:num_train].toarray()
    y_train = y_train[:num_train]

    for data, name in zip((X_train, y_train), ('x_train', 'y_train')):
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
369+
370+
337371def fraud (dataset_dir : Path ) -> bool :
338372 """
339373 Credit Card Fraud Detection contest
@@ -688,6 +722,51 @@ def skin_segmentation(dataset_dir: Path) -> bool:
688722 return True
689723
690724
def cifar_binary(dataset_dir: Path) -> bool:
    """
    Cifar dataset from LIBSVM Datasets (
    https://www.cs.toronto.edu/~kriz/cifar.html#cifar)
    TaskType: Classification
    cifar_binary x train dataset (50000, 3072)
    cifar_binary y train dataset (50000, 1)
    cifar_binary x test dataset  (10000, 3072)
    cifar_binary y test dataset  (10000, 1)

    The train and test archives are downloaded into ``dataset_dir`` only
    when they are not already present there. Labels are binarized: every
    label greater than 0 becomes 1, the rest become 0. The four resulting
    arrays are written to ``dataset_dir`` as
    ``cifar_binary_{x,y}_{train,test}.npy``.

    Parameters
    ----------
    dataset_dir : Path
        Directory used both as download cache and as output location;
        created if it does not exist.

    Returns
    -------
    bool
        Always True once all arrays have been saved.
    """
    dataset_name = 'cifar_binary'
    os.makedirs(dataset_dir, exist_ok=True)

    url_base = ('https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets'
                '/multiclass/')
    split_urls = {
        'train': url_base + 'cifar10.bz2',
        'test': url_base + 'cifar10.t.bz2',
    }

    # Both splits go through the same download -> parse -> binarize steps,
    # so handle them in one loop instead of two copies of the same code.
    arrays = {}
    for split, url in split_urls.items():
        local_url = os.path.join(dataset_dir, os.path.basename(url))
        if not os.path.isfile(local_url):
            logging.info(f'Started loading {dataset_name}, {split}')
            retrieve(url, local_url)
        logging.info(f'{dataset_name} is loaded, started parsing...')
        features, labels = load_svmlight_file(local_url, dtype=np.float32)

        arrays[f'x_{split}'] = features.toarray()
        # Collapse the multiclass labels into a binary target.
        arrays[f'y_{split}'] = (labels > 0).astype(int)

    # Nothing is written until both splits have been parsed successfully.
    for name, data in arrays.items():
        filename = f'{dataset_name}_{name}.npy'
        np.save(os.path.join(dataset_dir, filename), data)
    logging.info(f'dataset {dataset_name} is ready.')
    return True
768+
769+
691770def susy (dataset_dir : Path ) -> bool :
692771 """
693772 SUSY dataset from UCI machine learning repository (
0 commit comments