66# -----------------------------------------------------------------------------
77
88import importlib
9+ import logging
910from pathlib import Path
1011
1112from QEfficient .finetune .utils .logging_utils import logger
@@ -26,51 +27,81 @@ def load_module_from_py_file(py_file: str) -> object:
2627
2728
def get_custom_dataset(dataset_config, tokenizer, split: str, context_length=None):
    """Build a dataset by calling a user-supplied preprocessing function.

    ``dataset_config.preproc_file`` must follow the format
    ``python_file_path:function_name``. The Python file is loaded with
    ``load_module_from_py_file`` and the named function is invoked as
    ``function(dataset_config, tokenizer, split, context_length)``.

    Args:
        dataset_config: Config object carrying a ``preproc_file`` attribute.
        tokenizer: Passed through unchanged to the preprocessing function.
        split: Passed through unchanged to the preprocessing function.
        context_length: Passed through unchanged to the preprocessing function.

    Returns:
        Whatever the preprocessing function returns.

    Raises:
        Errors are reported through ``logger.raise_error`` (presumed to raise
        the given exception type -- confirm against logging_utils):
        RuntimeError: ``preproc_file`` is missing or contains no ``:``.
        ValueError: the path part does not end with ``.py``.
        FileNotFoundError: the path part is not an existing file.
        AttributeError: the loaded module has no attribute ``function_name``.
    """
    if not hasattr(dataset_config, "preproc_file"):
        logger.raise_error("Can not find preproc_file key in dataset_config file.", RuntimeError)

    if ":" not in dataset_config.preproc_file:
        logger.raise_error(
            "The 'preproc_file' key in dataset_config file should follow the format: python_file_path:function_name",
            RuntimeError,
        )

    # Split on the LAST colon only: a path may itself contain ':' (for example
    # a Windows drive letter), and a plain split(":") would then fail to
    # unpack into exactly two parts.
    module_path, func_name = dataset_config.preproc_file.rsplit(":", 1)
    logger.log_rank_zero(
        f"Using '{func_name}' function from {module_path} as preprocessing function in dataset preprocessing.",
        logging.DEBUG,
    )

    if not module_path.endswith(".py"):
        logger.raise_error(f"Custom dataset preprocessing file {module_path} is not a .py file.", ValueError)

    module_path = Path(module_path)
    if not module_path.is_file():
        logger.raise_error(
            f"Custom dataset file {module_path.as_posix()} does not exist or is not a file.", FileNotFoundError
        )

    module = load_module_from_py_file(module_path.as_posix())

    # Only the attribute lookup is guarded. An AttributeError raised *inside*
    # the user's function must propagate as-is instead of being misreported
    # as "method not present".
    try:
        preproc_fn = getattr(module, func_name)
    except AttributeError:
        logger.raise_error(
            f"For custom dataset preprocessing, the method ({func_name}) is not "
            f"present in the file ({module_path.as_posix()}).",
            AttributeError,
        )

    return preproc_fn(dataset_config, tokenizer, split, context_length)
5163
5264
def get_data_collator(dataset_processer, dataset_config):
    """Return a custom data collator, or ``None`` to use the default one.

    ``dataset_config.collate_file`` is expected to follow the format
    ``python_file_path:function_name``. The Python file is loaded with
    ``load_module_from_py_file`` and the named function is invoked as
    ``function(dataset_processer)``.

    Args:
        dataset_processer: Passed through unchanged to the collate factory.
        dataset_config: Config object that may carry a ``collate_file``
            attribute.

    Returns:
        The value returned by the custom function, or ``None`` when
        ``collate_file`` is absent, has no ``:`` separator, or the named
        function does not exist in the loaded module. Each of these cases
        logs a warning.

    Raises:
        Errors are reported through ``logger.raise_error`` (presumed to raise
        the given exception type -- confirm against logging_utils):
        ValueError: the path part does not end with ``.py``.
        FileNotFoundError: the path part is not an existing file.
    """
    if not hasattr(dataset_config, "collate_file"):
        logger.log_rank_zero(
            "Can not find collate_file key in dataset_config file. Using the default data collator function instead.",
            logging.WARNING,
        )
        return None

    if ":" not in dataset_config.collate_file:
        logger.log_rank_zero(
            "Can not find function name in 'collate_file' key in dataset_config "
            "file. Using the default data collator function instead. If this is "
            "not intended then change the format of the 'collate_file' key in "
            "dataset_config file to follow the format: python_file_path:function_name",
            logging.WARNING,
        )
        return None

    # Split on the LAST colon only: a path may itself contain ':' (for example
    # a Windows drive letter), and a plain split(":") would then fail to
    # unpack into exactly two parts.
    module_path, func_name = dataset_config.collate_file.rsplit(":", 1)
    logger.log_rank_zero(
        f"Using '{func_name}' function from {module_path} as collate_fn in dataset preprocessing.",
        logging.DEBUG,
    )

    if not module_path.endswith(".py"):
        logger.raise_error(f"Custom dataset collate file {module_path} is not a .py file.", ValueError)

    module_path = Path(module_path)
    if not module_path.is_file():
        logger.raise_error(
            f"Custom dataset collate file {module_path.as_posix()} does not exist or is not a file.",
            FileNotFoundError,
        )

    module = load_module_from_py_file(module_path.as_posix())

    # Only the attribute lookup is guarded. An AttributeError raised *inside*
    # the user's collate factory is a real bug and must not be silently
    # converted into a fallback to the default collator.
    try:
        collate_factory = getattr(module, func_name)
    except AttributeError:
        # Logged at WARNING, like the other fallback paths in this function.
        logger.log_rank_zero(
            f"Can not find the function {func_name} in file "
            f"({module_path.as_posix()}). Using the default data collator "
            "function instead.",
            logging.WARNING,
        )
        return None

    return collate_factory(dataset_processer)
0 commit comments