2626"""
2727
2828import argparse
29- from collections .abc import Sequence
29+ import dataclasses
30+ import typing
3031
3132from etils import epath
33+ import simple_parsing
3234from tensorflow_datasets .core import file_adapters
3335from tensorflow_datasets .scripts .cli import convert_format_utils
3436
3537
36- def add_parser_arguments (parser : argparse .ArgumentParser ) -> None :
37- """Add arguments for `convert_format` subparser."""
38- parser .add_argument (
39- '--root_data_dir' ,
40- type = str ,
41- help = (
42- 'Root data dir that contains all datasets. All datasets and all their'
43- ' configs and versions that are in this folder will be converted.'
44- ),
45- required = False ,
46- )
47- parser .add_argument (
48- '--dataset_dir' ,
49- type = str ,
50- help = (
51- 'Path where the dataset to be converted is located. Converts all'
52- ' configs and versions in this folder.'
53- ),
54- required = False ,
55- )
56- parser .add_argument (
57- '--dataset_version_dir' ,
58- type = str ,
59- help = (
60- 'Path where the dataset to be converted is located. Should include'
61- ' config and version. Can also be a comma-separated list of paths. If'
62- ' multiple paths are specified, `--out_dir` should not be specified,'
63- ' since each dataset will be converted in the same directory as the'
64- ' input dataset.'
65- ),
66- required = False ,
67- )
68- parser .add_argument (
69- '--out_file_format' ,
70- type = str ,
71- choices = [file_format .value for file_format in file_adapters .FileFormat ],
72- help = 'File format to convert the dataset to.' ,
73- required = True ,
74- )
75- parser .add_argument (
76- '--out_dir' ,
77- type = str ,
78- help = (
79- 'Path where the converted dataset will be stored. Should include the'
80- ' config and version, e.g. `/data/dataset_name/config/1.2.3`. If not'
81- ' specified, the converted shards will be stored in the same'
82- ' directory as the input dataset.'
83- ),
84- default = '' ,
85- required = False ,
86- )
87- parser .add_argument (
88- '--overwrite' ,
89- action = 'store_true' ,
90- help = 'Whether to overwrite the output directory if it already exists.' ,
91- )
92- parser .add_argument (
93- '--use_beam' ,
94- action = 'store_true' ,
95- help = 'Use beam to convert the dataset.' ,
96- )
97- parser .add_argument (
98- '--num_workers' ,
99- type = int ,
100- default = 8 ,
101- help = (
102- 'Number of workers to use when not using Beam. If `--use_beam` is'
103- ' set, this flag is ignored. If `--num_workers=1`, the conversion'
104- ' will be done sequentially.'
105- ),
38+ @dataclasses .dataclass (frozen = True , kw_only = True )
39+ class Args :
40+ """CLI arguments for converting datasets from one file format to another.
41+
42+ Attributes:
43+ root_data_dir: Root data dir that contains all datasets. All datasets and
44+ all their configs and versions that are in this folder will be converted.
45+ dataset_dir: Path where the dataset to be converted is located. Converts all
46+ configs and versions in this folder.
47+ dataset_version_dir: Path where the dataset to be converted is located.
48+ Should include config and version. Can also be a comma-separated list of
49+ paths. If multiple paths are specified, `--out_dir` should not be
50+ specified, since each dataset will be converted in the same directory as
51+ the input dataset.
52+ out_file_format: File format to convert the dataset to.
53+ out_dir: Path where the converted dataset will be stored. Datasets will be
54+ stored with the same folder structure as the input folder. If `None`, the
55+ converted shards will be stored in the same folder as the input datasets.
56+ overwrite: Whether to overwrite the output directory if it already exists.
57+ use_beam: Use beam to convert the dataset.
58+ num_workers: Number of workers to use when not using Beam. If `--use_beam`
59+ is set, this flag is ignored. If `--num_workers=1`, the conversion will be
60+ done sequentially.
61+ only_log_errors: If set, errors during the conversion will be logged as
62+ errors and will not crash the conversion. If you are converting a large
63+ number of datasets, you might want to set this flag to true.
64+ """
65+
66+ root_data_dir : epath .Path | None = None
67+ dataset_dir : epath .Path | None = None
68+ dataset_version_dir : list [epath .Path ] = simple_parsing .field (
69+ default_factory = list ,
70+ type = lambda dataset_version_dirs_str : [
71+ epath .Path (path ) for path in dataset_version_dirs_str .split (',' )
72+ ],
73+ nargs = '?' ,
10674 )
107- parser .add_argument (
108- '--only_log_errors' ,
109- action = 'store_true' ,
110- default = False ,
111- help = (
112- 'If set, errors during the conversion will be logged as errors and'
113- ' will not crash the conversion. If you are converting a large number'
114- ' of datasets, you might want to set this flag to true.'
115- ),
75+ out_file_format : str = simple_parsing .choice (
76+ * (file_format .value for file_format in file_adapters .FileFormat ),
11677 )
78+ out_dir : epath .Path | None = None
79+ overwrite : bool = False
80+ use_beam : bool = False
81+ num_workers : int = 8
82+ only_log_errors : bool = False
83+
84+ def execute (self ) -> None :
85+ """Converts a dataset from one file format to another."""
86+ convert_format_utils .convert_dataset (
87+ out_dir = self .out_dir ,
88+ out_file_format = self .out_file_format ,
89+ dataset_dir = self .dataset_dir ,
90+ root_data_dir = self .root_data_dir ,
91+ dataset_version_dir = self .dataset_version_dir ,
92+ overwrite = self .overwrite ,
93+ use_beam = self .use_beam ,
94+ num_workers = self .num_workers ,
95+ fail_on_error = not self .only_log_errors ,
96+ )
11797
11898
11999def register_subparser (parsers : argparse ._SubParsersAction ) -> None :
@@ -122,27 +102,6 @@ def register_subparser(parsers: argparse._SubParsersAction) -> None:
122102 'convert_format' ,
123103 help = 'Converts a dataset from one file format to another format.' ,
124104 )
125- add_parser_arguments (parser )
126-
127- def _parse_dataset_version_dir (
128- dataset_version_dir : str | None ,
129- ) -> Sequence [epath .Path ] | None :
130- if not dataset_version_dir :
131- return None
132- return [epath .Path (path ) for path in dataset_version_dir .split (',' )]
133-
134- parser .set_defaults (
135- subparser_fn = lambda args : convert_format_utils .convert_dataset (
136- out_dir = epath .Path (args .out_dir ) if args .out_dir else None ,
137- out_file_format = args .out_file_format ,
138- dataset_dir = args .dataset_dir or None ,
139- root_data_dir = args .root_data_dir or None ,
140- dataset_version_dir = _parse_dataset_version_dir (
141- args .dataset_version_dir
142- ),
143- overwrite = args .overwrite ,
144- use_beam = args .use_beam ,
145- num_workers = args .num_workers ,
146- fail_on_error = not args .only_log_errors ,
147- )
148- )
105+ parser = typing .cast (simple_parsing .ArgumentParser , parser )
106+ parser .add_arguments (Args , dest = 'args' )
107+ parser .set_defaults (subparser_fn = lambda args : args .args .execute ())
0 commit comments