File tree Expand file tree Collapse file tree 3 files changed +23
-1
lines changed Expand file tree Collapse file tree 3 files changed +23
-1
lines changed Original file line number Diff line number Diff line change @@ -108,6 +108,9 @@ class DownloadConfig:
108108 used.
109109 ignore_duplicates: whether to ignore duplicated examples with the same key.
110110 If there are multiple examples with the same key, the first one is kept.
111+ nondeterministic_order: If True, it will not assure deterministic ordering
112+ when writing examples to disk in the case of beam datasets. This might
113+ result in quicker dataset preparation.
111114 """
112115
113116 extract_dir : epath .PathLike | None = None
@@ -126,6 +129,7 @@ class DownloadConfig:
126129 min_shard_size : int = shard_utils .DEFAULT_MIN_SHARD_SIZE
127130 max_shard_size : int = shard_utils .DEFAULT_MAX_SHARD_SIZE
128131 ignore_duplicates : bool = False
132+ nondeterministic_order : bool = False
129133
130134 def get_shard_config (self ) -> shard_utils .ShardConfig :
131135 return shard_utils .ShardConfig (
Original file line number Diff line number Diff line change 3232
3333
3434def register_subparser (parsers : argparse ._SubParsersAction ) -> None : # pylint: disable=protected-access
35- """Add subparser for `build` command."""
35+ """Add subparser for `build` command.
36+
37+ New flags should be added to `cli_utils` module.
38+
39+ Args:
40+ parsers: The subparsers object to add the parser to.
41+ """
3642 build_parser = parsers .add_parser (
3743 'build' , help = 'Commands for downloading and preparing datasets.'
3844 )
@@ -357,6 +363,7 @@ def _download_and_prepare(
357363 skip_if_published = args .skip_if_published ,
358364 overwrite = args .overwrite ,
359365 beam_pipeline_options = args .beam_pipeline_options ,
366+ nondeterministic_order = args .nondeterministic_order ,
360367 )
361368
362369
Original file line number Diff line number Diff line change @@ -261,6 +261,14 @@ def add_generation_argument_group(parser: argparse.ArgumentParser):
261261 default = 1 ,
262262 help = 'Number of parallel build processes.' ,
263263 )
264+ generation_group .add_argument (
265+ '--nondeterministic_order' ,
266+ action = 'store_true' ,
267+ help = (
268+ 'If True, it will not assure deterministic ordering when writing'
269+ ' examples to disk. This might result in quicker dataset preparation.'
270+ ),
271+ )
264272
265273
266274def add_publish_argument_group (parser : argparse .ArgumentParser ):
@@ -300,6 +308,7 @@ def download_and_prepare(
300308 skip_if_published : bool ,
301309 overwrite : bool ,
302310 beam_pipeline_options : str | None ,
311+ nondeterministic_order : bool = False ,
303312) -> None :
304313 """Generate a single builder."""
305314 dataset = builder .info .full_name
@@ -317,6 +326,8 @@ def download_and_prepare(
317326 download_config = download .DownloadConfig ()
318327 if overwrite and not download_config .download_mode .overwrite_dataset :
319328 download_config .download_mode = download .GenerateMode .REUSE_CACHE_IF_EXISTS
329+ if nondeterministic_order :
330+ download_config .nondeterministic_order = True
320331
321332 # Add Apache Beam options to download config
322333 try :
You can’t perform that action at this time.
0 commit comments