@@ -108,9 +108,22 @@ class _ShardInfo:
108108 num_exceptions : int
109109
110110
def _load_dataset(
    hf_builder: hf_datasets.DatasetBuilder,
    split: str,
) -> hf_datasets.IterableDataset:
  """Efficiently loads a HuggingFace dataset split in streaming mode.

  Streaming avoids downloading and preparing the whole dataset locally:
  examples are fetched lazily as the returned dataset is iterated.

  Args:
    hf_builder: HuggingFace dataset builder identifying the repo and config
      to load from.
    split: Name of the split to load (e.g. `'train'`).

  Returns:
    An iterable (streaming) HuggingFace dataset for the requested split.
    Note: with `streaming=True`, `load_dataset` returns an
    `IterableDataset`, not a map-style `Dataset`.
  """
  return hf_datasets.load_dataset(
      hf_builder.repo_id,
      hf_builder.config_id,
      split=split,
      streaming=True,
  )
122+
123+
111124def _write_shard (
112125 shard_spec : _ShardSpec ,
113- hf_builder ,
126+ hf_builder : hf_datasets . DatasetBuilder ,
114127 example_writer ,
115128 features : feature_lib .FeaturesDict ,
116129 ignore_hf_errors : bool ,
@@ -136,12 +149,19 @@ def _write_shard(
136149 def get_serialized_examples_iter ():
137150 nonlocal num_bytes
138151 nonlocal num_exceptions
139- dataset = hf_builder .as_dataset (
140- split = shard_spec .shard_split , run_post_process = False
152+ dataset = _load_dataset (
153+ hf_builder ,
154+ shard_spec .hf_split ,
141155 )
156+ dataset = iter (dataset )
142157 for i in range (shard_spec .num_examples ):
158+ if i < shard_spec .start_index :
159+ next (dataset )
160+ continue
161+ if i >= shard_spec .end_index :
162+ break
143163 try :
144- hf_value = dataset [ i ]
164+ hf_value = next ( dataset )
145165 except Exception : # pylint: disable=broad-exception-caught
146166 num_exceptions += 1
147167 if ignore_hf_errors :
@@ -257,14 +277,6 @@ def _create_builder_config(
257277 ) -> Optional [dataset_builder .BuilderConfig ]:
258278 return self ._converted_builder_config
259279
260- @functools .lru_cache (maxsize = 1 )
261- def _hf_download_and_prepare (self ):
262- login_to_hf (self ._hf_hub_token )
263- self ._hf_builder .download_and_prepare (
264- num_proc = self ._hf_num_proc ,
265- verification_mode = self ._verification_mode ,
266- )
267-
268280 @property
269281 def _hf_info (self ) -> hf_datasets .DatasetInfo :
270282 """Retrieves the dataset info from the HuggingFace Datasets."""
@@ -278,11 +290,18 @@ def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
278290 )
279291
def _hf_features(self) -> hf_datasets.Features:
  """Returns the features of the underlying HuggingFace dataset.

  Prefers the features declared in the builder's dataset info. When the
  info does not declare any, falls back to streaming each split in turn
  and returning the first set of features a split reports.

  Returns:
    The HuggingFace features of the dataset.

  Raises:
    ValueError: if no split reports any features.
  """
  # Fast path: the builder info already declares the features.
  declared_features = self._hf_info.features
  if declared_features:
    return declared_features
  # Fallback: stream splits until one reports its features.
  for split_name in self._hf_info.splits:
    split_dataset = _load_dataset(
        self._hf_builder,
        split_name,
    )
    if hasattr(split_dataset, 'info') and split_dataset.info.features:
      return split_dataset.info.features
  raise ValueError('No features found in the dataset.')
286305
287306 def _info (self ) -> dataset_info_lib .DatasetInfo :
288307 return dataset_info_lib .DatasetInfo (
@@ -309,7 +328,6 @@ def _generate_splits(
309328 ) -> Sequence [splits_lib .SplitInfo ]:
310329 """Prepares the dataset by writing to shards directly."""
311330 del dl_manager , download_config # Unused.
312- self ._hf_download_and_prepare ()
313331
314332 shard_specs_by_split : dict [str , Sequence [_ShardSpec ]] = {}
315333 for hf_split , hf_split_info in self ._hf_info .splits .items ():
0 commit comments