Commit eb7ae91

marcenacp authored and The TensorFlow Datasets Authors committed
Stream from Hugging Face instead of downloading and preparing everything.
PiperOrigin-RevId: 657212303
1 parent 2123db7 commit eb7ae91
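
The change drops the Hugging Face download-and-prepare step and reads examples by streaming instead. A minimal sketch of the contrast, assuming a public dataset name that is purely illustrative and not part of this commit:

import datasets as hf_datasets

# Old pattern: materialize the full dataset in the local cache first.
builder = hf_datasets.load_dataset_builder('wikitext', 'wikitext-2-raw-v1')
builder.download_and_prepare()  # downloads everything up front
prepared = builder.as_dataset(split='train')  # random-access Dataset

# New pattern: stream examples lazily over the network; no upfront
# download happens and no Arrow files are written locally.
streamed = hf_datasets.load_dataset(
    'wikitext', 'wikitext-2-raw-v1', split='train', streaming=True
)
print(next(iter(streamed)))  # fetches only what the first example needs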

File tree

1 file changed: +36 −18 lines

tensorflow_datasets/core/dataset_builders/huggingface_dataset_builder.py

Lines changed: 36 additions & 18 deletions
@@ -108,9 +108,22 @@ class _ShardInfo:
   num_exceptions: int
 
 
+def _load_dataset(
+    hf_builder: hf_datasets.DatasetBuilder,
+    split: str,
+) -> hf_datasets.Dataset:
+  """Efficiently loads a HuggingFace iterable dataset from its builder."""
+  return hf_datasets.load_dataset(
+      hf_builder.repo_id,
+      hf_builder.config_id,
+      split=split,
+      streaming=True,
+  )
+
+
 def _write_shard(
     shard_spec: _ShardSpec,
-    hf_builder,
+    hf_builder: hf_datasets.DatasetBuilder,
     example_writer,
     features: feature_lib.FeaturesDict,
     ignore_hf_errors: bool,
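
The new _load_dataset helper re-resolves the dataset from the Hub rather than reading the builder's prepared cache. With streaming=True, hf_datasets.load_dataset returns an IterableDataset that yields examples sequentially over the network. A hedged usage sketch, assuming a builder whose repo_id and config_id resolve as in the diff (the dataset name is illustrative):

import datasets as hf_datasets

hf_builder = hf_datasets.load_dataset_builder('wikitext', 'wikitext-2-raw-v1')
stream = hf_datasets.load_dataset(
    hf_builder.repo_id,    # e.g. 'wikitext'
    hf_builder.config_id,  # e.g. 'wikitext-2-raw-v1'
    split='train',
    streaming=True,
)
print(type(stream).__name__)  # IterableDataset: sequential access only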
@@ -136,12 +149,19 @@ def _write_shard(
   def get_serialized_examples_iter():
     nonlocal num_bytes
     nonlocal num_exceptions
-    dataset = hf_builder.as_dataset(
-        split=shard_spec.shard_split, run_post_process=False
+    dataset = _load_dataset(
+        hf_builder,
+        shard_spec.hf_split,
     )
+    dataset = iter(dataset)
     for i in range(shard_spec.num_examples):
+      if i < shard_spec.start_index:
+        next(dataset)
+        continue
+      if i >= shard_spec.end_index:
+        break
       try:
-        hf_value = dataset[i]
+        hf_value = next(dataset)
       except Exception:  # pylint: disable=broad-exception-caught
         num_exceptions += 1
         if ignore_hf_errors:
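
Because an IterableDataset supports no random access, the rewritten loop replaces dataset[i] with next(dataset): it drains the iterator up to shard_spec.start_index and stops once shard_spec.end_index is reached. The same [start_index, end_index) windowing can be written with itertools.islice; a small self-contained illustration, not the commit's code:

import itertools

def window(iterable, start_index, end_index):
  """Yields the items whose index falls in [start_index, end_index)."""
  return itertools.islice(iterable, start_index, end_index)

# Equivalent to the diff's skip/continue and break logic over next(dataset):
assert list(window(iter(range(10)), 3, 6)) == [3, 4, 5]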
@@ -257,14 +277,6 @@ def _create_builder_config(
   ) -> Optional[dataset_builder.BuilderConfig]:
     return self._converted_builder_config
 
-  @functools.lru_cache(maxsize=1)
-  def _hf_download_and_prepare(self):
-    login_to_hf(self._hf_hub_token)
-    self._hf_builder.download_and_prepare(
-        num_proc=self._hf_num_proc,
-        verification_mode=self._verification_mode,
-    )
-
   @property
   def _hf_info(self) -> hf_datasets.DatasetInfo:
     """Retrieves the dataset info from the HuggingFace Datasets."""
@@ -278,11 +290,18 @@ def _hf_hub_info(self) -> huggingface_hub.hf_api.DatasetInfo:
     )
 
   def _hf_features(self) -> hf_datasets.Features:
-    if not self._hf_info.features:
-      # We need to download and prepare the data to know its features.
-      self._hf_download_and_prepare()
-
-    return self._hf_info.features
+    # Return the features from the builder info.
+    if self._hf_info.features:
+      return self._hf_info.features
+    # Return the features from the first split.
+    for split in self._hf_info.splits:
+      ds = _load_dataset(
+          self._hf_builder,
+          split,
+      )
+      if hasattr(ds, 'info') and ds.info.features:
+        return ds.info.features
+    raise ValueError('No features found in the dataset.')
 
   def _info(self) -> dataset_info_lib.DatasetInfo:
     return dataset_info_lib.DatasetInfo(
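
When the builder info declares no feature schema, _hf_features now probes each split's streamed info object instead of preparing the data. A hedged sketch of the same probe outside the builder, with an illustrative dataset name:

import datasets as hf_datasets

ds = hf_datasets.load_dataset(
    'wikitext', 'wikitext-2-raw-v1', split='train', streaming=True
)
if hasattr(ds, 'info') and ds.info.features:
  print(ds.info.features)  # e.g. {'text': Value(dtype='string', id=None)}
else:
  print('schema not declared for this split; try the next one')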
@@ -309,7 +328,6 @@ def _generate_splits(
   ) -> Sequence[splits_lib.SplitInfo]:
     """Prepares the dataset by writing to shards directly."""
     del dl_manager, download_config  # Unused.
-    self._hf_download_and_prepare()
 
     shard_specs_by_split: dict[str, Sequence[_ShardSpec]] = {}
     for hf_split, hf_split_info in self._hf_info.splits.items():
