Skip to content

Instantly share code, notes, and snippets.

@s-kganz
Created May 10, 2024 18:48
Show Gist options
  • Save s-kganz/9dea2e03af23e7d5d12cd0a7dee4b40d to your computer and use it in GitHub Desktop.
TFDS UnicodeDecodeError stack trace
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
Cell In[10], line 1
----> 1 b = Builder()
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\logging\__init__.py:289, in builder_init.<locals>.decorator(function, dsbuilder, args, kwargs)
287 _thread_id_to_builder_init_count[metadata.thread_id] += 1
288 try:
--> 289 return function(*args, **kwargs)
290 except Exception:
291 metadata.mark_error()
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_builder.py:1370, in FileReaderBuilder.__init__(self, file_format, **kwargs)
1353 @tfds_logging.builder_init()
1354 def __init__(
1355 self,
(...)
1358 **kwargs: Any,
1359 ):
1360 """Initializes an instance of FileReaderBuilder.
1361
1362 Callers must pass arguments as keyword arguments.
(...)
1368 **kwargs: Arguments passed to `DatasetBuilder`.
1369 """
-> 1370 super().__init__(**kwargs)
1371 self.info.set_file_format(file_format)
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\logging\__init__.py:289, in builder_init.<locals>.decorator(function, dsbuilder, args, kwargs)
287 _thread_id_to_builder_init_count[metadata.thread_id] += 1
288 try:
--> 289 return function(*args, **kwargs)
290 except Exception:
291 metadata.mark_error()
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_builder.py:287, in DatasetBuilder.__init__(self, data_dir, config, version)
285 self.info.read_from_directory(self._data_dir)
286 else: # Use the code version (do not restore data)
--> 287 self.info.initialize_from_bucket()
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\logging\__init__.py:169, in _FunctionDecorator.__call__(self, function, instance, args, kwargs)
167 metadata = self._start_call()
168 try:
--> 169 return function(*args, **kwargs)
170 except Exception:
171 metadata.mark_error()
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_builder.py:482, in DatasetBuilder.info(self)
474 if not getattr(self, "_version", None):
475 # Message for developers creating new dataset. Will trigger if they are
476 # using .info in the constructor before calling super().__init__
477 raise AssertionError(
478 "Info should not been called before version has been defined. "
479 "Otherwise, the created .info may not match the info version from "
480 "the restored dataset."
481 )
--> 482 info = self._info()
483 if not isinstance(info, dataset_info.DatasetInfo):
484 raise TypeError(
485 "DatasetBuilder._info should returns `tfds.core.DatasetInfo`, not "
486 f" {type(info)}."
487 )
File project\test\test_dataset_builder.py:17, in Builder._info(self)
15 """Returns the dataset metadata."""
16 # TODO(test): Specifies the tfds.core.DatasetInfo object
---> 17 return self.dataset_info_from_configs(
18 features=tfds.features.FeaturesDict({
19 # These are the features of your dataset like images, labels ...
20 'image': tfds.features.Image(shape=(None, None, 3)),
21 'label': tfds.features.ClassLabel(names=['no', 'yes']),
22 }),
23 # If there's a common (input, target) tuple from the
24 # features, specify them here. They'll be used if
25 # `as_supervised=True` in `builder.as_dataset`.
26 supervised_keys=('image', 'label'), # Set to `None` to disable
27 homepage='https://dataset-homepage/',
28 )
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_builder.py:1124, in DatasetBuilder.dataset_info_from_configs(self, **kwargs)
1112 def dataset_info_from_configs(self, **kwargs):
1113 """Returns the DatasetInfo using given kwargs and config files.
1114
1115 Sub-class should call this and add information not present in config files
(...)
1122 **kwargs: kw args to pass to DatasetInfo directly.
1123 """
-> 1124 metadata = self.get_metadata()
1125 if metadata.description:
1126 kwargs["description"] = metadata.description
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_builder.py:245, in DatasetBuilder.get_metadata(cls)
238 @classmethod
239 def get_metadata(cls) -> dataset_metadata.DatasetMetadata:
240 """Returns metadata (README, CITATIONS, ...) specified in config files.
241
242 The config files are read from the same package where the DatasetBuilder has
243 been defined, so those metadata might be wrong for legacy builders.
244 """
--> 245 return dataset_metadata.load(cls._get_pkg_dir_path())
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_metadata.py:81, in load(pkg_path)
78 @functools.lru_cache(maxsize=256)
79 def load(pkg_path: epath.Path) -> DatasetMetadata:
80 """Returns dataset metadata loaded from files in pkg."""
---> 81 raw_metadata = _read_files(pkg_path)
82 tags = _get_tags(raw_metadata.get(TAGS_FILENAME, ""))
83 return DatasetMetadata(
84 description=raw_metadata.get(DESCRIPTIONS_FILENAME, None),
85 citation=raw_metadata.get(CITATIONS_FILENAME, None),
86 tags=tags,
87 )
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_metadata.py:104, in _read_files(path)
102 if inode.name in _METADATA_FILES:
103 name2path[inode.name] = path.joinpath(inode.name)
--> 104 return etree.parallel_map(lambda f: f.read_text(encoding="utf-8"), name2path)
File ~\miniconda3\envs\gee\lib\site-packages\etils\etree\tree_utils.py:93, in TreeAPI.parallel_map(self, map_fn, num_threads, progress_bar, is_leaf, *trees)
91 for f in itr: # Propagate exception to main thread.
92 if f.exception():
---> 93 raise f.exception()
95 return self.backend.map(lambda f: f.result(), futures)
File ~\miniconda3\envs\gee\lib\concurrent\futures\thread.py:58, in _WorkItem.run(self)
55 return
57 try:
---> 58 result = self.fn(*self.args, **self.kwargs)
59 except BaseException as exc:
60 self.future.set_exception(exc)
File ~\miniconda3\envs\gee\lib\site-packages\tensorflow_datasets\core\dataset_metadata.py:104, in _read_files.<locals>.<lambda>(f)
102 if inode.name in _METADATA_FILES:
103 name2path[inode.name] = path.joinpath(inode.name)
--> 104 return etree.parallel_map(lambda f: f.read_text(encoding="utf-8"), name2path)
File ~\miniconda3\envs\gee\lib\site-packages\etils\epath\abstract_path.py:157, in Path.read_text(self, encoding)
155 """Reads contents of self as a string."""
156 with self.open('r', encoding=encoding) as f:
--> 157 return f.read()
File ~\miniconda3\envs\gee\lib\codecs.py:322, in BufferedIncrementalDecoder.decode(self, input, final)
319 def decode(self, input, final=False):
320 # decode input (taking the buffer into account)
321 data = self.buffer + input
--> 322 (result, consumed) = self._buffer_decode(data, self.errors, final)
323 # keep undecoded input until the next call
324 self.buffer = data[consumed:]
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 4352: invalid continuation byte
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment