Hi. Thank you for your great project, but I have run into a problem that I cannot fix.
I ran helper/create_train.py and it produced a JSON file with the following format:
When I use this file with run_pre_training.py, I get the following error:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json. │
│ py:152 in _generate_tables │
│ │
│ 149 │ │ │ │ │ │ except pa.ArrowInvalid as e: │
│ 150 │ │ │ │ │ │ │ try: │
│ 151 │ │ │ │ │ │ │ │ with open(file, encoding="utf-8") as f │
│ ❱ 152 │ │ │ │ │ │ │ │ │ dataset = json.load(f) │
│ 153 │ │ │ │ │ │ │ except json.JSONDecodeError: │
│ 154 │ │ │ │ │ │ │ │ logger.error(f"Failed to read file '{f │
│ 155 │ │ │ │ │ │ │ │ raise e │
│ │
│ /opt/conda/lib/python3.10/json/__init__.py:293 in load │
│ │
│ 290 │ To use a custom ``JSONDecoder`` subclass, specify it with the ``cl │
│ 291 │ kwarg; otherwise ``JSONDecoder`` is used. │
│ 292 │ """ │
│ ❱ 293 │ return loads(fp.read(), │
│ 294 │ │ cls=cls, object_hook=object_hook, │
│ 295 │ │ parse_float=parse_float, parse_int=parse_int, │
│ 296 │ │ parse_constant=parse_constant, object_pairs_hook=object_pairs_ │
│ │
│ /opt/conda/lib/python3.10/json/__init__.py:346 in loads │
│ │
│ 343 │ if (cls is None and object_hook is None and │
│ 344 │ │ │ parse_int is None and parse_float is None and │
│ 345 │ │ │ parse_constant is None and object_pairs_hook is None and n │
│ ❱ 346 │ │ return _default_decoder.decode(s) │
│ 347 │ if cls is None: │
│ 348 │ │ cls = JSONDecoder │
│ 349 │ if object_hook is not None: │
│ │
│ /opt/conda/lib/python3.10/json/decoder.py:337 in decode │
│ │
│ 334 │ │ containing a JSON document). │
│ 335 │ │ │
│ 336 │ │ """ │
│ ❱ 337 │ │ obj, end = self.raw_decode(s, idx=_w(s, 0).end()) │
│ 338 │ │ end = _w(s, end).end() │
│ 339 │ │ if end != len(s): │
│ 340 │ │ │ raise JSONDecodeError("Extra data", s, end) │
│ │
│ /opt/conda/lib/python3.10/json/decoder.py:355 in raw_decode │
│ │
│ 352 │ │ try: │
│ 353 │ │ │ obj, end = self.scan_once(s, idx) │
│ 354 │ │ except StopIteration as err: │
│ ❱ 355 │ │ │ raise JSONDecodeError("Expecting value", s, err.value) fro │
│ 356 │ │ return obj, end │
│ 357 │
╰──────────────────────────────────────────────────────────────────────────────╯
JSONDecodeError: Expecting value: line 1 column 1 (char 0)
During handling of the above exception, another exception occurred:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1860 in │
│ _prepare_split_single │
│ │
│ 1857 │ │ │ ) │
│ 1858 │ │ │ try: │
│ 1859 │ │ │ │ _time = time.time() │
│ ❱ 1860 │ │ │ │ for _, table in generator: │
│ 1861 │ │ │ │ │ if max_shard_size is not None and writer._num_byt │
│ 1862 │ │ │ │ │ │ num_examples, num_bytes = writer.finalize() │
│ 1863 │ │ │ │ │ │ writer.close() │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json. │
│ py:155 in _generate_tables │
│ │
│ 152 │ │ │ │ │ │ │ │ │ dataset = json.load(f) │
│ 153 │ │ │ │ │ │ │ except json.JSONDecodeError: │
│ 154 │ │ │ │ │ │ │ │ logger.error(f"Failed to read file '{f │
│ ❱ 155 │ │ │ │ │ │ │ │ raise e │
│ 156 │ │ │ │ │ │ │ # If possible, parse the file as a list of │
│ 157 │ │ │ │ │ │ │ if isinstance(dataset, list): # list is t │
│ 158 │ │ │ │ │ │ │ │ try: │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/packaged_modules/json/json. │
│ py:131 in _generate_tables │
│ │
│ 128 │ │ │ │ │ │ try: │
│ 129 │ │ │ │ │ │ │ while True: │
│ 130 │ │ │ │ │ │ │ │ try: │
│ ❱ 131 │ │ │ │ │ │ │ │ │ pa_table = paj.read_json( │
│ 132 │ │ │ │ │ │ │ │ │ │ io.BytesIO(batch), read_option │
│ 133 │ │ │ │ │ │ │ │ │ ) │
│ 134 │ │ │ │ │ │ │ │ │ break │
│ │
│ /kaggle/working/zalo_ltr_2021/pyarrow/_json.pyx:259 in │
│ pyarrow._json.read_json │
│ │
│ [Errno 2] No such file or directory: │
│ '/kaggle/working/zalo_ltr_2021/pyarrow/_json.pyx' │
│ │
│ /kaggle/working/zalo_ltr_2021/pyarrow/error.pxi:144 in │
│ pyarrow.lib.pyarrow_internal_check_status │
│ │
│ [Errno 2] No such file or directory: │
│ '/kaggle/working/zalo_ltr_2021/pyarrow/error.pxi' │
│ │
│ /kaggle/working/zalo_ltr_2021/pyarrow/error.pxi:100 in │
│ pyarrow.lib.check_status │
│ │
│ [Errno 2] No such file or directory: │
│ '/kaggle/working/zalo_ltr_2021/pyarrow/error.pxi' │
╰──────────────────────────────────────────────────────────────────────────────╯
ArrowInvalid: JSON parse error: Invalid value. in row 0
The above exception was the direct cause of the following exception:
╭───────────────────── Traceback (most recent call last) ──────────────────────╮
│ /kaggle/working/zalo_ltr_2021/Condenser/run_pre_training.py:202 in <module> │
│ │
│ 199 │
│ 200 │
│ 201 if __name__ == "__main__": │
│ ❱ 202 │ main() │
│ 203 │
│ │
│ /kaggle/working/zalo_ltr_2021/Condenser/run_pre_training.py:95 in main │
│ │
│ 92 │ # Set seed before initializing model. │
│ 93 │ set_seed(training_args.seed) │
│ 94 │ │
│ ❱ 95 │ train_set = load_dataset( │
│ 96 │ │ 'json', │
│ 97 │ │ data_files=data_args.train_path, │
│ 98 │ │ block_size=2**25, │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/load.py:1782 in │
│ load_dataset │
│ │
│ 1779 │ try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES │
│ 1780 │ │
│ 1781 │ # Download and prepare data │
│ ❱ 1782 │ builder_instance.download_and_prepare( │
│ 1783 │ │ download_config=download_config, │
│ 1784 │ │ download_mode=download_mode, │
│ 1785 │ │ verification_mode=verification_mode, │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:872 in │
│ download_and_prepare │
│ │
│ 869 │ │ │ │ │ │ │ prepare_split_kwargs["max_shard_size"] = │
│ 870 │ │ │ │ │ │ if num_proc is not None: │
│ 871 │ │ │ │ │ │ │ prepare_split_kwargs["num_proc"] = num_pr │
│ ❱ 872 │ │ │ │ │ │ self._download_and_prepare( │
│ 873 │ │ │ │ │ │ │ dl_manager=dl_manager, │
│ 874 │ │ │ │ │ │ │ verification_mode=verification_mode, │
│ 875 │ │ │ │ │ │ │ **prepare_split_kwargs, │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:967 in │
│ _download_and_prepare │
│ │
│ 964 │ │ │ │
│ 965 │ │ │ try: │
│ 966 │ │ │ │ # Prepare split will record examples associated to th │
│ ❱ 967 │ │ │ │ self._prepare_split(split_generator, **prepare_split_ │
│ 968 │ │ │ except OSError as e: │
│ 969 │ │ │ │ raise OSError( │
│ 970 │ │ │ │ │ "Cannot find data file. " │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1749 in │
│ _prepare_split │
│ │
│ 1746 │ │ │ gen_kwargs = split_generator.gen_kwargs │
│ 1747 │ │ │ job_id = 0 │
│ 1748 │ │ │ with pbar: │
│ ❱ 1749 │ │ │ │ for job_id, done, content in self._prepare_split_sing │
│ 1750 │ │ │ │ │ gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_ │
│ 1751 │ │ │ │ ): │
│ 1752 │ │ │ │ │ if done: │
│ │
│ /opt/conda/lib/python3.10/site-packages/datasets/builder.py:1892 in │
│ _prepare_split_single │
│ │
│ 1889 │ │ │ # Ignore the writer's error for no examples written to th │
│ 1890 │ │ │ if isinstance(e, SchemaInferenceError) and e.__context__ │
│ 1891 │ │ │ │ e = e.__context__ │
│ ❱ 1892 │ │ │ raise DatasetGenerationError("An error occurred while gen │
│ 1893 │ │ │
│ 1894 │ │ yield job_id, True, (total_num_examples, total_num_bytes, wri │
│ 1895 │
╰──────────────────────────────────────────────────────────────────────────────╯
DatasetGenerationError: An error occurred while generating the dataset
My guess is that the problem is an incompatible combination of datasets and transformers versions, but I have tried many versions of datasets and still hit the same error. Can you help me fix this? Thank you so much!
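In case it helps narrow things down, here is a minimal sketch of the check I can run on the generated file (the path below is just a placeholder for whatever is passed as the train path) to see whether it is valid JSON Lines, which is what `load_dataset('json', ...)` tries to parse line by line before falling back to `json.load`:

```python
import json

# Placeholder path; substitute the actual file produced by helper/create_train.py
train_path = "train_condenser.json"

with open(train_path, encoding="utf-8") as f:
    for i, line in enumerate(f, start=1):
        line = line.strip()
        if not line:
            continue  # skip blank lines
        try:
            json.loads(line)  # each non-empty line should be a standalone JSON object
        except json.JSONDecodeError as err:
            print(f"Line {i} is not valid JSON: {err}")
            break
    else:
        print("File parses as JSON Lines.")
```

If the very first line already fails here, that would suggest the issue is the file itself (for example a BOM, an empty first line, or some other content that is not JSON) rather than the library versions.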