-
-
Save sshleifer/75145ba828fcb4e998d5e34c46ce13fc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/arrow/cpp/src/plasma/io.cc:177: Connection to IPC socket failed for pathname /tmp/plasma, retrying 200 more times | |
/arrow/cpp/src/plasma/io.cc:177: Connection to IPC socket failed for pathname /tmp/plasma, retrying 199 more times | |
/arrow/cpp/src/plasma/io.cc:177: Connection to IPC socket failed for pathname /tmp/plasma, retrying 198 more times | |
/arrow/cpp/src/plasma/store.cc:1274: Allowing the Plasma store to use up to 214.748GB of memory. | |
/arrow/cpp/src/plasma/store.cc:1297: Starting object store with directory /dev/shm and huge page support disabled | |
2021-02-27 13:09:45 | INFO | fairseq_cli.train | Started plasma server pid 2854566 | |
2021-02-27 13:09:45 | INFO | fairseq_cli.train | task: LanguageModelingTask | |
2021-02-27 13:09:45 | INFO | fairseq_cli.train | model: TransformerLanguageModel | |
2021-02-27 13:09:45 | INFO | fairseq_cli.train | criterion: CrossEntropyCriterion | |
2021-02-27 13:09:45 | INFO | fairseq_cli.train | num. model params: 6,484,352 (num. trained: 6,484,352) | |
2021-02-27 13:09:48 | INFO | fairseq.utils | ***********************CUDA enviroments for all 1 workers*********************** | |
2021-02-27 13:09:48 | INFO | fairseq.utils | rank 0: capabilities = 7.0 ; total memory = 31.749 GB ; name = Tesla V100-SXM2-32GB | |
2021-02-27 13:09:48 | INFO | fairseq.utils | ***********************CUDA enviroments for all 1 workers*********************** | |
2021-02-27 13:09:48 | INFO | fairseq_cli.train | training on 1 devices (GPUs/TPUs) | |
2021-02-27 13:09:48 | INFO | fairseq_cli.train | max tokens per GPU = None and batch size per GPU = 1 | |
2021-02-27 13:09:48 | INFO | fairseq.trainer | Preparing to load checkpoint x.pt | |
2021-02-27 13:09:48 | INFO | fairseq.trainer | No existing checkpoint found x.pt | |
2021-02-27 13:09:48 | INFO | fairseq.trainer | loading train data for epoch 1 | |
2021-02-27 13:09:48 | INFO | fairseq.data.data_utils | loaded 100 examples from: /private/home/sshleifer/stories_small/train | |
P: pid: 2854513, id: ObjectID(3030303030303030303030303030303030303004), lock: False: PUT | |
P: pid: 2854513, id: ObjectID(3030303030303030303030303030303030303006), lock: False: PUT | |
P: pid: 2854513, id: ObjectID(3030303030303030303030303030303030303006), lock: False: GET | |
P: pid: 2854513, id: ObjectID(3030303030303030303030303030303030303006), lock: False: GOT | |
P: pid: 2854513, id: ObjectID(3030303030303030303030303030303030303004), lock: False: GET | |
P: pid: 2854513, id: ObjectID(3030303030303030303030303030303030303004), lock: False: GOT | |
2021-02-27 13:09:48 | INFO | fairseq.trainer | begin training epoch 1 | |
2021-02-27 13:09:48 | INFO | fairseq_cli.train | Start iterating over samples | |
/arrow/cpp/src/plasma/store.cc:586: Check failed: RemoveFromClientObjectIds(object_id, entry, client) == 1 | |
/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/pyarrow/libarrow.so.300(+0x720cd8)[0x7fc8f1ca3cd8] | |
/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/pyarrow/libarrow.so.300(_ZN5arrow4util8ArrowLogD1Ev+0xed)[0x7fc8f1ca414d] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x4158ec] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x418069] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x41965e] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x40c56a] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x422668] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x4213e5] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x41335d] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x40b409] | |
/lib/x86_64-linux-gnu/libc.so.6(__libc_start_main+0xf3)[0x7fc8f10100b3] | |
/private/home/sshleifer/.conda/envs/fdev/bin/plasma_store[0x40c350] | |
2021-02-27 13:09:48 | INFO | wandb.sdk.internal.internal | Internal process exited | |
Traceback (most recent call last): | |
File "/private/home/sshleifer/.conda/envs/fdev/bin/fairseq-train", line 33, in <module> | |
sys.exit(load_entry_point('fairseq', 'console_scripts', 'fairseq-train')()) | |
File "/private/home/sshleifer/fairseq-py/fairseq_cli/train.py", line 466, in cli_main | |
distributed_utils.call_main(cfg, main) | |
File "/private/home/sshleifer/fairseq-py/fairseq/distributed/utils.py", line 364, in call_main | |
main(cfg, **kwargs) | |
File "/private/home/sshleifer/fairseq-py/fairseq_cli/train.py", line 143, in main | |
valid_losses, should_stop = train(cfg, trainer, task, epoch_itr) | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/contextlib.py", line 74, in inner | |
return func(*args, **kwds) | |
File "/private/home/sshleifer/fairseq-py/fairseq_cli/train.py", line 251, in train | |
for i, samples in enumerate(progress): | |
File "/private/home/sshleifer/fairseq-py/fairseq/logging/progress_bar.py", line 256, in __iter__ | |
for i, obj in enumerate(self.iterable, start=self.n): | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/iterators.py", line 59, in __iter__ | |
for x in self.iterable: | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/iterators.py", line 528, in _chunk_iterator | |
for x in itr: | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/iterators.py", line 59, in __iter__ | |
for x in self.iterable: | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/iterators.py", line 650, in __next__ | |
raise item | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/iterators.py", line 581, in run | |
for item in self._source: | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 435, in __next__ | |
data = self._next_data() | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1085, in _next_data | |
return self._process_data(data) | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 1111, in _process_data | |
data.reraise() | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/_utils.py", line 428, in reraise | |
raise self.exc_type(msg) | |
OSError: Caught OSError in DataLoader worker process 0. | |
Original Traceback (most recent call last): | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop | |
data = fetcher.fetch(index) | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch | |
data = [self.dataset[idx] for idx in possibly_batched_index] | |
File "/private/home/sshleifer/.conda/envs/fdev/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp> | |
data = [self.dataset[idx] for idx in possibly_batched_index] | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/monolingual_dataset.py", line 104, in __getitem__ | |
source, future_target, past_target = self.dataset[index] | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/token_block_dataset.py", line 152, in __getitem__ | |
si = self.slice_indices | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/token_block_dataset.py", line 132, in slice_indices | |
return self._slice_indices.array | |
File "/private/home/sshleifer/fairseq-py/fairseq/data/plasma_utils.py", line 136, in array | |
ret = self.client.get(self.object_id) | |
File "pyarrow/_plasma.pyx", line 595, in pyarrow._plasma.PlasmaClient.get | |
File "pyarrow/_plasma.pyx", line 583, in pyarrow._plasma.PlasmaClient.get | |
File "pyarrow/_plasma.pyx", line 431, in pyarrow._plasma.PlasmaClient.get_buffers | |
File "pyarrow/_plasma.pyx", line 325, in pyarrow._plasma.PlasmaClient._get_object_buffers | |
File "pyarrow/_plasma.pyx", line 289, in pyarrow._plasma.plasma_check_status | |
File "pyarrow/error.pxi", line 99, in pyarrow.lib.check_status | |
OSError: Encountered unexpected EOF | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment