Skip to content

Instantly share code, notes, and snippets.

@st-le
Last active July 5, 2022 01:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save st-le/6a1ea5614a764d304bef360daf84588a to your computer and use it in GitHub Desktop.
Save st-le/6a1ea5614a764d304bef360daf84588a to your computer and use it in GitHub Desktop.
num_workers > 0 issue
[07/01 16:54:04 d2.data.detection_utils]: TransformGens used in training: [ResizeShortestEdge(short_edge_length=(800,), max_size=1333, sample_style='choice')]
>>> cfg stats: ('trainset',) False
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/queues.py", line 239, in _feed
File "/usr/lib/python3.8/multiprocessing/reduction.py", line 51, in dumps
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 329, in reduce_storage
File "/usr/lib/python3.8/multiprocessing/reduction.py", line 198, in DupFd
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 48, in __init__
new_fd = os.dup(fd)
OSError: [Errno 24] Too many open files
Process Process-1:
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
File "/usr/lib/python3.8/multiprocessing/reduction.py", line 188, in recv_handle
with socket.fromfd(conn.fileno(), socket.AF_UNIX, socket.SOCK_STREAM) as s:
File "/usr/lib/python3.8/socket.py", line 543, in fromfd
nfd = dup(fd)
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "main_moco.py", line 496, in <module>
main()
File "main_moco.py", line 180, in main
main_worker(args.gpu, ngpus_per_node, args)
File "main_moco.py", line 217, in main_worker
mrcnn_tool_k = slmdptc_main(transform=transf, image_folder=args.data, arch=args.arch, args=args)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 1641, in main
mrcnn_tool.train(True, get_model_only=use_ssl, use_ssl=use_ssl)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 1004, in train
self.trainer = SolomSSLTrainer(self.cfg, mapper=mapper)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 209, in __init__
super().__init__(cfg)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/engine/defaults.py", line 319, in __init__
data_loader = self.build_train_loader(cfg)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 213, in build_train_loader
data_loader, cls.train_dataset = build_detection_train_loader(cfg, mapper=cls.mapper, get_dataset=True)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/data/build.py", line 313, in build_detection_train_loader
dataset_dicts = get_detection_dataset_dicts(
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/data/build.py", line 231, in get_detection_dataset_dicts
dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/data/build.py", line 231, in <listcomp>
dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in dataset_names]
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/data/catalog.py", line 62, in get
return f()
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/data/datasets/solom_ssl.py", line 19, in <lambda>
DatasetCatalog.register(name, lambda: load_solom_ssl_json(json_file, image_root, name))
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/data/datasets/solom_ssl.py", line 38, in load_solom_ssl_json
sample, _ = train_dataset.__getitem__(i)
File "/usr/local/lib/python3.8/dist-packages/torchvision/datasets/folder.py", line 232, in __getitem__
sample = self.loader(path)
File "/usr/local/lib/python3.8/dist-packages/torchvision/datasets/folder.py", line 269, in default_loader
return pil_loader(path)
File "/usr/local/lib/python3.8/dist-packages/torchvision/datasets/folder.py", line 251, in pil_loader
return img.convert('RGB')
File "/usr/local/lib/python3.8/dist-packages/PIL/Image.py", line 904, in convert
self.load()
File "/usr/local/lib/python3.8/dist-packages/PIL/ImageFile.py", line 266, in load
n, err_code = decoder.decode(b)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/signal_handling.py", line 66, in handler
_error_if_any_worker_fails()
RuntimeError: DataLoader worker (pid 2440898) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.
######################################################################
A longer log when num_worker=32, batch size=256, nprocs=2
######################################################################
[07/04 17:22:15 d2.data.detection_utils]: TransformGens used in training: [ResizeShortestEdge(short_edge_length=(800,), max_size=1333, sample_style='choice')]
Import torch OK
Initail import finished.
Process Process-1:31:
Process Process-1:1:
Process Process-1:32:
Process Process-1:19:
Process Process-1:29:
Process Process-1:22:
Process Process-1:16:
Process Process-1:20:
Process Process-1:7:
Process Process-1:15:
Process Process-1:6:
Process Process-1:23:
Process Process-1:26:
Process Process-1:17:
Process Process-1:9:
Process Process-1:24:
Process Process-1:28:
Process Process-1:30:
Process Process-1:13:
Process Process-1:5:
Process Process-1:2:
Process Process-1:14:
Process Process-1:21:
Process Process-1:11:
Process Process-1:12:
Process Process-1:3:
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
Process Process-1:27:
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
Process Process-1:25:
ConnectionRefusedError: [Errno 111] Connection refused
Process Process-1:18:
Process Process-1:8:
Process Process-1:10:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 58, in detach
return reduction.recv_handle(conn)
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/reduction.py", line 189, in recv_handle
return recvfds(s, 1)[0]
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/reduction.py", line 159, in recvfds
raise EOFError
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
EOFError
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
Process Process-1:4:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256) # reject large message
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 508, in Client
answer_challenge(c, authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 752, in answer_challenge
message = connection.recv_bytes(256) # reject large message
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/connection.py", line 414, in _recv_bytes
buf = self._recv(4)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 379, in _recv
chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionResetError: [Errno 104] Connection reset by peer
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
ConnectionRefusedError: [Errno 111] Connection refused
ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
self.run()
File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/worker.py", line 260, in _worker_loop
r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
File "/usr/lib/python3.8/multiprocessing/queues.py", line 116, in get
return _ForkingPickler.loads(res)
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/reductions.py", line 289, in rebuild_storage_fd
fd = df.detach()
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 57, in detach
with _resource_sharer.get_connection(self._id) as conn:
File "/usr/lib/python3.8/multiprocessing/resource_sharer.py", line 87, in get_connection
c = Client(address, authkey=process.current_process().authkey)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 502, in Client
c = SocketClient(address)
File "/usr/lib/python3.8/multiprocessing/connection.py", line 630, in SocketClient
s.connect(address)
ConnectionRefusedError: [Errno 111] Connection refused
Traceback (most recent call last):
File "main_moco.py", line 496, in <module>
main()
File "main_moco.py", line 177, in main
launch(main_worker, ngpus_per_node, machine_rank=args.rank, dist_url=args.dist_url, args=(args.gpu, ngpus_per_node, args))
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/engine/launch.py", line 48, in launch
mp.spawn(
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 230, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 188, in start_processes
while not context.join():
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 150, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/engine/launch.py", line 87, in _distributed_worker
File "/home/solomon/public/Shawn/2022/Jun/moco/main_moco.py", line 217, in main_worker
mrcnn_tool_k = slmdptc_main(transform=transf, image_folder=args.data, arch=args.arch, args=args)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 1635, in main
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 656, in __init__
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 584, in __init__
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 380, in __init__
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 487, in loadConfigFromFile
File "/usr/local/lib/python3.8/dist-packages/numpy/lib/npyio.py", line 1065, in loadtxt
File "/usr/local/lib/python3.8/dist-packages/numpy/lib/_datasource.py", line 194, in open
File "/usr/local/lib/python3.8/dist-packages/numpy/lib/_datasource.py", line 528, in open
OSError: [Errno 24] Too many open files: '/home/solomon/public/Shawn/2022/Feb/continual_learning/unknown_full_benchmark/blender150/Images/Annotation/class_name.txt'
/usr/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 104 leaked semaphore objects to clean up at shutdown
warnings.warn('resource_tracker: There appear to be %d '
################################################################
ANOTHER ERROR WITH A DIFFERENT TEXT FILE OPENING
################################################################
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/engine/launch.py", line 87, in _distributed_worker
File "/home/solomon/public/Shawn/2022/Jun/moco/main_moco.py", line 217, in main_worker
mrcnn_tool_k = slmdptc_main(transform=transf, image_folder=args.data, arch=args.arch, args=args)
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 1641, in main
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 993, in train
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/main.py", line 749, in initail_cfg
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/engine/../../detectron2/config/config.py", line 24, in merge_from_file
File "/usr/local/lib/python3.8/dist-packages/fvcore/common/config.py", line 59, in load_yaml_with_base
File "/usr/local/lib/python3.8/dist-packages/fvcore/common/config.py", line 40, in _open_cfg
File "/usr/local/lib/python3.8/dist-packages/iopath/common/file_io.py", line 929, in open
File "/usr/local/lib/python3.8/dist-packages/iopath/common/file_io.py", line 589, in _open
OSError: [Errno 24] Too many open files: '/home/solomon/public/Pawn/detectron2-0.6/configs/COCO-InstanceSegmentation/mask_rcnn_R_101_FPN_3x.yaml'
################################################################
Set serialize in DT2 = False
################################################################
-- Process 1 terminated with the following error:
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/multiprocessing/spawn.py", line 59, in _wrap
File "/home/solomon/public/Shawn/2022/Jun/moco/slmdptc/detectron2/detectron2/engine/launch.py", line 87, in _distributed_worker
File "/home/solomon/public/Shawn/2022/Jun/moco/main_moco.py", line 333, in main_worker
train(train_loader, model, criterion, optimizer, epoch, args)
File "/home/solomon/public/Shawn/2022/Jun/moco/main_moco.py", line 363, in train
for i, images in enumerate(train_loader):
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 359, in __iter__
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 305, in _get_iterator
File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 891, in __init__
File "/usr/lib/python3.8/multiprocessing/context.py", line 103, in Queue
File "/usr/lib/python3.8/multiprocessing/queues.py", line 41, in __init__
File "/usr/lib/python3.8/multiprocessing/connection.py", line 527, in Pipe
OSError: [Errno 24] Too many open files
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/usr/lib/python3.8/multiprocessing/spawn.py", line 116, in spawn_main
exitcode = _main(fd, parent_sentinel)
File "/usr/lib/python3.8/multiprocessing/spawn.py", line 126, in _main
self = reduction.pickle.load(from_parent)
_pickle.UnpicklingError: pickle data was truncated
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment