Last active
August 26, 2018 03:34
-
-
Save raybellwaves/f28777bf840cc40f4c76d88beca528c5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I was copying the example from https://www.youtube.com/watch?v=nH_AQo8WdKw, which creates a TB-scale dataset. | |
I tried to set off ~50 workers as in the example but the queues are more limited on pegasus (our HPC) compared to Cheyenne. | |
I was only getting ~5 workers to run, each with 25 GB, and they were probably struggling to handle the large dataset. | |
# Build notes: | |
# conda create --name djq_mem_issue python=3.6 | |
# conda activate djq_mem_issue | |
# conda install -c conda-forge psutil notebook ipywidgets bokeh pandas | |
# pip install git+https://github.com/dask/dask-jobqueue | |
# This is run in a notebook: | |
from dask_jobqueue import LSFCluster | |
from dask.distributed import Client | |
import dask.dataframe as dd | |
import dask.array as da | |
cluster = LSFCluster(cores=8, memory='25 GB', queue='general', walltime='00:15') | |
cluster | |
# Click on 'Manual Scaling' and choose 50 workers. | |
# On the HPC, type 'bjobs' to see how many are running. | |
client = Client(cluster) | |
client | |
df = dd.demo.make_timeseries(start='2000-01-01', | |
end='2010-12-31', | |
dtypes={'x': float, 'y': float, 'id': int}, | |
freq='10ms', | |
partition_freq='24h') | |
df | |
df.head() | |
df = df.persist() | |
# Errors occur after this | |
# In this case I only had two workers running | |
# The output in the terminal is: | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1662, in remove_worker | |
plugin.remove_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 73, in remove_worker | |
del self.running_jobs[job_id][name] | |
KeyError: '17496259' | |
distributed.scheduler - ERROR - '17496258' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496258' | |
distributed.scheduler - ERROR - '17496259' | |
Traceback (most recent call last): | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/distributed/scheduler.py", line 1267, in add_worker | |
plugin.add_worker(scheduler=self, worker=address) | |
File "/nethome/rxb826/local/bin/miniconda3/envs/djq_mem_issue/lib/python3.6/site-packages/dask_jobqueue/core.py", line 61, in add_worker | |
self.running_jobs[job_id] = self.pending_jobs.pop(job_id) | |
KeyError: '17496259' | |
| |
I then typed 'bkill 0' to kill all jobs. | |
The dask-worker.err and dask-worker.out can be found at https://drive.google.com/drive/folders/1tkS9_qqyGXQUk81fy0_pLijeI2guxJOf?usp=sharing |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment