# SSH client configuration entry: `ssh juwels` connects to juwels.fz-juelich.de as user salaj1.
Host juwels
HostName juwels.fz-juelich.de
User salaj1
# IdentityFile must name the PRIVATE key; the matching .pub file is only the
# public half and cannot be used to authenticate.
IdentityFile ~/.ssh/id_rsa_juwels
To connect, simply run `ssh juwels`.
`$PROJECT` refers to the project directory: `/p/project/chhd34`.
Install conda:
# Download the Anaconda3 2018.12 installer, mark it executable, and launch it.
installer=Anaconda3-2018.12-Linux-x86_64.sh
wget "https://repo.continuum.io/archive/${installer}"
chmod +x "${installer}"
"./${installer}"
Activate it in `~/.bashrc`, `/p/project/chhd34/hhd342/.bashrc`, and `/p/project/chhd34/hhd342/.profile`.
After login, always move to the project directory! Add the following to `~/.bash_profile`:
# Shorthand for a long-format directory listing.
alias ll='ls -l'
# Activate the jutil environment for project chhd34.
# NOTE(review): presumably this is what defines $PROJECT and related variables — confirm.
jutil env activate -p chhd34
# Start the shell in the working directory under the project tree.
cd /p/project/chhd34/hhd342/
# NOTE(review): the commands below look like an interactive workflow typed after
# login, not lines that belong in ~/.bash_profile — confirm.
salloc --partition=gpus --nodes=1 --gres=gpu:4 --account=hhd34 --time=23:59:00 # request one node with 4 GPUs on the "gpus" partition for 23:59:00
srun --nodes=1 --cpu_bind=none --pty /bin/bash -i # open an interactive bash shell on the allocated node
# Load the tmux module (together with the GCCcore module listed before it), then start tmux.
module load GCCcore/.8.2.0 tmux/2.8
tmux
# Activate the conda environment named "venv".
conda activate venv
To reconnect to a specific node:
# -w names the node to run on; replace jwc09n024 with the node of your own job.
srun -w jwc09n024 --nodes=1 --cpu_bind=none --pty /bin/bash -i
Find the node name of your job in the output of `squeue -u your_user_name`.
A script to start many jobs (for parameter search for example):
#!/bin/bash -x
# Submit one juwels_single_run.sh job per threshold value, each with --eprop1=False.
for thr in 0.005 0.01 0.02 0.03 0.04 0.06 0.08 0.1; do
  # The comment tag carries the threshold without its leading zero (e.g. "thr.005").
  sbatch juwels_single_run.sh "--comment=bptt_NMODEL_thr${thr#0} --eprop1=False --thr=${thr}"
done
where `juwels_single_run.sh` contains the following script:
#!/bin/bash -x
#SBATCH --account=hhd34
#SBATCH --nodes=1
#SBATCH --gres=gpu:4
#SBATCH --time=23:59:00
#SBATCH --partition=gpus
#SBATCH --output=results/timit-out.%j
#SBATCH --error=results/timit-err.%j

# Run solve_timit_with_framewise_lsnn_eprop1.py once inside a Slurm job.
#
# Usage: sbatch juwels_single_run.sh "<space-separated options for the python script>"
#   $1 - a single string of options; it is split on whitespace and passed on as
#        separate arguments. May be omitted.
#
# Stdout of the run is shown in the job output and also copied to
# <date>_<runID>_out.txt in the submission directory.
#
# NOTE(review): the results/ directory used by --output/--error presumably has
# to exist before submitting — confirm against the sbatch documentation.

# Let a failing srun/python determine the pipeline's status instead of tee's.
set -o pipefail

module load CUDA cuDNN

# $RANDOM is at most 32767, so the modulo never actually reduces the value.
runID=$(( RANDOM % 1000000 ))
DATE_WITH_TIME=$(date "+%Y-%m-%d-%H:%M:%S")

# Split the option string into words without exposing it to glob expansion.
read -r -a OPTIONS <<< "${1:-}"

# Run in the foreground as the last command: with pipefail set, the script (and
# therefore the Slurm job) exits non-zero when the training run fails. The
# previous "pipeline & ; wait" form always exited 0.
srun python3 -u solve_timit_with_framewise_lsnn_eprop1.py "${OPTIONS[@]}" "--run_id=${runID}" \
  | tee "${DATE_WITH_TIME}_${runID}_out.txt"
# Copy the contents of the remote results directory on JUWELS into the current directory.
# NOTE(review): presumably run on the local machine, not as part of the job script — confirm.
rsync -avzP -e ssh salaj1@juwels.fz-juelich.de:/p/project/chhd34/hhd342/timit_processing/results/solve_timit_with_framewise_lsnn_eprop1/ .