# Gist by @Tikquuss, last active March 22, 2022
# First of all, excuse my English
# Second of all, https://docs.mila.quebec/
# I generally prefer to work from a specific folder, where I can keep my files
cd D:/AI/MILA
############################### 1 - Moving file or folder ######################################
# move a file/folder from cluster to local
## file (pascaltikeng/at/job.sh)
scp -P 2222 pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/at/job.sh D:/AI/MILA
### I am already in D:/AI/MILA
scp -P 2222 pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/at/job.sh .
## folder (pascaltikeng/path_to_myfolder)
scp -r -P 2222 pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/path_to_myfolder D:/AI/MILA
scp -r -P 2222 pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/path_to_myfolder .
# move a file/folder from local to cluster
## file (D:/AI/MILA/file.txt)
scp -P 2222 D:/AI/MILA/file.txt pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/at
### I am already in D:/AI/MILA
scp -P 2222 file.txt pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/at
# folder
scp -r -P 2222 path_to_myfolder pascal.tikeng@login-1.login.server.mila.quebec:pascaltikeng/at
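## Retyping the host and port gets tedious. The transfer pattern above can be
## wrapped in a small helper; the names `mila_pull`/`mila_push` and the host
## variable are illustrative, not an official tool:
```shell
# Hypothetical wrappers around the scp calls above: the host and port are
# set once, and "-r" makes them work for both files and folders.
MILA_HOST="pascal.tikeng@login-1.login.server.mila.quebec"

mila_pull () {
  # $1 = remote path on the cluster, $2 = local destination
  scp -r -P 2222 "${MILA_HOST}:$1" "$2"
}

mila_push () {
  # $1 = local path, $2 = remote destination on the cluster
  scp -r -P 2222 "$1" "${MILA_HOST}:$2"
}
# usage: mila_pull pascaltikeng/at/job.sh .
# usage: mila_push file.txt pascaltikeng/at
```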
############################### 2-login ###################################
## My Mila email is: pascal.tikeng@mila.quebec
## The password is: CAN'T SHARE ...... ::))
## command
ssh pascal.tikeng@login.server.mila.quebec -p 2222
ssh pascal.tikeng@login-1.login.server.mila.quebec -p 2222
### and enter your password
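## To avoid retyping the port and user every time, an entry like the following can
## go in your LOCAL ~/.ssh/config (the alias "mila" is just a suggested name);
## after that, `ssh mila` and `scp mila:...` work without -p/-P:
```
Host mila
    HostName login.server.mila.quebec
    Port 2222
    User pascal.tikeng
```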
########################### 3 - Virtual env ###########################
# 3.1 On your first connection, it is best to create your working directory
mkdir pascaltikeng
## The project is https://github.com/Tikquuss/controllable_text_attribute_transfer
## I will call it at
# 3.2 If this is your first time running this (specific) project on the Mila cluster :
## 3.2.1 create a custom directory for this project (where you can keep your dataset, your logs ...)
## I will name it "at"
cd pascaltikeng
mkdir at
cd at
## 3.2.2 create a virtual env (You can also create, from your first connection, a venv that \
## you will use for all your projects, but with that approach you can run into versioning \
## problems if two projects need two different versions of the same dependency.)
## I will name it "at" too
module load python/3.7
#### create your venv
virtualenv at
#### activate
source at/bin/activate
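## The two steps above can be folded into one guard, so re-running the setup does
## not recreate an existing venv (a sketch; the directory name "at" is an example):
```shell
# create the venv only if it does not exist yet, then activate it
setup_venv () {
  dir="${1:-at}"
  [ -d "$dir" ] || virtualenv "$dir"
  . "$dir/bin/activate"
}
# usage: setup_venv at
```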
### 3.2.3 Install your dependencies (for pytorch, go to https://pytorch.org/ and ...)
pip install --upgrade pip
#TORCH=1.9.1
#CUDA=cu111
#pip install torch==${TORCH}+${CUDA} torchvision==0.10.1+${CUDA} torchaudio==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html
pip install torch==1.8.0+cu111 torchvision==0.9.0+cu111 torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
#pip install tqdm pandas==1.1.5 ....
### 3.2.4 Clone your repo
git clone https://github.com/Tikquuss/controllable_text_attribute_transfer
cd controllable_text_attribute_transfer
### 3.2.5 Install other dependencies if you have them
pip install -r requirements.txt
# 3.3 If it is not your first time : go directly to your workspace
cd pascaltikeng/at/controllable_text_attribute_transfer
source ../at/bin/activate
############################ 4. Dataset ####################
# 4.1 online dataset
wget -c https://data.deepai.org/mnist.zip -P pascaltikeng/at
unzip -u pascaltikeng/at/mnist.zip -d pascaltikeng/at
# 4.2 from github
git clone https://github.com/Tikquuss/nli_dataset pascaltikeng/nli
# 4.3 from your computer
## use `scp` : see above (1.)
############################# 5. Before running your job ####################
tmux
"""
In fact, if you start your job and your internet connection stops, or you close the cmd,
or your machine goes to sleep or stops ... your job will stop.
With tmux your job keeps running even if one of these situations happens.
And to recover your tmux session, follow this link :
https://unix.stackexchange.com/questions/22781/how-to-recover-a-shell-after-a-disconnection
Note that you have to connect to the login node on which the temux session was created in order to recover it
"""
############################# 6. Run your job ####################
""" Partitions (see the documentation)
- unkillable is the most stable: you only get 1 job with such a partition.
Jobs requesting -p unkillable will not be preempted (=stopped to give the GPU to another job with higher priority).
You have this node for up to 24h. You cannot request more than 4 cpus 48GB of RAM and 1 GPU
- main is a high priority job: it may be preempted but it's very unlikely: you get 2 jobs with such a partition.
You have this node for up to 48h and the total cpus requested by your main jobs cannot exceed 12.
- long is a low priority job: it may be preempted so you should have regular checkpoints.
There are no limits on the resources you can request, but bear in mind that the more resources you request,
the more likely that they will be needed by a higher priority job and yours will be killed to release them.
Each job assigned with a priority can preempt jobs with a lower priority: unkillable > main > long
Flag | Max Resource Usage | Max Time | Note
------------------------------------------------------------------
–partition=unkillable | 1 GPU, 6 CPUs, mem=32G | 2 days |
–partition=main | 2 GPUs, 8 CPUs, mem=48G | 2 days |
–partition=long | no limit of resources | 7 days |
"""
# 6.1 option 1 : salloc (You choose what you want as parameters and you launch your job : see the documentation for the other parameters)
salloc --gres=gpu:4 -c 4 --mem=85G --time=96:00:00 --partition=long --job-name=at
salloc --gres=gpu:2 -c 4 --mem=32Gb --time=24:00:00 --partition=main --job-name=at
salloc --gres=gpu:1 -c 4 --mem=32Gb --time=01:00:00 --partition=unkillable --job-name=at
chmod +x job_train.sh
. job_train.sh
#### Launch the jobs as you want; when you finish, release the resources with `exit`
# 6.2 option 2 : srun
# srun ~~ salloc + job + exit
chmod +x job_train.sh
srun --gres=gpu:1 -c 4 --mem=32Gb --time=01:00:00 --partition=unkillable --job-name=at bash job_train.sh
# 6.3 option 3 : sbatch
## sbatch ~~ srun, but the parameters can be passed directly into the job script
####### beginning of your job script (job_train.sh)
#!/bin/bash
#SBATCH --job-name=at
#SBATCH --gres=gpu:2 # Number of GPUs (per node)
#SBATCH --mem=85G # memory (per node)
#SBATCH --time=0-12:00 # time (DD-HH:MM)
#SBATCH --partition=main
###... load environment
###module load anaconda/3
### load cuda ...
module load cuda/10.1
### ... activate your venv
source ../at/bin/activate
### ...
#### see for example
#### https://github.com/Tikquuss/controllable_text_attribute_transfer/blob/main/main.py
#### https://github.com/Tikquuss/controllable_text_attribute_transfer/blob/main/main.sh
#### https://github.com/Tikquuss/controllable_text_attribute_transfer/blob/main/job_train.sh
python train.py --name_param value_param --max_epoch 10 ...
### or
. train.sh value_param1 value_param2 ...
### or
### ...
### some job
####### end of your job script (job_train.sh)
chmod +x job_train.sh
sbatch job_train.sh
############################# 7. Utils ####################
## see your jobs
squeue -u $USER
## kill a job
scancel 776791
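## A hypothetical helper built on squeue's formatted output (%i = job id,
## %j = job name): cancel every one of your jobs whose name matches a pattern.
```shell
cancel_by_name () {
  # -h drops the header; -o "%i %j" prints "jobid jobname" one per line
  squeue -u "$USER" -h -o "%i %j" | while read -r id name; do
    case "$name" in
      *"$1"*) scancel "$id" ;;
    esac
  done
}
# usage: cancel_by_name at
```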
## ...
### You can combine the commands: for example, if I just want to pull my logs, which are .txt files
### sitting in any subfolder of dump_path
scp -P 2222 pascal.tikeng@login-1.login.server.mila.quebec:"pascaltikeng/at/dump_path/*/*.txt" D:/AI/MILA/at
## ...
############################# 8. Questions ? ####################
#