kinow/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Autosubmit

Install Autosubmit with `pip install autosubmit==3.*`. Follow their tutorial to set up the workflow engine (basically
`autosubmit configure` and `autosubmit install` to create the directories, files, and databases).
Create an experiment with `autosubmit expid --HPC local --description "autosubmit-fl..."`, and edit the files
`expdef_a000.conf` and `jobs_a000.conf` in the `~/autosubmit/a000/` directory. The important entries to add or
modify in `expdef_a000.conf` are:
MEMBERS = mnist svhn
NUMCHUNKS = 3
And in jobs_a000.conf are:
[init]
FILE=_.sh

[train]
FILE=_.sh
RUNNING=chunk
DEPENDENCIES=init

[eval]
FILE=_.sh
RUNNING=member
DEPENDENCIES=train
Then `autosubmit create a000` will validate and plot the workflow.
file:///home/kinow/autosubmit/a000/plot/a000_20220901_1623.pdf.png
Cylc

Install Cylc with `pip install cylc-flow==8.*`. Create the directory for the workflow with `mkdir -p ~/cylc-src/cylc-fl`.
Paste the contents of `flow.cylc` into `~/cylc-src/cylc-fl/flow.cylc`. `cd` into that directory and run `cylc install .`. That should create the first run for the `cylc-fl` workflow.
Now plot and run the workflow with `cylc graph cylc-fl` and `cylc play cylc-fl --no-detach`.
You can then preview the graph of the workflow, as seen below, or print the job logs (you can also navigate to that directory).
file:///tmp/tmpeewrb476.PNG
(venv) kinow@ranma:~/cylc-src/cylc-fl$ cylc cat-log --file=o cylc-fl//1/train_round1_mnist
Workflow : cylc-fl/run1
Job : 1/train_round1_mnist/01 (try 1)
User@Host: kinow@ranma

Training the model... round 1
2022-09-01T16:03:00+12:00 INFO - started
2022-09-01T16:03:01+12:00 INFO - succeeded

  
## autosubmit_a000.conf
[config]
# Experiment identifier
# No need to change
EXPID = a000
# No need to change.
# Autosubmit version identifier
AUTOSUBMIT_VERSION = 3.14.0b
# Default maximum number of jobs to be waiting in any platform
# Default = 3
MAXWAITINGJOBS = 3
# Default maximum number of jobs to be running at the same time at any platform
# Default = 6
TOTALJOBS = 6
# Time (seconds) between connections to the HPC queue scheduler to poll already submitted jobs status
# Default = 10
SAFETYSLEEPTIME = 10
# Number of retrials if a job fails. Can be overridden at job level
# Default = 0
RETRIALS = 0
# Default output type for CREATE, MONITOR, SET STATUS, RECOVERY. Available options: pdf, svg, png, ps, txt
# Default = pdf
OUTPUT = pdf
# Allow to send jobs earlier
# Default = False
PRESUBMISSION = FALSE

# Basic Configuration of wrapper
# Types available: Horizontal,vertical,vertical-mixed,horizontal-vertical
# JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM
# MIN_WRAPPED sets the minimum number of jobs that should be included in the wrapper. DEFAULT = 2
# MAX_WRAPPED sets the maximum number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS
# METHOD = ASThread # for vertical-horizontal or horizontal with threads-based parallelism  this must be srun
#[wrapper]
#TYPE = Vertical
#JOBS_IN_WRAPPER = SIM
#MIN_WRAPPED = 2
#MAX_WRAPPED = 9999
#METHOD = ASThread # for vertical-horizontal or horizontal with threads-based parallelism  this must be srun

[mail]
# Enable mail notifications
# Default = False
NOTIFICATIONS = False
# Mail address where notifications will be received
TO =

[communications]
# Communications library used to connect with platforms: paramiko or saga.
# Default = paramiko
API = paramiko

[storage]
# Defines the way of storing the progress of the experiment. The available options are:
# A PICKLE file (pkl) or an SQLite database (db). Default = pkl
TYPE = pkl
# Defines if the remote logs will be copied to the local platform. Default = True.
COPY_REMOTE_LOGS = True

[migrate]
# Changes experiment files owner.
TO_USER =

## expdef_a000.conf
[DEFAULT]
# Experiment identifier
# No need to change
EXPID = a000
# HPC name.
# No need to change
HPCARCH = local

[experiment]
# Supply the list of start dates. Available formats: YYYYMMDD YYYYMMDDhh YYYYMMDDhhmm
# You can also use an abbreviated syntax for multiple dates with common parts: 200001[01 15] <=> 20000101 20000115
# 200001[01-04] <=> 20000101 20000102 20000103 20000104
# DATELIST = 20000101 19600101 19650101 19700101
# DATELIST = 20000101 1960[0101 0201 0301]
# DATELIST = 20000101 19[60-65]
DATELIST = 2022
# Supply the list of members. Format fcX
# You can also use an abbreviated syntax for multiple members: fc[0 1 2] <=> fc0 fc1 fc2
# fc[0-2] <=> fc0 fc1 fc2
# MEMBERS = fc0 fc0 fc1 fc2 fc3 fc4
# MEMBERS = fc0 fc[0-4]
MEMBERS = mnist svhn
# Chunk size unit. STRING = hour, day, month, year
CHUNKSIZEUNIT = year
# Chunk size. NUMERIC = 4, 6, 12
CHUNKSIZE = 4
# Total number of chunks in experiment. NUMERIC = 30, 15, 10
NUMCHUNKS = 3
# Initial chunk of the experiment. Optional. DEFAULT = 1
CHUNKINI = 1
# Calendar used. LIST: standard, noleap
CALENDAR = standard
# List of members that can be included in this run. Optional.
# RUN_ONLY_MEMBERS = fc0 fc1 fc2 fc3 fc4
# RUN_ONLY_MEMBERS = fc[0-4]
RUN_ONLY_MEMBERS =

[project]
# Select project type. STRING = git, svn, local, none
# If PROJECT_TYPE is set to none, Autosubmit self-contained dummy templates will be used
PROJECT_TYPE = none
# Destination folder name for project. type = STRING, default = leave empty,
PROJECT_DESTINATION =

# If PROJECT_TYPE is not git, no need to change
[git]
# Repository URL  STRING = 'https://github.com/torvalds/linux.git'
PROJECT_ORIGIN =
# Select branch or tag, STRING, default = 'master', help = {'master' (default), 'develop', 'v3.1b', ...}
PROJECT_BRANCH =
# type = STRING, default = leave empty, help = if model branch is a TAG leave empty
PROJECT_COMMIT =
# type = STRING, default = leave empty and will load all submodules, help = loadThisSubmodule alsoloadthis anotherLoad ...
PROJECT_SUBMODULES =
# type = STRING, default = leave empty and will perform a full clone, help = true,false
FETCH_SINGLE_BRANCH = true
# type = STRING, default = leave empty and will perform a clone on local machine, help = Path to root of remote folder
#REMOTE_CLONE_ROOT = /gpfs/scratch/archive/bsc32
# If PROJECT_TYPE is not svn, no need to change
[svn]
# type = STRING, help = 'https://svn.ec-earth.org/ecearth3'
PROJECT_URL =
# Select revision number. NUMERIC = 1778
PROJECT_REVISION =

# If PROJECT_TYPE is not local, no need to change
[local]
# type = STRING, help = /foo/bar/ecearth
PROJECT_PATH =

# If PROJECT_TYPE is none, no need to change
[project_files]
# Where is PROJECT CONFIGURATION file location relative to project root path
FILE_PROJECT_CONF =
# Where is JOBS CONFIGURATION file location relative to project root path
FILE_JOBS_CONF =
# Default job scripts type in the project. type = STRING, default = bash, supported = 'bash', 'python' or 'r'
JOB_SCRIPTS_TYPE =

[rerun]
# Is a rerun or not? [Default: Do set FALSE]. BOOLEAN = TRUE, FALSE
RERUN = FALSE
# If RERUN = TRUE then supply the list of chunks to rerun
# LIST = [ 19601101 [ fc0 [1 2 3 4] fc1 [1] ] 19651101 [ fc0 [16-30] ] ]
CHUNKLIST =

## flow.cylc
# Cylc 8 workflow: one `init` task, then one `train` task per (round, dataset)
# pair, then one `evaluate` task per dataset.
[scheduler]
  # `init` has no [runtime] entry of its own, so it runs as an implicit task
  # using the [[root]] settings.
  allow implicit tasks = True
[task parameters]
  round = 1..3
  dataset = mnist, svhn
[scheduling]
  initial cycle point = 1
  cycling mode = integer
  [[graph]]
    R1 = """
      init => train<round,dataset> => evaluate<dataset>
    """
[runtime]
  [[root]]
    # Default no-op script for every task that does not override it.
    script = true
  # The namespace must carry the same parameters as the graph task
  # (train<round,dataset>); otherwise the graph tasks fall back to [[root]]
  # and this script never runs.
  [[train<round,dataset>]]
    script = """
      echo "Training the model... round ${CYLC_TASK_PARAM_round}"
    """
  # Named `evaluate` to match the task name used in the graph.
  [[evaluate<dataset>]]
    script = """
      echo "Evaluating the dataset ${CYLC_TASK_PARAM_dataset}"
    """

## jobs_a000.conf
# Job definitions for experiment a000. The DEPENDENCIES entries chain the
# sections as init -> train -> eval.

# First job in the chain; declares no DEPENDENCIES.
[init]
# NOTE(review): "_.sh" looks like a placeholder script name (PROJECT_TYPE is
# "none" in expdef_a000.conf) - confirm.
FILE=_.sh

[train]
FILE=_.sh
# NOTE(review): presumably one train job per chunk (NUMCHUNKS = 3 in
# expdef_a000.conf) - confirm against the Autosubmit documentation.
RUNNING=chunk
# Depends on the init job.
DEPENDENCIES=init

[eval]
FILE=_.sh
# NOTE(review): presumably one eval job per member (MEMBERS = mnist svhn in
# expdef_a000.conf) - confirm against the Autosubmit documentation.
RUNNING=member
# Depends on the train jobs.
DEPENDENCIES=train

## platforms_a000.conf
# Example platform with all options specified

[LOCAL]
TYPE=PS
HOST=localhost
USER=kinow
SCRATCH_DIR=/tmp
PROJECT=bla

## Platform name
# [PLATFORM]
## Queue type. Options: PBS, SGE, PS, LSF, ecaccess, SLURM. Required
# TYPE =
## Version of queue manager to use. Needed only in PBS (options: 10, 11, 12) and ecaccess (options: pbs, loadleveler)
# VERSION =
## Hostname of the HPC. Required
# HOST =
## Project for the machine scheduler. Required
# PROJECT =
## Budget account for the machine scheduler. If omitted, takes the value defined in PROJECT
# BUDGET =
## Option to add project name to host. This is required for some HPCs.
# ADD_PROJECT_TO_HOST = False
## User for the machine scheduler. Required
# USER =
## Optional. If given, Autosubmit will change owner of files in given platform when using migrate_exp.
# USER_TO =
## Path to the scratch directory for the machine. Required.
# SCRATCH_DIR = /scratch
## Path to the machine's temporary directory for migrate purposes.
# TEMP_DIR = /tmp
## If true, Autosubmit test command can use this queue as a main queue. Defaults to False
# TEST_SUITE = False
## If given, Autosubmit will add jobs to the given queue. Required for some platforms.
# QUEUE =
## Optional. If given, Autosubmit will submit the serial jobs with the exclusivity directive.
# EXCLUSIVITY =
## Optional. If specified, autosubmit will run jobs with only one processor in the specified platform.
# SERIAL_PLATFORM = SERIAL_PLATFORM_NAME
## Optional. If specified, autosubmit will run jobs with only one processor in the specified queue.
## Autosubmit will ignore this configuration if SERIAL_PLATFORM is provided
# SERIAL_QUEUE = SERIAL_QUEUE_NAME
## Optional. Default number of processors per node to be used in jobs
# PROCESSORS_PER_NODE =
## Optional. Integer. Scratch free space requirements for the platform in percentage (%).
## If not specified, it won't be defined on the template.
# SCRATCH_FREE_SPACE =
## Optional. Integer. Default Maximum number of jobs to be waiting in any platform queue
## Default = 3
# MAX_WAITING_JOBS =
## Optional. Integer. Default maximum number of jobs to be running at the same time at any platform
## Default = 6
# TOTAL_JOBS =
## Max wallclock per job submitted to the HPC queue in format HH:MM. If not specified, defaults to empty.
## Optional. Required for wrappers.
# MAX_WALLCLOCK = 72:00
## Max processors number per job submitted to the HPC. If not specified, defaults to empty.
## Optional. Required for wrappers.
# MAX_PROCESSORS = 1
## Optional. Custom directives for the resource manager of the platform used.
## Put as many as you wish in a json formatted array.
# CUSTOM_DIRECTIVE = ["#PBS -v myvar=value", "#PBS -v othervar=value"]
	[config]
	# Experiment identifier
	# No need to change
	EXPID = a000
	# No need to change.
	# Autosubmit version identifier
	AUTOSUBMIT_VERSION = 3.14.0b
	# Default maximum number of jobs to be waiting in any platform
	# Default = 3
	MAXWAITINGJOBS = 3
	# Default maximum number of jobs to be running at the same time at any platform
	# Default = 6
	TOTALJOBS = 6
	# Time (seconds) between connections to the HPC queue scheduler to poll already submitted jobs status
	# Default = 10
	SAFETYSLEEPTIME = 10
	# Number of retrials if a job fails. Can be overridden at job level
	# Default = 0
	RETRIALS = 0
	# Default output type for CREATE, MONITOR, SET STATUS, RECOVERY. Available options: pdf, svg, png, ps, txt
	# Default = pdf
	OUTPUT = pdf
	# Allow to send jobs earlier
	# Default = False
	PRESUBMISSION = FALSE

	# Basic Configuration of wrapper
	# Types available: Horizontal,vertical,vertical-mixed,horizontal-vertical
	# JOBS_IN_WRAPPER = Sections that should be wrapped together ex SIM
	# MIN_WRAPPED sets the minimum number of jobs that should be included in the wrapper. DEFAULT = 2
	# MAX_WRAPPED sets the maximum number of jobs that should be included in the wrapper. DEFAULT = TOTALJOBS
	# METHOD = ASThread # for vertical-horizontal or horizontal with threads-based parallelism this must be srun
	#[wrapper]
	#TYPE = Vertical
	#JOBS_IN_WRAPPER = SIM
	#MIN_WRAPPED = 2
	#MAX_WRAPPED = 9999
	#METHOD = ASThread # for vertical-horizontal or horizontal with threads-based parallelism this must be srun

	[mail]
	# Enable mail notifications
	# Default = False
	NOTIFICATIONS = False
	# Mail address where notifications will be received
	TO =

	[communications]
	# Communications library used to connect with platforms: paramiko or saga.
	# Default = paramiko
	API = paramiko

	[storage]
	# Defines the way of storing the progress of the experiment. The available options are:
	# A PICKLE file (pkl) or an SQLite database (db). Default = pkl
	TYPE = pkl
	# Defines if the remote logs will be copied to the local platform. Default = True.
	COPY_REMOTE_LOGS = True

	[migrate]
	# Changes experiment files owner.
	TO_USER =
	[DEFAULT]
	# Experiment identifier
	# No need to change
	EXPID = a000
	# HPC name.
	# No need to change
	HPCARCH = local

	[experiment]
	# Supply the list of start dates. Available formats: YYYYMMDD YYYYMMDDhh YYYYMMDDhhmm
	# You can also use an abbreviated syntax for multiple dates with common parts: 200001[01 15] <=> 20000101 20000115
	# 200001[01-04] <=> 20000101 20000102 20000103 20000104
	# DATELIST = 20000101 19600101 19650101 19700101
	# DATELIST = 20000101 1960[0101 0201 0301]
	# DATELIST = 20000101 19[60-65]
	DATELIST = 2022
	# Supply the list of members. Format fcX
	# You can also use an abbreviated syntax for multiple members: fc[0 1 2] <=> fc0 fc1 fc2
	# fc[0-2] <=> fc0 fc1 fc2
	# MEMBERS = fc0 fc0 fc1 fc2 fc3 fc4
	# MEMBERS = fc0 fc[0-4]
	MEMBERS = mnist svhn
	# Chunk size unit. STRING = hour, day, month, year
	CHUNKSIZEUNIT = year
	# Chunk size. NUMERIC = 4, 6, 12
	CHUNKSIZE = 4
	# Total number of chunks in experiment. NUMERIC = 30, 15, 10
	NUMCHUNKS = 3
	# Initial chunk of the experiment. Optional. DEFAULT = 1
	CHUNKINI = 1
	# Calendar used. LIST: standard, noleap
	CALENDAR = standard
	# List of members that can be included in this run. Optional.
	# RUN_ONLY_MEMBERS = fc0 fc1 fc2 fc3 fc4
	# RUN_ONLY_MEMBERS = fc[0-4]
	RUN_ONLY_MEMBERS =

	[project]
	# Select project type. STRING = git, svn, local, none
	# If PROJECT_TYPE is set to none, Autosubmit self-contained dummy templates will be used
	PROJECT_TYPE = none
	# Destination folder name for project. type = STRING, default = leave empty,
	PROJECT_DESTINATION =

	# If PROJECT_TYPE is not git, no need to change
	[git]
	# Repository URL STRING = 'https://github.com/torvalds/linux.git'
	PROJECT_ORIGIN =
	# Select branch or tag, STRING, default = 'master', help = {'master' (default), 'develop', 'v3.1b', ...}
	PROJECT_BRANCH =
	# type = STRING, default = leave empty, help = if model branch is a TAG leave empty
	PROJECT_COMMIT =
	# type = STRING, default = leave empty and will load all submodules, help = loadThisSubmodule alsoloadthis anotherLoad ...
	PROJECT_SUBMODULES =
	# type = STRING, default = leave empty and will perform a full clone, help = true,false
	FETCH_SINGLE_BRANCH = true
	# type = STRING, default = leave empty and will perform a clone on local machine, help = Path to root of remote folder
	#REMOTE_CLONE_ROOT = /gpfs/scratch/archive/bsc32
	# If PROJECT_TYPE is not svn, no need to change
	[svn]
	# type = STRING, help = 'https://svn.ec-earth.org/ecearth3'
	PROJECT_URL =
	# Select revision number. NUMERIC = 1778
	PROJECT_REVISION =

	# If PROJECT_TYPE is not local, no need to change
	[local]
	# type = STRING, help = /foo/bar/ecearth
	PROJECT_PATH =

	# If PROJECT_TYPE is none, no need to change
	[project_files]
	# Where is PROJECT CONFIGURATION file location relative to project root path
	FILE_PROJECT_CONF =
	# Where is JOBS CONFIGURATION file location relative to project root path
	FILE_JOBS_CONF =
	# Default job scripts type in the project. type = STRING, default = bash, supported = 'bash', 'python' or 'r'
	JOB_SCRIPTS_TYPE =

	[rerun]
	# Is a rerun or not? [Default: Do set FALSE]. BOOLEAN = TRUE, FALSE
	RERUN = FALSE
	# If RERUN = TRUE then supply the list of chunks to rerun
	# LIST = [ 19601101 [ fc0 [1 2 3 4] fc1 [1] ] 19651101 [ fc0 [16-30] ] ]
	CHUNKLIST =
	# Cylc 8 workflow: one `init` task, then one `train` task per (round, dataset)
	# pair, then one `evaluate` task per dataset.
	[scheduler]
	# `init` has no [runtime] entry of its own, so it runs as an implicit task
	# using the [[root]] settings.
	allow implicit tasks = True
	[task parameters]
	round = 1..3
	dataset = mnist, svhn
	[scheduling]
	initial cycle point = 1
	cycling mode = integer
	[[graph]]
	R1 = """
	init => train<round,dataset> => evaluate<dataset>
	"""
	[runtime]
	[[root]]
	# Default no-op script for every task that does not override it.
	script = true
	# The namespace must carry the same parameters as the graph task
	# (train<round,dataset>); otherwise the graph tasks fall back to [[root]]
	# and this script never runs.
	[[train<round,dataset>]]
	script = """
	echo "Training the model... round ${CYLC_TASK_PARAM_round}"
	"""
	# Named `evaluate` to match the task name used in the graph.
	[[evaluate<dataset>]]
	script = """
	echo "Evaluating the dataset ${CYLC_TASK_PARAM_dataset}"
	"""
	# Job definitions for experiment a000. The DEPENDENCIES entries chain the
	# sections as init -> train -> eval.

	# First job in the chain; declares no DEPENDENCIES.
	[init]
	# NOTE(review): "_.sh" looks like a placeholder script name - confirm.
	FILE=_.sh

	[train]
	FILE=_.sh
	# NOTE(review): presumably one train job per chunk - confirm against the
	# Autosubmit documentation.
	RUNNING=chunk
	# Depends on the init job.
	DEPENDENCIES=init

	[eval]
	FILE=_.sh
	# NOTE(review): presumably one eval job per member - confirm against the
	# Autosubmit documentation.
	RUNNING=member
	# Depends on the train jobs.
	DEPENDENCIES=train
	# Example platform with all options specified

	[LOCAL]
	TYPE=PS
	HOST=localhost
	USER=kinow
	SCRATCH_DIR=/tmp
	PROJECT=bla

	## Platform name
	# [PLATFORM]
	## Queue type. Options: PBS, SGE, PS, LSF, ecaccess, SLURM. Required
	# TYPE =
	## Version of queue manager to use. Needed only in PBS (options: 10, 11, 12) and ecaccess (options: pbs, loadleveler)
	# VERSION =
	## Hostname of the HPC. Required
	# HOST =
	## Project for the machine scheduler. Required
	# PROJECT =
	## Budget account for the machine scheduler. If omitted, takes the value defined in PROJECT
	# BUDGET =
	## Option to add project name to host. This is required for some HPCs.
	# ADD_PROJECT_TO_HOST = False
	## User for the machine scheduler. Required
	# USER =
	## Optional. If given, Autosubmit will change owner of files in given platform when using migrate_exp.
	# USER_TO =
	## Path to the scratch directory for the machine. Required.
	# SCRATCH_DIR = /scratch
	## Path to the machine's temporary directory for migrate purposes.
	# TEMP_DIR = /tmp
	## If true, Autosubmit test command can use this queue as a main queue. Defaults to False
	# TEST_SUITE = False
	## If given, Autosubmit will add jobs to the given queue. Required for some platforms.
	# QUEUE =
	## Optional. If given, Autosubmit will submit the serial jobs with the exclusivity directive.
	# EXCLUSIVITY =
	## Optional. If specified, autosubmit will run jobs with only one processor in the specified platform.
	# SERIAL_PLATFORM = SERIAL_PLATFORM_NAME
	## Optional. If specified, autosubmit will run jobs with only one processor in the specified queue.
	## Autosubmit will ignore this configuration if SERIAL_PLATFORM is provided
	# SERIAL_QUEUE = SERIAL_QUEUE_NAME
	## Optional. Default number of processors per node to be used in jobs
	# PROCESSORS_PER_NODE =
	## Optional. Integer. Scratch free space requirements for the platform in percentage (%).
	## If not specified, it won't be defined on the template.
	# SCRATCH_FREE_SPACE =
	## Optional. Integer. Default Maximum number of jobs to be waiting in any platform queue
	## Default = 3
	# MAX_WAITING_JOBS =
	## Optional. Integer. Default maximum number of jobs to be running at the same time at any platform
	## Default = 6
	# TOTAL_JOBS =
	## Max wallclock per job submitted to the HPC queue in format HH:MM. If not specified, defaults to empty.
	## Optional. Required for wrappers.
	# MAX_WALLCLOCK = 72:00
	## Max processors number per job submitted to the HPC. If not specified, defaults to empty.
	## Optional. Required for wrappers.
	# MAX_PROCESSORS = 1
	## Optional. Custom directives for the resource manager of the platform used.
	## Put as many as you wish in a json formatted array.
	# CUSTOM_DIRECTIVE = ["#PBS -v myvar=value", "#PBS -v othervar=value"]