Last active
January 30, 2024 14:29
-
-
Save DaisukeMiyamoto/d1dac9483ff0971d5d9f34000311d312 to your computer and use it in GitHub Desktop.
set up Slurm Accounting feature (sacct) with slurmdbd/MySQL on AWS ParallelCluster
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash -xe | |
# Setting up Slurm Accounting feature with slurmdbd/MySQL for AWS ParallelCluster | |
# Daisuke Miyamoto midaisuk@amazon.co.jp | |
# | |
# Test Condition: | |
# - 20200312 | |
# - ParallelCluster 2.6.0 | |
# - CentOS7/Slurm | |
# | |
# ParallelCluster config example: | |
# base_os = centos7 | |
# scheduler = slurm | |
# post_install = https://midaisuk-pcluster-script.s3-ap-northeast-1.amazonaws.com/setup_slurm_accounting_parallelcluster.sh | |
# | |
# Usage: | |
# use post_install setting on ParallelCluster config | |
# or execute this script in a master node | |
# sudo setup_slurm_accounting_parallelcluster.sh | |
# | |
# Note1 Root password for MariaDB: | |
# CAUTION! Root password for MariaDB is not set by default. | |
# You should set a password for root after executing this script. | |
# | |
# Note2 Data Persistency:
# In this script, MariaDB on the Master node is used for recording accounting history.
# However, for a production cluster, we recommend using an external database service for data persistency, e.g. RDS MySQL/MariaDB.
# For that purpose, you could launch RDS and change the database settings in this script.
# It could be helpful to use a different cluster name for each cluster so that its records can be identified.
# | |
# Reference: | |
# - https://slurm.schedmd.com/accounting.html | |
# - https://slurm.schedmd.com/sacct.html | |
# | |
# | |
# set val
#
# Load the ParallelCluster node settings. The node-type dispatch at the end of
# this script reads cfn_node_type, which is presumably defined by this file --
# verify against the cluster's cfnconfig.
. "/etc/parallelcluster/cfnconfig"
SLURM_PATH=/opt/slurm
SLURM_CLUSTER=parallelcluster
SLURM_ACCOUNT=aws
SLURM_USER=centos
# Short host name (everything before the first dot) from the EC2 instance
# metadata service.
#   -f  : an HTTP error yields empty output instead of an error page that
#         would otherwise be written into slurm.conf/slurmdbd.conf as a host
#   -sS : no progress meter, but still report errors on stderr
#   timeouts keep the script from hanging if the endpoint is unreachable
HOSTNAME=$(curl -fsS --connect-timeout 5 --max-time 10 http://169.254.169.254/latest/meta-data/hostname | sed -e 's/\..*//')
# The pipeline's status is sed's, so a curl failure only shows up as an empty
# value; fall back to the local short host name in that case.
if [[ -z "${HOSTNAME}" ]]; then
  HOSTNAME=$(hostname -s)
fi
DB_HOSTNAME=${HOSTNAME}
# Password of the 'slurm' database user. Can be supplied through the
# environment; the historical default is kept for backward compatibility and
# should be changed for anything but a throw-away cluster.
DB_USER_PASSWORD=${DB_USER_PASSWORD:-password}
DB_ADMIN_USERNAME=root
# DB_ADMIN_PASSWORD=
SetMariaDB () {
  # Install the MariaDB client and server packages with yum, then register
  # the service to start at boot and bring it up immediately.
  local db_unit=mariadb.service
  local action

  yum install mariadb mariadb-server -y

  # "enable" first, then "start" -- same order as two explicit calls.
  for action in enable start; do
    systemctl "${action}" "${db_unit}"
  done
}
SetSlurmAccounting () {
  # Enable Slurm accounting backed by slurmdbd and a local MySQL/MariaDB:
  #   1. create the 'slurm' DB user and the slurm_acct_db database
  #   2. append the accounting settings to slurm.conf
  #   3. write slurmdbd.conf (readable by the slurm user only)
  #   4. start slurmdbd, wait until it accepts connections, restart slurmctld
  #
  # Globals read:
  #   SLURM_PATH, HOSTNAME, DB_HOSTNAME, DB_ADMIN_USERNAME, DB_USER_PASSWORD
  #   SLURMDBD_WAIT_SECONDS (optional) - max seconds to wait for slurmdbd to
  #     listen before restarting slurmctld; defaults to 60.
  # Outputs:
  #   a warning on stderr if slurmdbd could not be confirmed as listening.
  local slurm_conf="${SLURM_PATH}/etc/slurm.conf"
  local slurmdbd_conf="${SLURM_PATH}/etc/slurmdbd.conf"
  local slurmdbd_log=/var/log/slurmdbd.log
  # slurmdbd's default DbdPort; the slurmdbd.conf written below does not
  # override it.
  local slurmdbd_port=6819
  local wait_limit="${SLURMDBD_WAIT_SECONDS:-60}"
  local waited=0
  local dbd_ready=0

  #
  # initialize DB and DB user
  #
  # NOTE: the password is part of the command line, so it is visible in the
  # shell trace when the script runs with -x.
  mysql -u "${DB_ADMIN_USERNAME}" -e "create user 'slurm'@'localhost' identified by '${DB_USER_PASSWORD}'; grant all on slurm_acct_db.* TO 'slurm'@'localhost'; create database slurm_acct_db;"

  #
  # set up slurm.conf
  #
  cat << EOS >> "${slurm_conf}"
# for Accounting
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=${HOSTNAME}
JobAcctGatherType=jobacct_gather/linux
JobAcctGatherFrequency=30
EOS

  #
  # set up slurmdbd.conf
  #
  # The file contains StoragePass, so create it under a restrictive umask
  # (subshell keeps the umask change local) ...
  (
    umask 077
    cat << EOS > "${slurmdbd_conf}"
ArchiveEvents=yes
ArchiveJobs=yes
ArchiveResvs=yes
ArchiveSteps=no
ArchiveSuspend=no
ArchiveTXN=no
ArchiveUsage=no
AuthInfo=/var/run/munge/munge.socket.2
AuthType=auth/munge
DbdHost=${DB_HOSTNAME}
DebugLevel=info
PurgeEventAfter=1month
PurgeJobAfter=12month
PurgeResvAfter=1month
PurgeStepAfter=1month
PurgeSuspendAfter=1month
PurgeTXNAfter=12month
PurgeUsageAfter=24month
LogFile=/var/log/slurmdbd.log
SlurmUser=slurm
StoragePass=${DB_USER_PASSWORD}
StorageType=accounting_storage/mysql
StorageUser=slurm
EOS
  )
  # ... and force mode/owner explicitly, because a pre-existing file keeps
  # its old permissions when it is merely truncated.
  chmod 600 "${slurmdbd_conf}"
  chown slurm "${slurmdbd_conf}"

  #
  # set up slurmdbd.log file
  #
  touch "${slurmdbd_log}"
  chown slurm "${slurmdbd_log}"

  #
  # restart slurm daemon
  #
  "${SLURM_PATH}/sbin/slurmdbd"

  # Wait until slurmdbd accepts TCP connections before restarting slurmctld;
  # otherwise clients can race its start-up and get "Connection refused".
  # The probe runs in a subshell so the descriptor is closed right away.
  while (( waited < wait_limit )); do
    if (exec 3<>"/dev/tcp/${DB_HOSTNAME}/${slurmdbd_port}") 2>/dev/null; then
      dbd_ready=1
      break
    fi
    sleep 1
    waited=$(( waited + 1 ))
  done
  if (( dbd_ready == 0 )); then
    # Best effort, as before: warn and carry on instead of aborting.
    echo "warning: could not confirm slurmdbd is listening on ${DB_HOSTNAME}:${slurmdbd_port} after ${wait_limit}s" >&2
  fi

  systemctl stop slurmctld
  systemctl start slurmctld
  # Presumably gives slurmctld time to come up before any follow-up
  # sacctmgr commands -- kept from the original script.
  sleep 10

  #
  # set up sacctmgr
  #
  #${SLURM_PATH}/bin/sacctmgr add cluster ${SLURM_CLUSTER} -i
  #${SLURM_PATH}/bin/sacctmgr add account ${SLURM_ACCOUNT} -i
  #${SLURM_PATH}/bin/sacctmgr add user ${SLURM_USER} Account=${SLURM_ACCOUNT} -i
}
# Run the role-specific steps for this node. cfn_node_type is not assigned in
# this script; presumably it comes from the cfnconfig file sourced earlier.
# Any value other than the two handled below results in no action.
if [[ "${cfn_node_type}" == "MasterServer" ]]; then
  echo "Post Script on Master"
  SetMariaDB
  SetSlurmAccounting
elif [[ "${cfn_node_type}" == "ComputeFleet" ]]; then
  echo "Post Script on Compute"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I get
error: slurm_persist_conn_open_without_init: failed to open persistent connection to host:<ip>:6819: Connection refused
when running the sacctmgr
command in the post-install. Any idea why this would happen? Sometimes it works, most times it doesn't.
However, when I log in to the cluster after install, there are no issues with sacctmgr.