High Availability Cluster Cheatsheet i.e. Pacemaker and Corosync
# As root add to your ~/.bashrc or ~/.bash_aliases
export ListInstances=$(/usr/sap/hostctrl/exe/saphostctrl -function ListInstances | head -1)
export sid=$(echo "$ListInstances" | cut -d " " -f 5 | tr '[:upper:]' '[:lower:]')
export SID=$(echo "$sid" | tr '[:lower:]' '[:upper:]')
export Instance=$(echo "$ListInstances" | cut -d " " -f 7)
alias hgrep='history | grep $1'
alias tma='tmux attach -t $(tmux ls | grep -v atta | head -1 | cut -d " " -f 1)'
#alias pit='ssh pitunnel'
#alias killnode="echo 'b' > /proc/sysrq-trigger"
alias crmm='watch -n 1 crm_mon -1Arf'
#alias crmv='watch -n 1 /usr/local/bin/crmmv'
alias cdhb='cd /usr/lib/ocf/resource.d/heartbeat'
#alias vih='vim /usr/lib/ocf/resource.d/heartbeat/SAPHanaStart'
alias shr='watch -n 5 "SAPHanaSR-monitor --sid=${SID}"'
alias tm='tail -100f /var/log/messages | grep -v systemd'
alias tms='tail -1000f /var/log/messages | grep -s -E "Setting master-rsc_SAPHana_${SID}_HDB${Instance}|sr_register|WAITING4LPA|EXCLUDE as possible takeover node|SAPHanaSR|failed|${HOSTNAME}|PROMOTED|DEMOTED|UNDEFINED|master_walk|SWAIT|WaitforStopped|FAILED"'
alias tmss='tail -1000f /var/log/messages | grep -v systemd | grep -s -E "secondary with sync status|Setting master-rsc_SAPHana_${SID}_HDB${Instance}|sr_register|WAITING4LPA|EXCLUDE as possible takeover node|SAPHanaSR|failed|${HOSTNAME}|PROMOTED|DEMOTED|UNDEFINED|master_walk|SWAIT|WaitforStopped|FAILED"'
alias tmm='tail -1000f /var/log/messages | grep -s -E "Setting master-rsc_SAPHana_${SID}_HDB${Instance}|sr_register|WAITING4LPA|PROMOTED|DEMOTED|UNDEFINED|master_walk|SWAIT|WaitforStopped|FAILED|LPT|SOK|SFAIL|SAPHanaSR-mon" | grep -v systemd'
alias tmsl='tail -1000f /var/log/messages | grep -s -E "Setting master-rsc_SAPHana_${SID}_HDB${Instance}|sr_register|WAITING4LPA|PROMOTED|DEMOTED|UNDEFINED|master_walk|SWAIT|WaitforStopped|FAILED|LPT|SOK|SFAIL|SAPHanaSR-mon"'
alias cglo='su - ${sid}adm -c cglo'
alias gtr='su - ${sid}adm -c gtr'
alias hdb='su - ${sid}adm -c hdb'
alias hdbi='su - ${sid}adm -c hdbi'
alias hri='su - ${sid}adm -c hri'
alias hris='su - ${sid}adm -c hris'
alias lhc='su - ${sid}adm -c lhc'
alias sgsi='su - ${sid}adm -c sgsi'
alias srm='su - ${sid}adm -c srm'
alias srs='su - ${sid}adm -c srs'
alias vglo='su - ${sid}adm -c vglo'
alias sapstart='su - ${sid}adm -c sapstart'
alias sapstop='su - ${sid}adm -c sapstop'
alias srstate='su - ${sid}adm -c srstate'
#alias python='/usr/sap/${SID}/HDB${Instance}/exe/Python/bin/python'

High Availability Cluster Cheatsheet

Corosync

corosync-keygen
scp /etc/corosync/corosync.conf root@node02:/etc/corosync/
systemctl restart corosync
corosync-cmapctl | grep members
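
corosync-keygen writes the shared key to /etc/corosync/authkey; the same key must be present on every node, so copy it along with corosync.conf (node02 as in the line above):

scp -p /etc/corosync/authkey root@node02:/etc/corosync/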

CRM Shell aka crmsh

crmsh configuration

/etc/crm/crm.conf
~/.config/crm/crm.conf
~/.crm.rc
crm options user hacluster
cat /etc/sudoers
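
The crm.conf files are INI-style; as a rough sketch (check the crmsh documentation for the full option list), the crm options user hacluster setting above corresponds to something like:

[core]
user = hacluster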

Cluster Status

crm status
crm corosync status
crm_mon -1
crm_mon -1Ar
crm cluster health
watch -n 1 -c crm status

Corosync Configuration

crm corosync diff
cat /etc/corosync/corosync.conf
/etc/corosync/authkey
cat /etc/resolv.conf
cat /etc/hosts
cat /etc/ntp.conf

Cluster Manager Configuration

crm configure show
crm configure show | grep cli-

cibadmin -Q

Show raw configuration:

crm configure show xml

Show options

crm configure show cib-bootstrap-options
crm configure show rsc-options
crm configure show op-options
crm configure show SAPHanaSR

Upgrade CIB Syntax Version

Sometimes new features are only available with the latest CIB syntax version. When you upgrade to a new product version, your CIB syntax version will not be upgraded by default.

Check your version with:

cibadmin -Q | grep validate-with
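
The schema version is the validate-with attribute on the cib element; the exact value differs per installation, but the output looks roughly like this:

<cib crm_feature_set="3.4.1" validate-with="pacemaker-3.2" ...>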

Upgrade to the latest CIB syntax version with:

cibadmin --upgrade --force

Enable use of ACLs in cluster

crm configure property enable-acl=true

Now all users for whom you want to modify access rights with ACLs must belong to the haclient group.
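
Add those users to haclient first (the username is a placeholder):

usermod -a -G haclient <username>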

Resource Agents

crm ra classes

crm ra list ocf pacemaker
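
To show the parameters and actions of a single agent (IPaddr2 is only used as an example here):

crm ra info ocf:heartbeat:IPaddr2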

Fencing aka Stonith Resource

crm configure property stonith-enabled=true
crm configure property stonith-enabled=false

PCS

dnf install -y firewalld corosync pacemaker pcs fence-agents-all
passwd hacluster
firewall-cmd --state
firewall-cmd --permanent --add-service=high-availability
firewall-cmd --add-service=high-availability
systemctl enable --now pcsd.service
pcs cluster auth <host01> <host02> <host03>
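
With the pcs 0.9.x syntax used above (newer pcs releases use pcs host auth and a slightly different pcs cluster setup syntax), the typical next steps are roughly the following; the cluster name and hosts are placeholders:

pcs cluster setup --name hacluster01 <host01> <host02> <host03>
pcs cluster start --all
pcs cluster enable --all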

Cluster Status

pcs status

crm_mon -1

Cluster Manager Configuration

pcs config
pcs cluster cib

Upgrade CIB:

pcs cluster cib-upgrade

Cluster Properties

pcs property list --all
pcs property list --defaults

pcs property show --defaults
pcs property show cluster-infrastructure

pcs property show <property>

Resource Agents

pcs resource standards

pcs resource agents ocf
pcs resource agents lsb
pcs resource agents service
pcs resource agents stonith
pcs resource agents

pcs resource agents ocf:pacemaker
pcs resource show
pcs resource show --full
pcs resource show ClusterIP
pcs resource config
pcs resource describe IPaddr2
pcs resource describe ocf:heartbeat:IPaddr2
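
For reference, a resource like the ClusterIP queried above could be created along these lines (the address, netmask and monitor interval are placeholders):

pcs resource create ClusterIP ocf:heartbeat:IPaddr2 ip=192.168.0.120 cidr_netmask=24 op monitor interval=30s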

Fencing aka Stonith Resource

pcs stonith show
pcs stonith show --full
pcs stonith list

ls -lA /usr/sbin/fence*

pcs stonith describe fence_aws
pcs property set startup-fencing=true
pcs property set concurrent-fencing=true
pcs property set stonith-action=reboot
pcs property set stonith-timeout=300s
pcs property set stonith-max-attempts=10
pcs property set stonith-enabled=true
pcs property set stonith-enabled=false
pcs property set maintenance-mode=false
pcs property set no-quorum-policy=ignore

# Create an opt-in cluster, which prevents resources from running anywhere by default
pcs property set symmetric-cluster=false

# Create an opt-out cluster, which allows resources to run everywhere by default
pcs property set symmetric-cluster=true
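
With symmetric-cluster=false, every resource then needs at least one location constraint before it will start anywhere; a sketch with placeholder resource, node and score:

pcs constraint location ClusterIP prefers node01=200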

pcs property set shutdown-escalation=20min
#pcs stonith create <name> <fencing_agent> [parameters]
pcs stonith create rhev-fence fence_rhevm ipaddr=engine.local.net ipport=443 ssl_insecure=1 ssl=1 inet4_only=1 login=admin@internal passwd=PASSWD pcmk_host_map="clu01:clu01.local.net;clu02:clu02.local.net;clu03:clu03.local.net" pcmk_host_check=static-list pcmk_host_list="clu01.local.net,clu02.local.net,clu03.local.net" power_wait=3 op monitor interval=90s
pcs stonith update rhev-fence fence_rhevm api_path=/ovirt-engine/api disable_http_filter=1 ipaddr=engine.local.net ipport=443 ssl_insecure=1 ssl=1 inet4_only=1 login=admin@internal passwd=PASSWORD pcmk_host_map="clu01.local.net:clu01.local.net;clu02.local.net:clu02.local.net;clu03.local.net:clu03.local.net" pcmk_host_check=static-list pcmk_host_list="clu01.local.net,clu02.local.net,clu03.local.net" power_wait=3 op monitor interval=90s
pcs stonith update rhev-fence port=nodeb
pcs stonith show rhev-fence
fence_rhevm -o status -a engine.local.net --username=admin@internal --password=PASSWORD --ipport=443  -n clu03.local.net -z --ssl-insecure  --disable-http-filter 
pcs stonith fence hostname

Note: When Pacemaker's policy engine creates a transition with a fencing request, the stonith daemon uses the timeout value that is passed by the transition engine. This matches the value of the stonith-timeout cluster property.

When fencing is triggered manually via stonith_admin or pcs stonith fence, the default timeout implemented in stonith_admin (120s) is used instead.
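
To control the timeout of a manual fence, pass it explicitly to stonith_admin (the node name is a placeholder):

stonith_admin --reboot node02 --timeout 300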

pcs stonith delete fence_noded

Cluster Notifications

pcs resource create webmail MailTo email=user@domain.com subject="CLUSTER-NOTIFICATIONS" --group=firstweb
vim /usr/local/bin/crm_notify.sh
pcs resource create mailme ClusterMon extra_options="-e mail@domain.com -E /usr/local/bin/crm_notify.sh" --clone
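
A minimal sketch of what /usr/local/bin/crm_notify.sh might contain, assuming the ClusterMon external-agent interface (ClusterMon exports the event details as CRM_notify_* environment variables before calling the script); make it executable with chmod +x and adapt the log target to taste:

#!/bin/sh
# Called by the ClusterMon resource via -E; append each cluster event to a log file.
LOGFILE=/var/log/crm_notify.log
echo "$(date '+%F %T') node=${CRM_notify_node} rsc=${CRM_notify_rsc} task=${CRM_notify_task} desc=${CRM_notify_desc} rc=${CRM_notify_rc} target_rc=${CRM_notify_target_rc} status=${CRM_notify_status}" >> "${LOGFILE}"
exit 0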

SAPHanaSR

SAPHanaSR-showAttr
SAPHanaSR-monitor

ClusterTools2

cs_clusterstate -i
cs_show_error_patterns -c | grep -v "=.0"
cs_show_memory
cs_sum_base_config

SAP HANA:

cs_show_hana_autofailover --all
cs_show_hana_info --info $SID $nr

SAP

SAP Startup Service Framework

ps aux | grep <SID>adm | grep sapstartsrv

sapcontrol

sapcontrol -nr $nr -function GetSystemInstanceList
sapcontrol -nr $nr -function HAGetFailoverConfig
sapcontrol -nr $nr -function HACheckFailoverConfig
#sapcontrol -nr $nr -function StopService
#sapcontrol -nr $nr -function StartService <SID> 
#sapcontrol -nr $nr -function StartSystem
#sapcontrol -nr $nr -function StopSystem ALL

SAP HANA System Replication (SR)

HDBSettings.sh systemOverview.py
HDBSettings.sh systemReplicationStatus.py; echo RC:$?
HDBSettings.sh landscapeHostConfiguration.py; echo RC:$?
hdbnsutil -sr_state
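
hdbnsutil -sr_state only reports the current replication role; registering a secondary is done with sr_register. A rough sketch, run as the <sid>adm user on the node being registered (every value is a placeholder):

hdbnsutil -sr_register --remoteHost=<primary-host> --remoteInstance=<instance-nr> --replicationMode=sync --operationMode=logreplay --name=<site-name>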
Example corosync.conf

# Please read the corosync.conf.5 manual page
# https://github.com/corosync/corosync/blob/main/conf/corosync.conf.example
totem {
    # This specifies the version of the configuration file.
    # Currently the only valid version for this directive is 2.
    version: 2
    # Corosync itself works without a cluster name, but DLM needs one.
    # The cluster name is also written into the VG metadata of newly
    # created shared LVM volume groups, if lvmlockd uses DLM locking.
    # It is also used for computing mcastaddr, unless overridden below.
    cluster_name: hacluster
    # Specifies version of IP to use for communication. Value can be one of ipv4 or ipv6.
    # Default (if unspecified) is ipv4.
    ip_version: ipv4
    # How long before declaring a token lost (ms)
    token: 30000
    # How long to wait for consensus to be achieved before starting a new round of membership configuration (ms)
    consensus: 36000
    # How many token retransmits should be attempted before forming a new configuration.
    # Also used for token_retransmit and hold calculations.
    token_retransmits_before_loss_const: 10
    # Allows Corosync to hold token by representative when there is too much retransmit messages.
    # This allows network to process increased load without overloading it. Used mechanism is
    # same as described for hold directive.
    #
    # Some deployments may prefer to never hold token when there is retransmit messages.
    # If so, option should be set to yes.
    cancel_token_hold_on_retransmit: no
    # How long to wait for join messages in the membership protocol (ms)
    join: 1000
    # Turn off the virtual synchrony filter
    #vsftype: none
    # Number of messages that may be sent by one processor on receipt of the token
    max_messages: 20
    # Limit generated nodeids to 31-bits (positive signed integers)
    clear_node_high_bit: yes
    # crypto_cipher and crypto_hash: Used for mutual node authentication.
    # If you choose to enable this, then do remember to create a shared
    # secret with "corosync-keygen".
    #
    # Enabling crypto_cipher, requires also enabling of crypto_hash.
    #
    # Valid values for crypto_cipher are none (no encryption), aes256, aes192,
    # aes128 and 3des.
    crypto_cipher: aes256
    # Enabling crypto_cipher, requires also enabling of crypto_hash.
    # Valid values for crypto_hash are none (no authentication), md5, sha1,
    # sha256, sha384 and sha512.
    crypto_hash: sha256
    # crypto_cipher and crypto_hash should be used instead of deprecated
    # secauth parameter.
    secauth: off
    # How many threads to use for encryption/decryption
    #threads: 0
    # Optionally assign a fixed node id (integer)
    #nodeid: 1234
    # This specifies the mode of redundant ring, which may be none, active, or passive.
    #rrp_mode: passive
    # For UDPU transport, an interface section is not needed and it is recommended that
    # the nodelist is used to define cluster nodes.
    transport: udpu
    #interface {
    #ringnumber: 0
    # This is normally the *network* address of the
    # interface to bind to. This ensures that you can use
    # identical instances of this configuration file
    # across all your cluster nodes, without having to
    # modify this option.
    #bindnetaddr: 192.168.1.0
    # However, if you have multiple physical network
    # interfaces configured for the same subnet, then the
    # network address alone is not sufficient to identify
    # the interface Corosync should bind to. In that case,
    # configure the *host* address of the interface
    # instead:
    #bindnetaddr: 192.168.1.10
    #
    #member {
    # memberaddr: 192.168.1.10
    #}
    #member {
    # memberaddr: 192.168.1.20
    #}
    # When selecting a multicast address, consider RFC
    # 2365 (which, among other things, specifies that
    # 239.255.x.x addresses are left to the discretion of
    # the network administrator). Do not reuse multicast
    # addresses across multiple Corosync clusters sharing
    # the same network.
    #mcastaddr: 239.255.1.1
    # Corosync uses the port you specify here for UDP
    # messaging, and also the immediately preceding
    # port. Thus if you set this to 5405, Corosync sends
    # messages over UDP ports 5405 and 5404.
    #mcastport: 5405
    #ttl: 1
    #}
}
nodelist {
    # For UDPU, every node that should be a member of the membership must be specified.
    node {
        # Hostname of the node, used by pacemaker and not by corosync.
        #name: node01
        # This configuration option is optional when using IPv4 and required when using IPv6.
        nodeid: 1
        # Address of first link
        ring0_addr: <ip or hostname e.g. node01>
        # When knet transport is used it's possible to define up to 8 links
        #ring1_addr:
    }
    node {
        # Hostname of the node, used by pacemaker and not by corosync.
        #name: node02
        # This configuration option is optional when using IPv4 and required when using IPv6.
        nodeid: 2
        # Address of first link
        ring0_addr: <ip or hostname e.g. node02>
        # When knet transport is used it's possible to define up to 8 links
        #ring1_addr:
    }
}
logging {
    # Log the source file and line where messages are being
    # generated. When in doubt, leave off. Potentially useful for
    # debugging.
    fileline: off
    # Log to standard error. When in doubt, set to no. Useful when
    # running in the foreground (when invoking "corosync -f")
    #to_stderr: yes
    # Log to a log file. When set to "no", the "logfile" option
    # must not be set.
    to_logfile: yes
    logfile: /var/log/cluster/corosync.log
    # Log to the system log daemon. When in doubt, set to yes.
    to_syslog: yes
    #syslog_facility: daemon
    # Log debug messages (very verbose). When in doubt, leave off.
    debug: off
    # Log messages with time stamps. When in doubt, set to on
    # (unless you are only logging to syslog, where double
    # timestamps can be annoying).
    timestamp: on
    logger_subsys {
        subsys: QUORUM
        debug: off
    }
}
quorum {
    # Enable and configure quorum subsystem (default: off)
    # see also corosync.conf.5 and votequorum.5
    provider: corosync_votequorum
    expected_votes: 2
    two_node: 1
}
# Aliases for the <sid>adm user environment (these rely on SAPSYSTEMNAME and TINSTANCE, which are set in the <sid>adm shell)
alias hgrep='history | grep $1'
alias tm='tail -100f /var/log/messages | grep -v systemd'
alias tms='tail -1000f /var/log/messages | grep -s -E "Setting master-rsc_SAPHana_${SAPSYSTEMNAME}_HDB${TINSTANCE}|sr_register|WAITING4LPA|EXCLUDE as possible takeover node|SAPHanaSR|failed|${HOSTNAME}|PROMOTED|DEMOTED|UNDEFINED|master_walk|SWAIT|WaitforStopped|FAILED"'
alias tmsl='tail -1000f /var/log/messages | grep -s -E "Setting master-rsc_SAPHana_${SAPSYSTEMNAME}_HDB${TINSTANCE}|sr_register|WAITING4LPA|PROMOTED|DEMOTED|UNDEFINED|master_walk|SWAIT|WaitforStopped|FAILED|LPT"'
alias sapstart='sapcontrol -nr ${TINSTANCE} -function StartSystem HDB; hdbi'
alias sapstop='sapcontrol -nr ${TINSTANCE} -function StopSystem HDB; hdbi'
alias sgsi='watch sapcontrol -nr ${TINSTANCE} -function GetSystemInstanceList'
alias spl='watch sapcontrol -nr ${TINSTANCE} -function GetProcessList'
alias splh='watch "sapcontrol -nr ${TINSTANCE} -function GetProcessList | grep hdbdaemon"'
alias srm='watch "hdbnsutil -sr_state --sapcontrol=1 |grep site.*Mode"'
alias srs="watch -n 5 'python /usr/sap/$SAPSYSTEMNAME/HDB${TINSTANCE}/exe/python_support/systemReplicationStatus.py; echo Status \$?'"
alias srstate='watch -n 10 hdbnsutil -sr_state'
alias hri='hdbcons -e hdbindexserver "replication info"'
alias hris='hdbcons -e hdbindexserver "replication info" | grep -E "SiteID|ReplicationStatus_"'
alias hdb='watch -n 5 "sapcontrol -nr ${TINSTANCE} -function GetProcessList | grep -s -E hdbdaemon\|hdbnameserver\|hdbindexserver"'
alias hdbi='watch -n 5 "sapcontrol -nr ${TINSTANCE} -function GetProcessList | grep -s -E hdbdaemon\|hdbnameserver\|hdbindexserver; sapcontrol -nr ${TINSTANCE} -function GetSystemInstanceList"'
alias vglo="vim /usr/sap/$SAPSYSTEMNAME/SYS/global/hdb/custom/config/global.ini"
alias vgloh="vim /hana/shared/${SAPSYSTEMNAME}/HDB${TINSTANCE}/${HOSTNAME}/global.ini"
alias gtr='watch -n 10 /usr/sap/$SAPSYSTEMNAME/HDB${TINSTANCE}/exe/Python/bin/python /usr/sap/$SAPSYSTEMNAME/HDB${TINSTANCE}/exe/python_support/getTakeoverRecommendation.py --sapcontrol=1'
alias lhc='/usr/sap/$SAPSYSTEMNAME/HDB${TINSTANCE}/exe/Python/bin/python /usr/sap/$SAPSYSTEMNAME/HDB${TINSTANCE}/exe/python_support/landscapeHostConfiguration.py; echo $?'