Creating a VMWare pacemaker cluster to run SQL Server Availability Groups
######################################################################################################################
#
# Creating a pacemaker cluster and availability group on VMWare Virtual Machines - Andrew Pruski
# @dbafromthecold
# dbafromthecold@gmail.com
#
######################################################################################################################
# Installing SQL Server
######################################################################################################################
# this is for a three node cluster running Ubuntu 20.04 in VMWare
# install sql server
wget -qO- https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add -
sudo add-apt-repository "$(wget -qO- https://packages.microsoft.com/config/ubuntu/20.04/mssql-server-preview.list)"
sudo apt-get update
sudo apt-get install -y mssql-server
# install sql server ha components
sudo apt-get install mssql-server-ha
# configure sql server
sudo /opt/mssql/bin/mssql-conf setup
sudo /opt/mssql/bin/mssql-conf set sqlagent.enabled true
sudo /opt/mssql/bin/mssql-conf set hadr.hadrenabled 1
sudo systemctl restart mssql-server
systemctl status mssql-server
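# optional sanity check - SERVERPROPERTY('IsHadrEnabled') should return 1 after the restart
# (assumes the sqlcmd tools are installed and sa authentication is used; <SA-PASSWORD> is a placeholder)
sqlcmd -S localhost -U sa -P '<SA-PASSWORD>' -Q "SELECT SERVERPROPERTY('IsHadrEnabled');"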
######################################################################################################################
# Configuring the nodes
######################################################################################################################
# add entries for servers into /etc/hosts
sudo vim /etc/hosts
<NODE1-IP-ADDRESS> <NODE1>
<NODE2-IP-ADDRESS> <NODE2>
<NODE3-IP-ADDRESS> <NODE3>
<LISTENER-IP-ADDRESS> <LISTENER NAME>
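# for example (hypothetical addresses and hostnames, adjust for your environment): -
# 192.168.10.11 sqlnode1
# 192.168.10.12 sqlnode2
# 192.168.10.13 sqlnode3
# 192.168.10.20 sqllistener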
# install cluster components
sudo apt-get install -y pacemaker pacemaker-cli-utils crmsh resource-agents fence-agents csync2
# run on primary server
sudo corosync-keygen
# copy to other servers
sudo scp /etc/corosync/authkey ubuntu@<NODE2>:~
sudo scp /etc/corosync/authkey ubuntu@<NODE3>:~
# run on other servers
sudo mv authkey /etc/corosync/authkey
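# optionally confirm the key matches on every node (checksums should be identical)
sudo md5sum /etc/corosync/authkey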
# run on primary server
sudo vim /etc/corosync/corosync.conf
# example file: -
totem {
    version: 2
    cluster_name: <CLUSTER-NAME>
    transport: udpu
    crypto_cipher: none
    crypto_hash: none
}
logging {
    fileline: off
    to_stderr: yes
    to_logfile: yes
    logfile: /var/log/corosync/corosync.log
    to_syslog: yes
    debug: off
    logger_subsys {
        subsys: QUORUM
        debug: off
    }
}
quorum {
    provider: corosync_votequorum
}
nodelist {
    node {
        name: <NODE1>
        nodeid: 1
        ring0_addr: <NODE1-IP-ADDRESS>
    }
    node {
        name: <NODE2>
        nodeid: 2
        ring0_addr: <NODE2-IP-ADDRESS>
    }
    node {
        name: <NODE3>
        nodeid: 3
        ring0_addr: <NODE3-IP-ADDRESS>
    }
}
# copy to other servers
sudo scp /etc/corosync/corosync.conf ubuntu@<NODE2>:~
sudo scp /etc/corosync/corosync.conf ubuntu@<NODE3>:~
# move conf file on other servers
sudo mv corosync.conf /etc/corosync/
# run on all servers
sudo systemctl restart pacemaker corosync
# confirm status of cluster
sudo crm status
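# optionally confirm all three nodes appear in the corosync membership
sudo corosync-cmapctl | grep members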
######################################################################################################################
# create availability group in SQL Server
######################################################################################################################
# enable the AlwaysOn_health extended events session on all servers
ALTER EVENT SESSION AlwaysOn_health ON SERVER WITH (STARTUP_STATE=ON);
GO
# create master key and cert on primary node
CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'PASSWORD';
CREATE CERTIFICATE dbm_certificate WITH SUBJECT = 'dbm';
BACKUP CERTIFICATE dbm_certificate
TO FILE = '/var/opt/mssql/data/dbm_certificate.cer'
WITH PRIVATE KEY (
FILE = '/var/opt/mssql/data/dbm_certificate.pvk',
ENCRYPTION BY PASSWORD = 'PASSWORD'
);
# copy cert backups to other nodes
sudo su
cd /var/opt/mssql/data
scp dbm_certificate.* ubuntu@<NODE2>:~
scp dbm_certificate.* ubuntu@<NODE3>:~
exit
# move cert backups to /var/opt/mssql/data
sudo su
cp /home/ubuntu/dbm_certificate.* /var/opt/mssql/data/
chown mssql:mssql /var/opt/mssql/data/dbm_certificate.*
exit
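# optionally confirm the cert backups are in place and owned by mssql on each node
ls -l /var/opt/mssql/data/dbm_certificate.*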
# create master key and certificate from backups on other nodes
CREATE MASTER KEY ENCRYPTION BY PASSWORD = 'PASSWORD';
CREATE CERTIFICATE dbm_certificate
FROM FILE = '/var/opt/mssql/data/dbm_certificate.cer'
WITH PRIVATE KEY (
FILE = '/var/opt/mssql/data/dbm_certificate.pvk',
DECRYPTION BY PASSWORD = 'PASSWORD'
);
# create hadr endpoints on all nodes
CREATE ENDPOINT [Hadr_endpoint]
AS TCP (LISTENER_PORT = 5022)
FOR DATABASE_MIRRORING (
ROLE = ALL,
AUTHENTICATION = CERTIFICATE dbm_certificate,
ENCRYPTION = REQUIRED ALGORITHM AES
);
ALTER ENDPOINT [Hadr_endpoint] STATE = STARTED;
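# optional check that the mirroring endpoint is started and listening on 5022
# (assumes sqlcmd and sa authentication, as above)
sqlcmd -S localhost -U sa -P '<SA-PASSWORD>' -Q "SELECT name, state_desc, port FROM sys.tcp_endpoints WHERE type_desc = 'DATABASE_MIRRORING';"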
# create login for pacemaker on all nodes
USE [master]
GO
CREATE LOGIN [pacemakerLogin] WITH PASSWORD = N'PASSWORD';
ALTER SERVER ROLE [sysadmin] ADD MEMBER [pacemakerLogin];
GO
# create password file for pacemaker on all nodes
echo 'pacemakerLogin' >> ~/pacemaker-passwd
echo 'PASSWORD' >> ~/pacemaker-passwd
sudo mv ~/pacemaker-passwd /var/opt/mssql/secrets/passwd
sudo chown root:root /var/opt/mssql/secrets/passwd
sudo chmod 400 /var/opt/mssql/secrets/passwd
# create the availability group on the primary node
CREATE AVAILABILITY GROUP [ag1]
WITH (CLUSTER_TYPE = EXTERNAL)
FOR REPLICA ON
N'<NODE1>'
WITH (
ENDPOINT_URL = N'tcp://<NODE1>:5022',
AVAILABILITY_MODE = SYNCHRONOUS_COMMIT,
FAILOVER_MODE = EXTERNAL,
SEEDING_MODE = AUTOMATIC
),
N'<NODE2>'
WITH (
ENDPOINT_URL = N'tcp://<NODE2>:5022',
AVAILABILITY_MODE = SYNCHRONOUS_COMMIT,
FAILOVER_MODE = EXTERNAL,
SEEDING_MODE = AUTOMATIC
),
N'<NODE3>'
WITH(
ENDPOINT_URL = N'tcp://<NODE3>:5022',
AVAILABILITY_MODE = SYNCHRONOUS_COMMIT,
FAILOVER_MODE = EXTERNAL,
SEEDING_MODE = AUTOMATIC
);
ALTER AVAILABILITY GROUP [ag1] GRANT CREATE ANY DATABASE;
GO
# join other nodes to the availability group
ALTER AVAILABILITY GROUP [ag1] JOIN WITH (CLUSTER_TYPE = EXTERNAL);
ALTER AVAILABILITY GROUP [ag1] GRANT CREATE ANY DATABASE;
# grant permissions to the pacemaker login on all nodes
GRANT ALTER, CONTROL, VIEW DEFINITION ON AVAILABILITY GROUP::ag1 TO [pacemakerLogin];
GRANT VIEW SERVER STATE TO [pacemakerLogin];
GO
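# optional check from the primary that all replicas are connected once the secondaries have joined
# (assumes sqlcmd and sa authentication, as above)
sqlcmd -S localhost -U sa -P '<SA-PASSWORD>' -Q "SELECT ar.replica_server_name, ars.role_desc, ars.connected_state_desc FROM sys.availability_replicas ar JOIN sys.dm_hadr_availability_replica_states ars ON ar.replica_id = ars.replica_id;"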
######################################################################################################################
# create availability group resource in pacemaker
######################################################################################################################
# disable stonith for the cluster - this will be set up later
sudo crm configure property stonith-enabled=false
# create the availability group resource via the crmsh interactive shell
sudo crm
configure
primitive ag1_cluster \
ocf:mssql:ag \
params ag_name="ag1" \
meta failure-timeout=60s \
op start timeout=60s \
op stop timeout=60s \
op promote timeout=60s \
op demote timeout=10s \
op monitor timeout=60s interval=10s \
op monitor timeout=60s on-fail=demote interval=11s role="Master" \
op monitor timeout=60s interval=12s role="Slave" \
op notify timeout=60s
ms ms-ag1 ag1_cluster \
meta master-max="1" master-node-max="1" clone-max="3" \
clone-node-max="1" notify="true"
commit
# check the status of the availability group
sudo crm resource status ms-ag1
# check the cluster status
sudo crm status
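# optionally review the committed resource definition
sudo crm configure show ms-ag1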
######################################################################################################################
# test auto-seeding in availability group
######################################################################################################################
# create a database to test automatic seeding in the availability group
USE [master];
GO
CREATE DATABASE [testdatabase1];
GO
BACKUP DATABASE [testdatabase1] TO DISK = N'/var/opt/mssql/data/testdatabase1.bak';
BACKUP LOG [testdatabase1] TO DISK = N'/var/opt/mssql/data/testdatabase1.trn';
GO
ALTER AVAILABILITY GROUP [ag1] ADD DATABASE [testdatabase1];
GO
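# optional check on a secondary that automatic seeding created and synchronised the database
# (assumes sqlcmd and sa authentication, as above)
sqlcmd -S localhost -U sa -P '<SA-PASSWORD>' -Q "SELECT DB_NAME(database_id) AS db, synchronization_state_desc FROM sys.dm_hadr_database_replica_states;"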
######################################################################################################################
# create listener
######################################################################################################################
# create listener resource in pacemaker
sudo crm configure primitive virtualip \
ocf:heartbeat:IPaddr2 \
params ip=<LISTENER IP ADDRESS>
# confirm cluster status
sudo crm status
# create listener
ALTER AVAILABILITY GROUP [ag1]
ADD LISTENER N'<LISTENER NAME>' (
WITH IP ((N'<LISTENER IP ADDRESS>', N'<NETMASK>')),
PORT = 1433);
GO
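# optional test connection through the listener once the virtual IP resource is online
# (assumes sqlcmd and sa authentication - @@SERVERNAME should return the current primary)
sqlcmd -S <LISTENER NAME> -U sa -P '<SA-PASSWORD>' -Q "SELECT @@SERVERNAME;"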
# configure colocation and ordering constraints
sudo crm configure colocation ag-with-listener INFINITY: virtualip ms-ag1:Master
sudo crm configure order ag-before-listener Mandatory: ms-ag1:promote virtualip:start
# test failover of availability group
sudo crm resource move ms-ag1 <NODE2>
# confirm cluster status
sudo crm status
# view constraints in cluster
sudo crm resource constraints ms-ag1
# delete move constraint
sudo crm configure delete cli-prefer-ms-ag1
######################################################################################################################
# create STONITH resource
######################################################################################################################
# list stonith devices
crm ra list stonith
# test using fence_vmware_rest stonith devices
fence_vmware_rest -a <VSPHERE IP ADDRESS> -l <LOGIN> -p <PASSWORD> --ssl-insecure -z -o list | egrep "(<NODE1>|<NODE2>|<NODE3>)"
fence_vmware_rest -a <VSPHERE IP ADDRESS> -l <LOGIN> -p <PASSWORD> --ssl-insecure -z -o status -n <NODE1>
# create fence_vmware_rest stonith resource
sudo crm configure primitive fence_vmware stonith:fence_vmware_rest \
params \
ipaddr="<VSPHERE IP ADDRESS>" \
action=reboot \
login="<LOGIN>" \
passwd="<PASSWORD>" \
ssl=1 ssl_insecure=1 \
pcmk_reboot_timeout=900 \
power_timeout=60 \
op monitor \
interval=3600 \
timeout=120
# configure stonith properties
sudo crm configure property cluster-recheck-interval=2min
sudo crm configure property start-failure-is-fatal=true
sudo crm configure property stonith-timeout=900
sudo crm configure property concurrent-fencing=true
sudo crm configure property stonith-enabled=true
# delete stonith resource - whilst testing
#sudo crm configure property stonith-enabled=false
#sudo crm resource stop fence_vmware
#sudo crm configure delete fence_vmware
# confirm cluster status
sudo crm status
# test stonith
sudo crm cluster stop # run on target node
sudo crm status
sudo stonith_admin --reboot <NODE3>
sudo crm status
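# once fencing has been verified, bring cluster services back up on the rebooted node so it rejoins
sudo crm cluster start # run on the fenced node after it restarts
sudo crm status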