Skip to content

Instantly share code, notes, and snippets.

@mrjk
Forked from dansimau/pgprepmgr.sh
Last active October 16, 2015 21:12
Show Gist options
  • Save mrjk/2947e908980e04ab89b1 to your computer and use it in GitHub Desktop.
Save mrjk/2947e908980e04ab89b1 to your computer and use it in GitHub Desktop.
Wrapper for pgpools' pcp tools to help you manage your pgpool setup and postgresql cluster. See usage in the comment below.
#!/bin/bash
#
# pgpool-II replication manager
#
# Interfaces with pgpool's pcp command-line tools to provide access to common functions for managing
# load-balancing and failover.
#
# mrjk.78 at the famous google mail
# 2014-10-13
# dsimmons@squiz.co.uk
# 2011-08-28
#
# PCP configuration
pcp_host="127.0.0.1"
pcp_port="9898"
pcp_username="admin"
pcp_password="admin"
pcp_timeout="10"
# Health check uses psql to connect to each backend server. Specify options required to connect here
export PGPASSWORD=repmgr
psql_healthcheck_opts="-U repmgr"
# Default options to send to pcp commands
pcp_cmd_preamble="$pcp_timeout $pcp_host $pcp_port $pcp_username $pcp_password"
# Postgresql system user to connect to query repmgr
remote_user="postgres"
# NEW
repmgr_pg_root="root"
repmgr_pg_user="postgres"
repmgr_pg_data_dir="/var/lib/pgsql/9.4/data"
repmgr_bin_dir="/usr/pgsql-9.4/bin"
repmgr_conf="/etc/repmgr/9.4/repmgr.conf"
#
# Runs abitrary pcp command with the preamble
#
_pcp()
{
cmd=$1
shift
$cmd $pcp_cmd_preamble $*
}
#
# Prints the total number of pgpool nodes.
#
_get_node_count()
{
pcp_node_count $pcp_cmd_preamble
}
#
# Prints out node information for the specified pgpool node
#
_get_node_info()
{
if [ -z $1 ]; then
echo "Usage: $0 _get_node_info <node id>" >&2
return 99
fi
pcp_node_info $pcp_cmd_preamble $1
}
#
# Get node IP
#
_get_node_ip()
{
if [ -z $1 ]; then
echo "Usage: $0 _get_node_ip <node id>" >&2
return 99
fi
node_info=($( pcp_node_info $pcp_cmd_preamble $1 ))
echo ${node_info[0]}
}
#
# Outputs the replication lag (in bytes) between the specified master and slave
#
_get_replication_lag()
{
if [ -z $2 ]; then
echo "Usage: $0 _get_replication_lag <master node id> <slave node id>" >&2
return 99
fi
# Get connection information for nodes
master_node_info=($(_get_node_info $1))
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node info for node $1" >&2
return 1
fi
slave_node_info=($(_get_node_info $2))
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node info for node $2" >&2
return 1
fi
for n in $1 $2; do
if [ $(_is_node_alive $n) -eq 0 ]; then
echo "ERROR: Node $n is not available. Unable to calculate lag." >&2
return 1
fi
done
export PGCONNECT_TIMEOUT=2
# Get xlog locations for master and slaves
master_xlog_loc=$(psql $psql_healthcheck_opts -h "${master_node_info[0]}" -p "${master_node_info[1]}" -Atc "SELECT pg_current_xlog_location();")
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting xlog location from node $1" >&2
return 1
fi
slave_xlog_loc=$(psql $psql_healthcheck_opts -h "${slave_node_info[0]}" -p "${slave_node_info[1]}" -Atc "SELECT pg_last_xlog_replay_location();")
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting xlog location from node $2" >&2
return 1
fi
# Number of bytes behind
echo $(($(_xlog_to_bytes $master_xlog_loc) - $(_xlog_to_bytes $slave_xlog_loc)))
}
#
# Converts specified xlog number to byte location
#
_xlog_to_bytes()
{
if [ -z $1 ]; then
echo "Usage: $0 _xlog_to_bytes <xlog loc>" >&2
echo " Eg.: $0 _xlog_to_bytes 0/2BC825C0" >&2
return 99
fi
logid="${1%%/*}"
offset="${1##*/}"
echo $((0xFFFFFF * 0x$logid + 0x$offset))
}
#
# Prints whether the postgresql service on the specified node is responding.
#
_is_node_alive()
{
if [ -z $1 ]; then
echo "Usage: $0 _is_node_alive <node id>" >&2
return 99
fi
# Get node connection information
node_info=($(_get_node_info $1))
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node info for node $1" >&2
return 1
fi
export PGCONNECT_TIMEOUT=2
result=$(psql $psql_healthcheck_opts -h "${node_info[0]}" -p "${node_info[1]}" -Atc "SELECT 1;" 2>/dev/null)
if [ "$result" == "1" ]; then
echo 1
return 1
else
echo 0
return 0
fi
}
#
# Get the node ID of the first master
#
_get_master_node()
{
# Get total number of nodes
nodes=$(_get_node_count)
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node count: $nodes" >&2
exit 1
fi
c=0
# Loop through each node to check if it's the master
while [ $c -lt $nodes ]; do
if [ "$(_is_standby $c)" == "0" ]; then
echo $c
return 0
fi
let c=c+1
done
echo "-1"
return 1
}
#
# Checks if the node is in postgresql recovery mode (ie. if it is a slave)
#
_is_standby()
{
if [ -z $1 ]; then
echo "Usage: $0 _is_standby <node id>" >&2
return 99
fi
# Get node connection information
node_info=($(_get_node_info $1))
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node info for node $1" >&2
return 1
fi
export PGCONNECT_TIMEOUT=2
result=$(psql $psql_healthcheck_opts -h "${node_info[0]}" -p "${node_info[1]}" -Atc "SELECT pg_is_in_recovery();" 2>/dev/null)
if [ "$result" == "t" ]; then
echo 1
return 1
else
echo 0
return 0
fi
}
#
# Attaches the specified node to the pool
#
pool_attach()
{
if [ -z $1 ]; then
echo "Usage: $0 attach <node id>" >&2
return 99
fi
pcp_attach_node $pcp_cmd_preamble $1
}
#
# Detaches the specified node from the pool
#
pool_detach()
{
if [ -z $1 ]; then
echo "Usage: $0 detach <node id>" >&2
return 99
fi
pcp_detach_node $pcp_cmd_preamble $1
}
#
# Recovers the specified node (restores it from current master and re-attaches)
#
pool_recover()
{
if [ -z $1 ]; then
echo "Usage: $0 recover <node id>" >&2
return 99
fi
pcp_recovery_node $pcp_cmd_preamble $1
}
#
# Prints the status of the specified node in human readable format.
#
_get_node_status()
{
if [ -z $1 ]; then
echo "Usage: $0 _get_node_status <node id>" >&2
return 99
fi
node_info=($(_get_node_info $1))
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node info for node $1" >&2
else
node_role=""
node_replication_lag=""
node_alive=""
case "$(_is_node_alive $1)" in
1)
node_alive="Up"
;;
*)
node_alive="Down"
;;
esac
if [ "$node_alive" == "Up" ]; then
# Find out what role this node has
if [ "$(_is_standby $1)" == "1" ]; then
node_role="Slave"
# Calculation replication lag
master_node=$(_get_master_node)
node_replication_lag=$(_get_replication_lag $master_node $1)
if [ $? -eq 0 ]; then
node_replication_lag="$node_replication_lag bytes"
else
node_replication_log="Unknown"
fi
else
node_role="Master"
fi
fi
case "${node_info[2]}" in
3)
node_status="detached from pool"
;;
2)
node_status="in pool and connected"
;;
1)
node_status="in pool"
;;
*)
node_status="Unknown"
;;
esac
# Print status information about this node
echo "Node: $1"
echo "Host: ${node_info[0]}"
echo "Port: ${node_info[1]}"
echo "Weight: ${node_info[3]}"
echo "Status: $node_alive, $node_status (${node_info[2]})"
[ -n "$node_role" ] && echo "Role: $node_role"
[ -n "$node_replication_lag" ] && echo "Replication lag: $node_replication_lag"
echo ""
fi
}
#
# Prints out the status of all pgpool nodes in human readable form.
#
pool_show()
{
# Get total number of nodes
nodes=$(_get_node_count)
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node count: $nodes" >&2
exit 1
fi
c=0
# Loop through each node to retrieve info
while [ $c -lt $nodes ]; do
_get_node_status $c
let c=c+1
done
}
#
# Prints out the status of all node, easier to analyse in a script
#
pool_status()
{
# Get the second argument
display="${1:-all}"
shift
if [ "${display}" == "master" ]; then
display_master=1
display_slave=0
elif [ "${display}" == "slave" ]; then
display_master=0
display_slave=1
elif [ "${display}" == "all" ]; then
display_master=0
display_slave=0
else
echo "ERROR: Unknown argument: ${display}" >&2
exit 1
fi
# Get total number of nodes
nodes=$(_get_node_count)
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node count: $nodes" >&2
exit 1
fi
# Check what to display
echo -ne "Node\tHost\t\tPort\tAlive\tStandBy\tStatus\n"
# Loop through each node to retrieve info
c=0
while [ $c -lt $nodes ]; do
# Get infos
node_info=($(_get_node_info $c))
# Display info
if [[ "${display_master}" == 1 &&
"$(_is_node_alive $c)" == "1" &&
"$(_is_standby $c)" != "1" &&
"${node_info[2]}" != "3" ]]; then
# Display active master
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$c" "${node_info[0]}" "${node_info[1]}" "$(_is_node_alive $c)" "$(_is_standby $c)" "${node_info[2]}"
elif [[ "${display_slave}" == 1 &&
"$(_is_node_alive $c)" == "1" &&
"$(_is_standby $c)" == "1" &&
"${node_info[2]}" != "3" ]]; then
# Display active slaves
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$c" "${node_info[0]}" "${node_info[1]}" "$(_is_node_alive $c)" "$(_is_standby $c)" "${node_info[2]}"
elif [[ "${display_master}" == 0 && "${display_slave}" == 0 ]]; then
# Display all (even inactive)
printf "%s\t%s\t%s\t%s\t%s\t%s\n" "$c" "${node_info[0]}" "${node_info[1]}" "$(_is_node_alive $c)" "$(_is_standby $c)" "${node_info[2]}"
fi
# Node: $c
# Host: ${node_info[0]}
# Port: ${node_info[1]}
# Alive: $(_is_node_alive $c)
# Standby: $(_is_standby $c)
# Status: ${node_info[2]}
# Status:
# 0 - This state is only used during the initialization. PCP will never display it.
# 1 - Node is up. No connections yet.
# 2 - Node is up. Connections are pooled.
# 3 - Node is down.
let c=c+1
done
}
#
# Prints out the status of all node, easier to analyse in a script
#
rep_status()
{
# Get total number of nodes
nodes=$(_get_node_count)
if [ $? -gt 0 ]; then
echo "ERROR: Failed getting node count: $nodes" >&2
exit 1
fi
# Loop over
c=0
while [ $c -lt $nodes ]; do
# Get infos
node_info=($(_get_node_info $c))
# Launch the command
echo "Host: ${node_info[0]}"
ssh ${repmgr_pg_user}@${node_info[0]} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} cluster show"
echo ""
# increment
let c=c+1
done
}
#
# Stop postgreSQL instance
#
pg_start()
{
# Check node id arguments
local promote=$( _get_node_ip ${1} )
# Must be an IP
# Test if argument is an IP
if [[ ! "${promote}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ERROR: argument is not a valid IP: ${promote}" >&2
fi
# Test ssh connection
ssh -q ${repmgr_pg_root}@${promote} exit
if [ $? -gt 0 ]; then
echo "ERROR: Failed ssh connexion: ${promote}" >&2
exit 1
fi
# Promote the slave
#ssh ${repmgr_pg_root}@${promote} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote"
ssh ${repmgr_pg_root}@${promote} "/usr/bin/systemctl start postgresql-9.4 || echo FAIL"
# Show the logs
#echo "Status:"
#ssh ${repmgr_pg_root}@${promote} "/usr/bin/systemctl status postgresql-9.4"
#ssh ${repmgr_pg_root}@${promote} "/usr/bin/netstat -lntp | grep postgres"
}
#
# Stop postgreSQL instance
#
pg_stop()
{
# Check node id arguments
local promote=$( _get_node_ip ${1} )
# Must be an IP
# Test if argument is an IP
if [[ ! "${promote}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ERROR: argument is not a valid IP: ${promote}" >&2
exit 1
fi
# Test ssh connection
ssh -q ${repmgr_pg_root}@${promote} exit
if [ $? -gt 0 ]; then
echo "ERROR: Failed ssh connexion: ${promote}" >&2
exit 1
fi
# Promote the slave
#ssh ${repmgr_pg_root}@${promote} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote"
ssh ${repmgr_pg_root}@${promote} "/usr/bin/systemctl stop postgresql-9.4 || echo FAIL"
}
#
# Promote a slave
#
rep_standby_clone()
{
# Check node id arguments
local standby=$( _get_node_ip ${1} )
local master=$( _get_node_ip ${2} )
# Must be an IP
# Test if argument is an IP
if [[ ! ( "${standby}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ && "${master}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ) ]]; then
echo "ERROR: argument is not a valid IP: ${standby}" >&2
exit 1
fi
# Test ssh connection
ssh -q ${repmgr_pg_root}@${standby} exit && ssh -q ${repmgr_pg_user}@${standby} exit
if [ $? -gt 0 ]; then
echo "ERROR: Failed ssh connexion: ${standby}" >&2
exit 1
fi
# Stop the postgreSQL daemon
pg_stop ${1}
# Lanch the repmgr recovery
#ssh ${repmgr_pg_root}@${standby} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby standby"
ssh ${repmgr_pg_user}@${standby} "/usr/bin/rm -r ${repmgr_pg_data_dir}/* 2>/dev/null "
ssh ${repmgr_pg_user}@${standby} "${repmgr_bin_dir}/repmgr -D ${repmgr_pg_data_dir} -d repmgr -U repmgr --verbose standby clone ${master} 2>> /tmp/test || echo 'FAIL'"
#rm -rf /var/lib/pgsql/9.4/data/*
#repmgr -D /var/lib/pgsql/9.4/data/ -d repmgr -U repmgr --verbose standby clone 192.168.121.144
# Show the logs
#echo "Repmgr log file: "
#ssh ${repmgr_pg_root}@${standby} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log"
# Start the postgreSQL daemon
pg_start ${1}
}
#
# Register a slave
#
rep_standby_register()
{
# Check node id arguments
local register=$( _get_node_ip ${1} )
# Test if argument is an IP
if [[ ! "${register}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ERROR: argument is not a valid IP: ${register}" >&2
exit 1
fi
# Test ssh connection
ssh -q ${repmgr_pg_root}@${register} exit && ssh -q ${repmgr_pg_user}@${register} exit
if [ $? -gt 0 ]; then
echo "ERROR: Failed ssh connexion: ${register}" >&2
exit 1
fi
# Lanch the repmgr recovery
ssh ${repmgr_pg_user}@${register} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby register || echo 'FAIL'"
# Show the logs
echo "Repmgr log file: "
ssh ${repmgr_pg_root}@${register} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log"
}
#
# Promote a slave
#
rep_standby_promote()
{
# Check node id arguments
local promote=$( _get_node_ip ${1} )
# Test if argument is an IP
if [[ ! "${promote}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ERROR: argument is not a valid IP: ${promote}" >&2
exit 1
fi
# Test ssh connection
ssh -q ${repmgr_pg_root}@${promote} exit && ssh -q ${repmgr_pg_user}@${promote} exit
if [ $? -gt 0 ]; then
echo "ERROR: Failed ssh connexion: ${promote}" >&2
exit 1
fi
# Lanch the repmgr recovery
ssh ${repmgr_pg_user}@${promote} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote || echo 'FAIL' "
# Show the logs
#echo "Repmgr log file: "
#ssh ${repmgr_pg_root}@${promote} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log"
}
#
# Promote a master
#
rep_master_register()
{
# Check node id arguments
local master=${1}
# Test if argument is an IP
if [[ ! "${master}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
echo "ERROR: argument is not a valid IP: ${master}" >&2
exit 1
fi
# Test ssh connection
ssh -q ${repmgr_pg_root}@${master} exit && ssh -q ${repmgr_pg_user}@${master} exit
if [ $? -gt 0 ]; then
echo "ERROR: Failed ssh connexion: ${master}" >&2
exit 1
fi
# Lanch the repmgr recovery
#ssh ${repmgr_pg_root}@${master} "su - ${repmgr_pg_user} -c ${repmgr_bin_dir}/repmgr -f ${repmgr_conf} standby promote"
ssh ${repmgr_pg_user}@${master} "${repmgr_bin_dir}/repmgr -f ${repmgr_conf} master register"
# Show the logs
echo "Repmgr log file: "
ssh ${repmgr_pg_root}@${master} "/usr/bin/tail /var/log/repmgr/repmgr-9.4.log"
}
#
# Prints out the human overall status
#
show()
{
echo "PgPool-II Status:"
echo "====================="
#pool_show
#echo ""
pool_status
echo ""
echo "Repmgr Status:"
echo "====================="
rep_status
}
# Run function
if [ ! "$(type -t $1)" ]; then
echo "Usage $0 <option>" >&2
echo "" >&2
echo "Available options:" >&2
echo "$(compgen -A function |grep -v '^_')" >&2
exit 99
else
cmd=$1
shift
$cmd $*
exit $?
fi
@mrjk
Copy link
Author

mrjk commented Oct 8, 2015

I added master and slave commands, in way to have a simple call to get only active nodes

@mrjk
Copy link
Author

mrjk commented Oct 16, 2015

Implemented repmgr in the command line. It offers a nice tool to control (very) easily your PostgreSQL replication tool.

I'll push that in production, just for fun ^^ But I do not yet recommend you to do the same ;-)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment