Skip to content

Instantly share code, notes, and snippets.

@cabecada
Created June 2, 2022 07:11
Show Gist options
  • Save cabecada/481704cc152b36ee33dbed3ac412d5b8 to your computer and use it in GitHub Desktop.
Save cabecada/481704cc152b36ee33dbed3ac412d5b8 to your computer and use it in GitHub Desktop.
root@demo:~# cat /opt/pgsqlchk1
#!/usr/bin/bash
export PGPASSWORD="1234"
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5432 -c 'select not pg_is_in_recovery()' 2>/dev/null`
rc=$?
if [ $rc -eq 0 ]
then
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5001 -c 'select not pg_is_in_recovery()' 2>/dev/null`
rc=$?
fi
# Check the output. If it is not empty then everything is fine and we return something. Else, we just do not return anything.
if [ $rc -ne 0 ]
then
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "DB Down"
/bin/echo -ne "\r\n"
elif [ $VALUE == "f" ]
then
/bin/echo -ne "HTTP/1.1 206 OK\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "Standby"
/bin/echo -ne "\r\n"
elif [ $VALUE == "t" ]
then
/bin/echo -ne "HTTP/1.1 203 OK\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "Primary"
/bin/echo -ne "\r\n"
else
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "DB Down"
/bin/echo -ne "\r\n"
fi
root@demo:~# cat /opt/pgsqlchk2
#!/usr/bin/bash
export PGPASSWORD="1234"
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5433 -c 'select not pg_is_in_recovery()' 2>/dev/null`
rc=$?
if [ $rc -eq 0 ]
then
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5002 -c 'select not pg_is_in_recovery()' 2>/dev/null`
rc=$?
fi
# Check the output. If it is not empty then everything is fine and we return something. Else, we just do not return anything.
if [ $rc -ne 0 ]
then
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "DB Down"
/bin/echo -ne "\r\n"
elif [ $VALUE == "f" ]
then
/bin/echo -ne "HTTP/1.1 206 OK\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "Standby"
/bin/echo -ne "\r\n"
elif [ $VALUE == "t" ]
then
/bin/echo -ne "HTTP/1.1 203 OK\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "Primary"
/bin/echo -ne "\r\n"
else
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n"
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n"
/bin/echo -ne "\r\n"
/bin/echo "DB Down"
/bin/echo -ne "\r\n"
fi
root@demo:~# cat /etc/xinetd.d/pgsqlchk1
service pgsqlchk1
{
flags = REUSE
socket_type = stream
port = 23267
wait = no
user = nobody
server = /opt/pgsqlchk1
log_on_failure += USERID
disable = no
only_from = 0.0.0.0/0
per_source = UNLIMITED
}
root@demo:~# cat /etc/xinetd.d/pgsqlchk2
service pgsqlchk2
{
flags = REUSE
socket_type = stream
port = 23268
wait = no
user = nobody
server = /opt/pgsqlchk2
log_on_failure += USERID
disable = no
only_from = 0.0.0.0/0
per_source = UNLIMITED
}
root@demo:~# cat /var/lib/postgresql/temp/db1.ini
[databases]
postgres = port=5432 host=127.0.0.1
[pgbouncer]
;; auth_type = hba ;; if set then only allow user from pg_hba.conf below
auth_type = trust
;; allow via md5 auth
;; auth_hba_file = /etc/pgbouncer/pg_hba.conf
;; auth_file = /etc/pgbouncer/userlist2.txt
;; list of user/md5password text file
admin_users = postgres
auth_user = postgres
;; auth_query = SELECT p_user, p_password FROM public.lookup($1)
logfile = /tmp/pgbouncer_db1.log
pidfile = /tmp/pgbouncer_db1.pid
;;;
;;; Where to wait for clients
;;;
;; IP address or * which means all IPs
listen_addr = 127.0.0.1
listen_port = 5001
unix_socket_dir = /tmp/db1
;; query_wait_timeout = 20
;; listen_backlog = 10
;; server_connect_timeout = 5
log_connections = 1
log_disconnections = 1
;; pool_mode = transaction
root@demo:~# cat /var/lib/postgresql/temp/db2.ini
[databases]
postgres = port=5433 host=127.0.0.1
[pgbouncer]
;; auth_type = hba ;; if set then only allow user from pg_hba.conf below
auth_type = trust
;; allow via md5 auth
;; auth_hba_file = /etc/pgbouncer/pg_hba.conf
;; auth_file = /etc/pgbouncer/userlist2.txt
;; list of user/md5password text file
admin_users = postgres
auth_user = postgres
;; auth_query = SELECT p_user, p_password FROM public.lookup($1)
logfile = /tmp/pgbouncer_db2.log
pidfile = /tmp/pgbouncer_db2.pid
;;;
;;; Where to wait for clients
;;;
;; IP address or * which means all IPs
listen_addr = 127.0.0.1
listen_port = 5002
unix_socket_dir = /tmp/db2
root@demo:~# cat /etc/haproxy/haproxy.cfg
global
maxconn 100
stats socket ipv4@127.0.0.1:9999 level admin
stats socket /var/run/haproxy.sock mode 666 level admin
stats timeout 2m
defaults
log global
mode tcp
retries 2
timeout client 30m
timeout connect 4s
timeout server 30m
timeout check 5s
listen stats
mode http
bind *:7000
stats enable
stats uri /
listen ReadOnly
mode tcp
bind 0.0.0.0:8002
option httpchk
http-check expect status 206
default-server observe layer7 error-limit 10 inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server pg0 127.0.0.1:5001 check port 23267
server pg1 127.0.0.1:5002 check port 23268
listen ReadWrite
mode tcp
bind 0.0.0.0:8001
option httpchk
http-check expect status 203
default-server observe layer7 error-limit 10 inter 3s fall 3 rise 2 on-marked-down shutdown-sessions
server pg0 127.0.0.1:5001 check port 23267
server pg1 127.0.0.1:5002 check port 23268
######################
steps
pg_ctl -D db1 -l db1.log start
pg_ctl -D db1 -l db1.log start
pgbouncer -u postgres -v -d db1.ini
pgbouncer -u postgres -v -d db2.ini
systemctl status xinetd.service
systemctl status haproxy.service
monitor haproxy via
ip:7000/
tests:
postgres@demo:~/temp$ #stop pg on 5432
postgres@demo:~/temp$ pg_ctl -D db1 -l db1.log stop
waiting for server to shut down.... done
server stopped
#since pgbouncer is running, we cannot failover
postgres@demo:~/temp$ timeout 5 psql -p 5001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();'
postgres@demo:~/temp$ echo $?
124
postgres@demo:~/temp$ timeout 5 psql -p 8001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();'
psql: error: connection to server at "127.0.0.1", port 8001 failed: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
#so haproxy will not disable frontend ReadWrite, even if health check for postgres and pgbouncer fail
postgres@demo:~/temp$ ps aux | grep pgbouncer
postgres 7747 0.0 0.3 18128 3772 ? Sl 11:35 0:01 pgbouncer -u postgres -v -d db2.ini
postgres 46590 0.0 0.3 18132 3840 ? Sl 12:25 0:00 pgbouncer -u postgres -v -d db1.ini
postgres 50011 0.0 0.0 7696 624 pts/0 S+ 12:28 0:00 grep pgbouncer
postgres@demo:~/temp$ kill 7747
postgres@demo:~/temp$ timeout 5 psql -p 5001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();'
FATAL: server login has been failing, try again later (server_login_retry)
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
connection to server was lost
postgres@demo:~/temp$ timeout 5 psql -p 8001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();'
psql: error: connection to server at "127.0.0.1", port 8001 failed: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
### on the haproxy status for frontend is OPEN, hence psql will not failover to other node
psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
psql: error: connection to server at "127.0.0.1", port 8001 failed: server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
we need to mark the frontend down explictly via runtime api
then via socat
sudo socat stdio tcp4-connect:127.0.0.1:9999
disable frontend ReadWrite
#on the dashboard for haproxy it changes from OPEN to STOP
now haproxy simply considers the frontend down and routes to 8002
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
t
(1 row)
postgres@demo:~/temp$ timeout 5 psql "postgresql://127.0.0.1:5001,127.0.0.1:5002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
FATAL: server login has been failing, try again later (server_login_retry)
server closed the connection unexpectedly
This probably means the server terminated abnormally
before or while processing the request.
connection to server was lost
postgres@demo:~/temp$ #pgbouncer still failing as is active but backend is not
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
t
(1 row)
#even if pg recovers and pgbouncer recovers and both healthy, we need to enable haproxy frontend else
postgres@demo:~/temp$ #pgbouncer still failing as is active but backend is not
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
t
(1 row)
postgres@demo:~/temp$ pg_ctl -D db1 -l db1.log start
waiting for server to start.... done
server started
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
t
(1 row)
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5001,127.0.0.1:5002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
f
(1 row)
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5432,127.0.0.1:5433/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
f
(1 row)
##all up db,haproxy,pgb
root@demo:~# sudo socat stdio tcp4-connect:127.0.0.1:9999
enable frontend ReadWrite
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
f
(1 row)
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5001,127.0.0.1:5002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
f
(1 row)
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5432,127.0.0.1:5433/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();'
pg_is_in_recovery
-------------------
f
(1 row)
@cabecada
Copy link
Author

cabecada commented Jun 3, 2022

use connect_timeout for hanging pgbouncer to force failover,

strace -f -e trace=network -s 10000  -- psql "postgresql://127.0.0.1:5002,127.0.0.1:5003/postgres?target_session_attrs=any&connect_timeout=5" -c 'select inet_server_port();'
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
setsockopt(3, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(5002), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(41788), sin_addr=inet_addr("127.0.0.1")}, [128->16]) = 0
sendto(3, "\0\0\0\10\4\322\26/", 8, MSG_NOSIGNAL, NULL, 0) = 8
recvfrom(3, "N", 16384, 0, NULL, NULL)  = 1
sendto(3, "\0\0\0T\0\3\0\0user\0postgres\0database\0postgres\0application_name\0psql\0client_encoding\0UTF8\0\0", 84, MSG_NOSIGNAL, NULL, 0) = 84
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
setsockopt(3, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(5003), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(32884), sin_addr=inet_addr("127.0.0.1")}, [128->16]) = 0
sendto(3, "\0\0\0\10\4\322\26/", 8, MSG_NOSIGNAL, NULL, 0) = 8
recvfrom(3, "N", 16384, 0, NULL, NULL)  = 1
sendto(3, "\0\0\0T\0\3\0\0user\0postgres\0database\0postgres\0application_name\0psql\0client_encoding\0UTF8\0\0", 84, MSG_NOSIGNAL, NULL, 0) = 84
recvfrom(3, "R\0\0\0\10\0\0\0\0S\0\0\0&default_transaction_read_only\0off\0S\0\0\0\26in_hot_standby\0on\0S\0\0\0\31integer_datetimes\0on\0S\0\0\0\33IntervalStyle\0postgres\0S\0\0\0\24is_superuser\0on\0S\0\0\0\31server_encoding\0UTF8\0S\0\0\0004server_version\00014.3 (Ubuntu 14.3-1.pgdg20.04+1)\0S\0\0\0#session_authorization\0postgres\0S\0\0\0\31client_encoding\0UTF8\0S\0\0\0\27DateStyle\0ISO, MDY\0S\0\0\0\32TimeZone\0Asia/Kolkata\0S\0\0\0#standard_conforming_strings\0on\0S\0\0\0\32application_name\0psql\0K\0\0\0\f\273G<ncU\250\30Z\0\0\0\5I", 16384, 0, NULL, NULL) = 420
sendto(3, "Q\0\0\0\37select inet_server_port();\0", 32, MSG_NOSIGNAL, NULL, 0) = 32
recvfrom(3, "T\0\0\0)\0\1inet_server_port\0\0\0\0\0\0\0\0\0\0\27\0\4\377\377\377\377\0\0D\0\0\0\16\0\1\0\0\0\0045434C\0\0\0\rSELECT 1\0Z\0\0\0\5I", 16384, 0, NULL, NULL) = 77
 inet_server_port 
------------------
             5434
(1 row)

recvfrom(3, 0xaaab1e1c4870, 16384, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
sendto(3, "X\0\0\0\4", 5, MSG_NOSIGNAL, NULL, 0) = 5
+++ exited with 0 +++

@cabecada
Copy link
Author

cabecada commented Jun 3, 2022

ref:
FEBE message format ascii identifiers

https://github.com/tlocke/pg8000/blob/main/pg8000/core.py#L76

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment