Created
June 2, 2022 07:11
-
-
Save cabecada/481704cc152b36ee33dbed3ac412d5b8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
root@demo:~# cat /opt/pgsqlchk1 | |
#!/usr/bin/bash | |
export PGPASSWORD="1234" | |
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5432 -c 'select not pg_is_in_recovery()' 2>/dev/null` | |
rc=$? | |
if [ $rc -eq 0 ] | |
then | |
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5001 -c 'select not pg_is_in_recovery()' 2>/dev/null` | |
rc=$? | |
fi | |
# Check the output. If it is not empty then everything is fine and we return something. Else, we just do not return anything. | |
if [ $rc -ne 0 ] | |
then | |
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "DB Down" | |
/bin/echo -ne "\r\n" | |
elif [ $VALUE == "f" ] | |
then | |
/bin/echo -ne "HTTP/1.1 206 OK\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "Standby" | |
/bin/echo -ne "\r\n" | |
elif [ $VALUE == "t" ] | |
then | |
/bin/echo -ne "HTTP/1.1 203 OK\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "Primary" | |
/bin/echo -ne "\r\n" | |
else | |
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "DB Down" | |
/bin/echo -ne "\r\n" | |
fi | |
root@demo:~# cat /opt/pgsqlchk2 | |
#!/usr/bin/bash | |
export PGPASSWORD="1234" | |
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5433 -c 'select not pg_is_in_recovery()' 2>/dev/null` | |
rc=$? | |
if [ $rc -eq 0 ] | |
then | |
VALUE=`timeout 5 /usr/bin/psql -t -d postgres -h 127.0.0.1 -U postgres -p 5002 -c 'select not pg_is_in_recovery()' 2>/dev/null` | |
rc=$? | |
fi | |
# Check the output. If it is not empty then everything is fine and we return something. Else, we just do not return anything. | |
if [ $rc -ne 0 ] | |
then | |
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "DB Down" | |
/bin/echo -ne "\r\n" | |
elif [ $VALUE == "f" ] | |
then | |
/bin/echo -ne "HTTP/1.1 206 OK\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "Standby" | |
/bin/echo -ne "\r\n" | |
elif [ $VALUE == "t" ] | |
then | |
/bin/echo -ne "HTTP/1.1 203 OK\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "Primary" | |
/bin/echo -ne "\r\n" | |
else | |
/bin/echo -ne "HTTP/1.1 503 Service Unavailable\r\n" | |
/bin/echo -ne "Content-Type: Content-Type: text/plain\r\n" | |
/bin/echo -ne "\r\n" | |
/bin/echo "DB Down" | |
/bin/echo -ne "\r\n" | |
fi | |
root@demo:~# cat /etc/xinetd.d/pgsqlchk1 | |
service pgsqlchk1 | |
{ | |
flags = REUSE | |
socket_type = stream | |
port = 23267 | |
wait = no | |
user = nobody | |
server = /opt/pgsqlchk1 | |
log_on_failure += USERID | |
disable = no | |
only_from = 0.0.0.0/0 | |
per_source = UNLIMITED | |
} | |
root@demo:~# cat /etc/xinetd.d/pgsqlchk2 | |
service pgsqlchk2 | |
{ | |
flags = REUSE | |
socket_type = stream | |
port = 23268 | |
wait = no | |
user = nobody | |
server = /opt/pgsqlchk2 | |
log_on_failure += USERID | |
disable = no | |
only_from = 0.0.0.0/0 | |
per_source = UNLIMITED | |
} | |
root@demo:~# cat /var/lib/postgresql/temp/db1.ini | |
[databases] | |
postgres = port=5432 host=127.0.0.1 | |
[pgbouncer] | |
;; auth_type = hba ;; if set then only allow user from pg_hba.conf below | |
auth_type = trust | |
;; allow via md5 auth | |
;; auth_hba_file = /etc/pgbouncer/pg_hba.conf | |
;; auth_file = /etc/pgbouncer/userlist2.txt | |
;; list of user/md5password text file | |
admin_users = postgres | |
auth_user = postgres | |
;; auth_query = SELECT p_user, p_password FROM public.lookup($1) | |
logfile = /tmp/pgbouncer_db1.log | |
pidfile = /tmp/pgbouncer_db1.pid | |
;;; | |
;;; Where to wait for clients | |
;;; | |
;; IP address or * which means all IPs | |
listen_addr = 127.0.0.1 | |
listen_port = 5001 | |
unix_socket_dir = /tmp/db1 | |
;; query_wait_timeout = 20 | |
;; listen_backlog = 10 | |
;; server_connect_timeout = 5 | |
log_connections = 1 | |
log_disconnections = 1 | |
;; pool_mode = transaction | |
root@demo:~# cat /var/lib/postgresql/temp/db2.ini | |
[databases] | |
postgres = port=5433 host=127.0.0.1 | |
[pgbouncer] | |
;; auth_type = hba ;; if set then only allow user from pg_hba.conf below | |
auth_type = trust | |
;; allow via md5 auth | |
;; auth_hba_file = /etc/pgbouncer/pg_hba.conf | |
;; auth_file = /etc/pgbouncer/userlist2.txt | |
;; list of user/md5password text file | |
admin_users = postgres | |
auth_user = postgres | |
;; auth_query = SELECT p_user, p_password FROM public.lookup($1) | |
logfile = /tmp/pgbouncer_db2.log | |
pidfile = /tmp/pgbouncer_db2.pid | |
;;; | |
;;; Where to wait for clients | |
;;; | |
;; IP address or * which means all IPs | |
listen_addr = 127.0.0.1 | |
listen_port = 5002 | |
unix_socket_dir = /tmp/db2 | |
root@demo:~# cat /etc/haproxy/haproxy.cfg | |
global | |
maxconn 100 | |
stats socket ipv4@127.0.0.1:9999 level admin | |
stats socket /var/run/haproxy.sock mode 666 level admin | |
stats timeout 2m | |
defaults | |
log global | |
mode tcp | |
retries 2 | |
timeout client 30m | |
timeout connect 4s | |
timeout server 30m | |
timeout check 5s | |
listen stats | |
mode http | |
bind *:7000 | |
stats enable | |
stats uri / | |
listen ReadOnly | |
mode tcp | |
bind 0.0.0.0:8002 | |
option httpchk | |
http-check expect status 206 | |
default-server observe layer7 error-limit 10 inter 3s fall 3 rise 2 on-marked-down shutdown-sessions | |
server pg0 127.0.0.1:5001 check port 23267 | |
server pg1 127.0.0.1:5002 check port 23268 | |
listen ReadWrite | |
mode tcp | |
bind 0.0.0.0:8001 | |
option httpchk | |
http-check expect status 203 | |
default-server observe layer7 error-limit 10 inter 3s fall 3 rise 2 on-marked-down shutdown-sessions | |
server pg0 127.0.0.1:5001 check port 23267 | |
server pg1 127.0.0.1:5002 check port 23268 | |
###################### | |
steps | |
pg_ctl -D db1 -l db1.log start | |
pg_ctl -D db1 -l db1.log start | |
pgbouncer -u postgres -v -d db1.ini | |
pgbouncer -u postgres -v -d db2.ini | |
systemctl status xinetd.service | |
systemctl status haproxy.service | |
monitor haproxy via | |
ip:7000/ | |
tests: | |
postgres@demo:~/temp$ #stop pg on 5432 | |
postgres@demo:~/temp$ pg_ctl -D db1 -l db1.log stop | |
waiting for server to shut down.... done | |
server stopped | |
#since pgbouncer is running, we cannot failover | |
postgres@demo:~/temp$ timeout 5 psql -p 5001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();' | |
postgres@demo:~/temp$ echo $? | |
124 | |
postgres@demo:~/temp$ timeout 5 psql -p 8001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();' | |
psql: error: connection to server at "127.0.0.1", port 8001 failed: server closed the connection unexpectedly | |
This probably means the server terminated abnormally | |
before or while processing the request. | |
#so haproxy will not disable frontend ReadWrite, even if health check for postgres and pgbouncer fail | |
postgres@demo:~/temp$ ps aux | grep pgbouncer | |
postgres 7747 0.0 0.3 18128 3772 ? Sl 11:35 0:01 pgbouncer -u postgres -v -d db2.ini | |
postgres 46590 0.0 0.3 18132 3840 ? Sl 12:25 0:00 pgbouncer -u postgres -v -d db1.ini | |
postgres 50011 0.0 0.0 7696 624 pts/0 S+ 12:28 0:00 grep pgbouncer | |
postgres@demo:~/temp$ kill 7747 | |
postgres@demo:~/temp$ timeout 5 psql -p 5001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();' | |
FATAL: server login has been failing, try again later (server_login_retry) | |
server closed the connection unexpectedly | |
This probably means the server terminated abnormally | |
before or while processing the request. | |
connection to server was lost | |
postgres@demo:~/temp$ timeout 5 psql -p 8001 -U postgres -d postgres -h 127.0.0.1 -c 'select pg_is_in_recovery();' | |
psql: error: connection to server at "127.0.0.1", port 8001 failed: server closed the connection unexpectedly | |
This probably means the server terminated abnormally | |
before or while processing the request. | |
### on the haproxy status for frontend is OPEN, hence psql will not failover to other node | |
psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
psql: error: connection to server at "127.0.0.1", port 8001 failed: server closed the connection unexpectedly | |
This probably means the server terminated abnormally | |
before or while processing the request. | |
we need to mark the frontend down explictly via runtime api | |
then via socat | |
sudo socat stdio tcp4-connect:127.0.0.1:9999 | |
disable frontend ReadWrite | |
#on the dashboard for haproxy it changes from OPEN to STOP | |
now haproxy simply considers the frontend down and routes to 8002 | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
t | |
(1 row) | |
postgres@demo:~/temp$ timeout 5 psql "postgresql://127.0.0.1:5001,127.0.0.1:5002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
FATAL: server login has been failing, try again later (server_login_retry) | |
server closed the connection unexpectedly | |
This probably means the server terminated abnormally | |
before or while processing the request. | |
connection to server was lost | |
postgres@demo:~/temp$ #pgbouncer still failing as is active but backend is not | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
t | |
(1 row) | |
#even if pg recovers and pgbouncer recovers and both healthy, we need to enable haproxy frontend else | |
postgres@demo:~/temp$ #pgbouncer still failing as is active but backend is not | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
t | |
(1 row) | |
postgres@demo:~/temp$ pg_ctl -D db1 -l db1.log start | |
waiting for server to start.... done | |
server started | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
t | |
(1 row) | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5001,127.0.0.1:5002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
f | |
(1 row) | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5432,127.0.0.1:5433/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
f | |
(1 row) | |
##all up db,haproxy,pgb | |
root@demo:~# sudo socat stdio tcp4-connect:127.0.0.1:9999 | |
enable frontend ReadWrite | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:8001,127.0.0.1:8002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
f | |
(1 row) | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5001,127.0.0.1:5002/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
f | |
(1 row) | |
postgres@demo:~/temp$ psql "postgresql://127.0.0.1:5432,127.0.0.1:5433/postgres?target_session_attrs=any" -c 'select pg_is_in_recovery();' | |
pg_is_in_recovery | |
------------------- | |
f | |
(1 row) | |
Author
cabecada
commented
Jun 3, 2022
•
use connect_timeout for hanging pgbouncer to force failover,
strace -f -e trace=network -s 10000 -- psql "postgresql://127.0.0.1:5002,127.0.0.1:5003/postgres?target_session_attrs=any&connect_timeout=5" -c 'select inet_server_port();'
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(AF_UNIX, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 3
connect(3, {sa_family=AF_UNIX, sun_path="/var/run/nscd/socket"}, 110) = -1 ENOENT (No such file or directory)
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
setsockopt(3, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(5002), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(41788), sin_addr=inet_addr("127.0.0.1")}, [128->16]) = 0
sendto(3, "\0\0\0\10\4\322\26/", 8, MSG_NOSIGNAL, NULL, 0) = 8
recvfrom(3, "N", 16384, 0, NULL, NULL) = 1
sendto(3, "\0\0\0T\0\3\0\0user\0postgres\0database\0postgres\0application_name\0psql\0client_encoding\0UTF8\0\0", 84, MSG_NOSIGNAL, NULL, 0) = 84
socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
setsockopt(3, SOL_TCP, TCP_NODELAY, [1], 4) = 0
setsockopt(3, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0
connect(3, {sa_family=AF_INET, sin_port=htons(5003), sin_addr=inet_addr("127.0.0.1")}, 16) = -1 EINPROGRESS (Operation now in progress)
getsockopt(3, SOL_SOCKET, SO_ERROR, [0], [4]) = 0
getsockname(3, {sa_family=AF_INET, sin_port=htons(32884), sin_addr=inet_addr("127.0.0.1")}, [128->16]) = 0
sendto(3, "\0\0\0\10\4\322\26/", 8, MSG_NOSIGNAL, NULL, 0) = 8
recvfrom(3, "N", 16384, 0, NULL, NULL) = 1
sendto(3, "\0\0\0T\0\3\0\0user\0postgres\0database\0postgres\0application_name\0psql\0client_encoding\0UTF8\0\0", 84, MSG_NOSIGNAL, NULL, 0) = 84
recvfrom(3, "R\0\0\0\10\0\0\0\0S\0\0\0&default_transaction_read_only\0off\0S\0\0\0\26in_hot_standby\0on\0S\0\0\0\31integer_datetimes\0on\0S\0\0\0\33IntervalStyle\0postgres\0S\0\0\0\24is_superuser\0on\0S\0\0\0\31server_encoding\0UTF8\0S\0\0\0004server_version\00014.3 (Ubuntu 14.3-1.pgdg20.04+1)\0S\0\0\0#session_authorization\0postgres\0S\0\0\0\31client_encoding\0UTF8\0S\0\0\0\27DateStyle\0ISO, MDY\0S\0\0\0\32TimeZone\0Asia/Kolkata\0S\0\0\0#standard_conforming_strings\0on\0S\0\0\0\32application_name\0psql\0K\0\0\0\f\273G<ncU\250\30Z\0\0\0\5I", 16384, 0, NULL, NULL) = 420
sendto(3, "Q\0\0\0\37select inet_server_port();\0", 32, MSG_NOSIGNAL, NULL, 0) = 32
recvfrom(3, "T\0\0\0)\0\1inet_server_port\0\0\0\0\0\0\0\0\0\0\27\0\4\377\377\377\377\0\0D\0\0\0\16\0\1\0\0\0\0045434C\0\0\0\rSELECT 1\0Z\0\0\0\5I", 16384, 0, NULL, NULL) = 77
inet_server_port
------------------
5434
(1 row)
recvfrom(3, 0xaaab1e1c4870, 16384, 0, NULL, NULL) = -1 EAGAIN (Resource temporarily unavailable)
sendto(3, "X\0\0\0\4", 5, MSG_NOSIGNAL, NULL, 0) = 5
+++ exited with 0 +++
ref:
FEBE message format ascii identifiers
https://github.com/tlocke/pg8000/blob/main/pg8000/core.py#L76
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment