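# archive_command hook for the master: walmgr's xarchive mode copies each
# finished WAL segment to the archive; its output is appended to a log file: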
/usr/bin/walmgr /etc/walmgr/master.ini xarchive %p %f >> /var/log/postgresql/archivecmd.log

# /etc/walmgr/master.ini -- walmgr configuration for the master
[wal-master]
job_name = master
logfile = /var/log/postgresql/walmgr-master.log
master_db = dbname=soup_production
master_data = /var/lib/postgresql/8.3/main
master_config = /etc/postgresql/8.3/main/postgresql.conf
slave_config = /etc/walmgr/slave.ini
slave = core.in.soup.io:/srv/dbstandby-walmgr
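# %(slave)s below is ConfigParser-style interpolation of the slave setting above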
completed_wals = %(slave)s/logs.complete
partial_wals = %(slave)s/logs.partial
full_backup = %(slave)s/data.master
file_target = %(slave)s/files.master
# syncdaemon update frequency
loop_delay = 10.0
# use record based shipping available in 8.2
use_xlog_functions = 1

# recovery.conf on the standby; walmgr's xrestore fetches WAL segments from the archive
restore_command = '/usr/bin/walmgr /etc/walmgr/slave.ini xrestore %f "%p"'
log_restartpoints = 'true'
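# log_restartpoints makes the standby log each restartpoint it reaches, handy for watching replay progress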

# /etc/walmgr/slave.ini -- walmgr configuration for the standby
[wal-slave]
job_name = standby
logfile = /var/log/postgresql/walmgr-standby.log
slave_volume = /dev/vg-core/dbstandby
slave_data = /var/lib/postgresql/8.3/standby
slave_stop_cmd = /usr/bin/pg_ctlcluster --force 8.3 standby stop
slave_start_cmd = /usr/bin/pg_ctlcluster 8.3 standby start
slave_bin = /usr/lib/postgresql/8.3/bin
snapshot_volume = /dev/vg-core/dbindexer
snapshot_size = 20G
snapshot_data = /var/lib/postgresql/8.3/indexer
snapshot_stop_cmd = /usr/bin/pg_ctlcluster --force 8.3 indexer stop
snapshot_start_cmd = /usr/bin/pg_ctlcluster 8.3 indexer start
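# slave_volume and the snapshot_* settings above are consumed by the
# boot-snapshot / destroy-snapshot commands added by the walmgr patch below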
slave = /srv/dbstandby-walmgr
completed_wals = %(slave)s/logs.complete
partial_wals = %(slave)s/logs.partial
full_backup = %(slave)s/data.master
keep_backups = 0
archive_command =
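
The patch below, against SkyTools' walmgr script (April 2009), adds two commands on top of the configuration above. boot-snapshot pauses WAL apply on the standby, takes an LVM snapshot of its volume, mounts it, and boots a second PostgreSQL cluster (the "indexer") that replays WAL from the archive until it reaches a consistent state; destroy-snapshot stops that cluster, unmounts it, and removes the snapshot again. Judging from the archive command above, they would be invoked roughly as walmgr /etc/walmgr/slave.ini boot-snapshot and walmgr /etc/walmgr/slave.ini destroy-snapshot. The patch also changes WAL cleanup on the standby so that every segment still needed for REDO by either the standby or the snapshot cluster is kept.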
--- walmgr.orig	2009-04-08 09:04:55.000000000 +0000
+++ walmgr	2009-04-08 10:16:13.000000000 +0000
@@ -52,7 +52,7 @@
 """
 
 import os, sys, skytools, re, signal, time, traceback
-import errno, glob, ConfigParser, shutil
+import errno, glob, ConfigParser, shutil, subprocess, math
 
 MASTER = 1
 SLAVE = 0
@@ -142,6 +142,30 @@
         label.label_string = m.group(1)
     return label
 
+def get_wal_filename(timeline, wal_location, wal_segment_size):
+    upper_wal_location, lower_wal_location = [int(x, 16) for x in wal_location.split('/')]
+
+    # Lower WAL location is split into a file identifier, and the location within the file itself (depending on WAL segment size)
+    lower_wal_location_file = lower_wal_location >> int(math.log(wal_segment_size) / math.log(2))
+
+    return "%08X%08X%08X" % (timeline, upper_wal_location, lower_wal_location_file)
+
+def get_redo_filename(data_dir):
+    timeline_regexp = re.compile("^Latest checkpoint's TimeLineID:\s*(.*)$", re.MULTILINE)
+    redo_location_regexp = re.compile("^Latest checkpoint's REDO location:\s*(.*)$", re.MULTILINE)
+    wal_segment_size_regexp = re.compile("^Bytes per WAL segment:\s*(.*)$", re.MULTILINE)
+
+    pgcontrol = subprocess.Popen(["/usr/lib/postgresql/8.3/bin/pg_controldata", data_dir], stdout=subprocess.PIPE).communicate()[0]
+
+    if pgcontrol == '': return
+
+    timeline = int(timeline_regexp.search(pgcontrol).group(1))
+    redo_location = redo_location_regexp.search(pgcontrol).group(1)
+    wal_segment_size = int(wal_segment_size_regexp.search(pgcontrol).group(1))
+
+    return get_wal_filename(timeline, redo_location, wal_segment_size)
+
+
 
 class WalMgr(skytools.DBScript):
     def init_optparse(self, parser=None):
@@ -193,6 +217,8 @@
             'pause': self.slave_pause,
             'continue': self.slave_continue,
             'boot': self.slave_boot,
+            'boot-snapshot': self.slave_boot_snapshot,
+            'destroy-snapshot': self.slave_destroy_snapshot,
             'xlock': self.slave_lock_backups_exit,
             'xrelease': self.slave_resume_backups,
             'xrotate': self.slave_rotate_backups,
@@ -953,7 +979,12 @@
             # cleanup only if we don't keep backup history.
             # historic WAL files are removed during backup rotation
             self.log.debug("%s: copy done, cleanup" % srcname)
-            self.slave_cleanup(srcname)
+
+            redo_filename_standby = get_redo_filename(self.cf.get("slave_data"))
+            redo_filename_snapshot = get_redo_filename(self.cf.get("snapshot_data"))
+            redo_filename = (redo_filename_snapshot and redo_filename_snapshot < redo_filename_standby) and redo_filename_snapshot or redo_filename_standby
+            self.log.info("Deleting old WAL logs, last required file for REDO is %s" % redo_filename)
+            self.slave_cleanup(redo_filename)
 
         if os.path.isfile(partfile) and not srcfile == partfile:
             # Remove any partial files after restore. Only leave the partial if
@@ -1151,7 +1182,52 @@
                 self.exec_cmd(["cp", src_authfile, dst_authfile])
             except Exception, e:
                 self.log.warning("Unable to restore pg_auth file: %s" % e)
+
+    def slave_boot_snapshot(self):
+        if self.not_really: return
+
+        slave_volume = self.cf.get("slave_volume")
+        snapshot_volume = self.cf.get("snapshot_volume")
+        snapshot_size = self.cf.get("snapshot_size")
+        data_dir = self.cf.get("snapshot_data")
+        self.slave_pause(waitcomplete=1)
+
+        try:
+            self.exec_cmd(["/sbin/lvcreate", "--size", snapshot_size, "--snapshot", "--name", snapshot_volume, slave_volume])
+            self.exec_cmd(["/bin/mount", snapshot_volume, data_dir])
+        finally:
+            # Important: LVM snapshot has to be mounted before we continue, so standby won't delete WAL logs we still need
+            self.slave_continue()
+
+        # Remove the PID file of the slave PostgreSQL instance, if it exists
+        old_pidfile = os.path.join(data_dir, "postmaster.pid")
+        if os.path.isfile(old_pidfile):
+            os.remove(old_pidfile)
+
+        # Write new recovery.conf, to restore all WAL logs from last checkpoint to "crash"
+        recovery_conf = os.path.join(data_dir, "recovery.conf")
+        cf_file = os.path.abspath(self.cf.filename)
+        f = open(recovery_conf, "w")
+        f.write("restore_command = 'cp %s/%%f %%p'\n" % self.cf.get("completed_wals"))
+        f.close()
+
+        os.system(self.cf.get("snapshot_start_cmd"))
+
+        started_regexp = re.compile("^Database cluster state:\s*in production$", re.MULTILINE)
+        while True:
+            pgcontrol = subprocess.Popen(["/usr/lib/postgresql/8.3/bin/pg_controldata", data_dir], stdout=subprocess.PIPE).communicate()[0]
+            if started_regexp.search(pgcontrol):
+                break
+            time.sleep(1)
+
+    def slave_destroy_snapshot(self):
+        if self.not_really: return
+
+        os.system(self.cf.get("snapshot_stop_cmd"))
+        self.exec_cmd(["/bin/umount", self.cf.get("snapshot_data")])
+        self.exec_cmd(["/sbin/lvremove", "--force", self.cf.get("snapshot_volume")])
+
 
     def slave_pause(self, waitcomplete=0):
         """Pause the WAL apply, wait until last file applied if needed"""
         self.assert_valid_role(SLAVE)
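
As a sanity check on the segment-name arithmetic in get_wal_filename() above, here is a minimal standalone sketch; the REDO location value is made up, and 16 MB is PostgreSQL's default WAL segment size:

import math

def get_wal_filename(timeline, wal_location, wal_segment_size):
    # pg_controldata reports the REDO location as "upper/lower" in hex
    upper, lower = [int(x, 16) for x in wal_location.split('/')]
    # Shifting the low word by log2(segment size) drops the byte offset
    # within the segment and leaves the segment number
    segno = lower >> int(math.log(wal_segment_size) / math.log(2))
    # WAL file names: 8 hex digits each for timeline, high word, segment number
    return "%08X%08X%08X" % (timeline, upper, segno)

# REDO location 0/1A2B3C4D on timeline 1 with 16 MB segments lies in
# segment 0x1A, i.e. WAL file 00000001000000000000001A
print(get_wal_filename(1, "0/1A2B3C4D", 16 * 1024 * 1024))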