Skip to content

Instantly share code, notes, and snippets.

@QQGoblin
Last active December 30, 2021 06:18
Show Gist options
  • Save QQGoblin/137e63cacc9213cc4deae6b848fd3ac8 to your computer and use it in GitHub Desktop.
【Patroni源码阅读】Recover 流程 (Reading the Patroni source: the Recover flow)
class Ha(object):
    def recover(self):
        """Attempt to bring a non-running PostgreSQL back up after a failure.

        Depending on lock ownership and cluster state this either: fails over
        immediately (master_start_timeout == 0 and a candidate exists), runs
        single-user crash recovery, or restarts PostgreSQL asynchronously as a
        replica / standby_leader / read-only master.

        Returns a human-readable status message describing the action taken.
        """
        # Postgres is not running and we will restart in standby mode. Watchdog is not needed until we promote.
        self.watchdog.disable()
        if self.has_lock() and self.update_lock():
            # master_start_timeout: how long (in seconds) a failed master is given to
            # recover before a failover is triggered; the default is 300 seconds.
            # When set to 0, failover happens as soon as the crash is detected, if
            # possible. With asynchronous replication a failover may lose transactions.
            # Worst-case failover time on master failure is
            # loop_wait + master_start_timeout + loop_wait, unless master_start_timeout is zero.
            # NOTE: while master_start_timeout has not elapsed, Patroni keeps trying to
            # recover the PostgreSQL service on this node.
            timeout = self.patroni.config['master_start_timeout']
            if timeout == 0:
                # We are requested to prefer failing over to restarting master. But see first if there
                # is anyone to fail over to.
                if self.is_failover_possible(self.cluster.members):
                    logger.info("Master crashed. Failing over.")
                    self.demote('immediate')
                    return 'stopped PostgreSQL to fail over after a crash'
        else:
            # Not the lock owner: no master_start_timeout applies to a replica restart.
            timeout = None

        data = self.state_handler.controldata()
        logger.info('pg_controldata:\n%s\n', '\n'.join(' {0}: {1}'.format(k, v) for k, v in data.items()))
        if data.get('Database cluster state') in ('in production', 'shutting down', 'in crash recovery') \
                and not self._crash_recovery_executed and \
                (self.cluster.is_unlocked() or self._rewind.can_rewind):
            # self._rewind.can_rewind is true when use_pg_rewind=true is configured.
            # Mark that a crash-recovery operation is in progress;
            # _crash_recovery_started records when it began.
            self._crash_recovery_executed = True
            self._crash_recovery_started = time.time()
            msg = 'doing crash recovery in a single user mode'
            return self._async_executor.try_run_async(msg, self._rewind.ensure_clean_shutdown) or msg

        self.load_cluster_from_dcs()

        role = 'replica'
        if self.is_standby_cluster() or not self.has_lock():
            # self._rewind.executed means self._rewind ended up in SUCCESS/FAILED state.
            if not self._rewind.executed:
                # If remove_data_directory_on_diverged_timelines=true, or use_pg_rewind=true,
                # and the self._rewind state is not NEED, then the self._rewind state is
                # set to CHECK.
                self._rewind.trigger_check_diverged_lsn()
            msg = self._handle_rewind_or_reinitialize()
            if msg:
                return msg

            if self.has_lock():  # in standby cluster
                msg = "starting as a standby leader because i had the session lock"
                role = 'standby_leader'
                node_to_follow = self._get_node_to_follow(self.cluster)
            elif self.is_standby_cluster() and self.cluster.is_unlocked():
                msg = "trying to follow a remote master because standby cluster is unhealthy"
                node_to_follow = self.get_remote_master()
            else:
                msg = "starting as a secondary"
                node_to_follow = self._get_node_to_follow(self.cluster)
        elif self.has_lock():
            # This node holds the leader lock, so it starts on its own.
            # Effectively the leader self-demotes here: at the start of the next loop,
            # because `recovering` has been set to True, post_recovery releases the lock.
            msg = "starting as readonly because i had the session lock"
            node_to_follow = None

        if self._async_executor.try_run_async('restarting after failure', self.state_handler.follow,
                                              args=(node_to_follow, role, timeout)) is None:
            self.recovering = True
        return msg
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment