Last active
December 30, 2021 06:18
-
-
Save QQGoblin/137e63cacc9213cc4deae6b848fd3ac8 to your computer and use it in GitHub Desktop.
【Patroni源码阅读】Recover 流程
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Ha(object):
    """Excerpt of the HA controller; only the ``recover`` flow is shown here."""

    def recover(self):
        """Bring a stopped PostgreSQL back up and report what was decided.

        The flow, in order:

        1. Disable the watchdog.
        2. If this node holds (and can refresh) the leader lock, read
           ``master_start_timeout``; when it is 0 and a failover is possible,
           demote immediately and return.
        3. Inspect ``pg_controldata``; if the cluster state indicates an unclean
           stop and crash recovery has not been attempted yet, schedule a clean
           shutdown in single-user mode and return.
        4. Reload the cluster view from the DCS, pick the role and the node to
           follow, and schedule ``state_handler.follow`` asynchronously.

        :returns: a status message describing the action taken. In the crash
            recovery branch this is whatever ``try_run_async`` returned, or the
            message itself when that result is falsy.
        """
        # Postgres is not running and we will restart in standby mode. Watchdog is not needed until we promote.
        self.watchdog.disable()

        # Without the leader lock there is no start timeout to honour.
        timeout = None
        if self.has_lock() and self.update_lock():
            # master_start_timeout: how long (in seconds) the master is allowed to recover from a failure
            # before a failover is triggered; the default is 300 seconds.
            # When it is 0, fail over immediately after a crash is detected, if possible. With asynchronous
            # replication a failover may lose transactions.
            # Worst-case failover time for a master failure is loop_wait + master_start_timeout + loop_wait,
            # unless master_start_timeout is zero.
            # NOTE: during the master_start_timeout window Patroni keeps trying to bring the pg service back.
            timeout = self.patroni.config['master_start_timeout']
            # We are requested to prefer failing over to restarting master. But see first if there
            # is anyone to fail over to.
            if timeout == 0 and self.is_failover_possible(self.cluster.members):
                logger.info("Master crashed. Failing over.")
                self.demote('immediate')
                return 'stopped PostgreSQL to fail over after a crash'

        controldata = self.state_handler.controldata()
        logger.info('pg_controldata:\n%s\n',
                    '\n'.join(' {0}: {1}'.format(key, value) for key, value in controldata.items()))

        unclean_states = ('in production', 'shutting down', 'in crash recovery')
        if controldata.get('Database cluster state') in unclean_states \
                and not self._crash_recovery_executed \
                and (self.cluster.is_unlocked() or self._rewind.can_rewind):
            # Per the original author's note: self._rewind.can_rewind is true when use_pg_rewind=true
            # is configured.
            # Flag that crash recovery has been kicked off; _crash_recovery_started records when it began.
            self._crash_recovery_executed = True
            self._crash_recovery_started = time.time()
            msg = 'doing crash recovery in a single user mode'
            return self._async_executor.try_run_async(msg, self._rewind.ensure_clean_shutdown) or msg

        self.load_cluster_from_dcs()

        role = 'replica'
        if self.is_standby_cluster() or not self.has_lock():
            # Per the original author's note: self._rewind.executed means the rewind state is
            # SUCCESS/FAILED.
            if not self._rewind.executed:
                # Per the original author's note: if remove_data_directory_on_diverged_timelines=true or
                # use_pg_rewind=true is set and the rewind state is not NEED, the state becomes CHECK.
                self._rewind.trigger_check_diverged_lsn()

            msg = self._handle_rewind_or_reinitialize()
            if msg:
                return msg

            if self.has_lock():  # in standby cluster
                msg = "starting as a standby leader because i had the session lock"
                role = 'standby_leader'
                node_to_follow = self._get_node_to_follow(self.cluster)
            elif self.is_standby_cluster() and self.cluster.is_unlocked():
                msg = "trying to follow a remote master because standby cluster is unhealthy"
                node_to_follow = self.get_remote_master()
            else:
                msg = "starting as a secondary"
                node_to_follow = self._get_node_to_follow(self.cluster)
        elif self.has_lock():
            # This node holds the leader lock, so it starts on its own without following anyone.
            # Per the original author's note: this is effectively the leader demoting itself; at the
            # start of the next loop, because recovering is True, post_recovery releases the lock.
            msg = "starting as readonly because i had the session lock"
            node_to_follow = None

        follow_args = (node_to_follow, role, timeout)
        scheduled = self._async_executor.try_run_async('restarting after failure',
                                                       self.state_handler.follow,
                                                       args=follow_args)
        # Only a None result from try_run_async marks the node as recovering.
        if scheduled is None:
            self.recovering = True
        return msg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment