Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save barthalion/644b1314c15362809411d39c11098101 to your computer and use it in GitHub Desktop.
Save barthalion/644b1314c15362809411d39c11098101 to your computer and use it in GitHub Desktop.
#!/usr/bin/python3 -u
# -*- mode: Python; coding: utf-8 -*-
# Fix issues with missing OSTree objects
#
# Copyright (C) 2017 Endless Mobile, Inc.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Fix issues with OSTree missing objects
When OSTree repository objects have been inadvertently deleted, it can
cause two types of problems (among others):
1. If the deleted object is part of a commit, then the commit is now
partial, but OSTree doesn't know that unless a commitpartial file
exists. Without that, it will assume the commit is fully intact and use
it as the source for a static delta.
2. If the deleted object is a commit, then any references to it will be
dangling. This will cause errors since OSTree assumes that a referenced
commit exists and will raise an error as soon as the reference is used.
This script attempts to address these 2 issues by repulling the commits
for any dangling references and marking any commits with missing objects
as partial.
To guard against another program operating on the repository, all
processes that have the repository open are killed.
Some commands for testing this script when hacking on it (from a throwaway
working system):
```
for objtype in commit dirtree dirmeta file; do
sudo find /ostree/repo/objects -type f -name "*.${objtype}" -print -delete -quit
done
sudo eos-fix-ostree-repo
sudo ostree fsck
```
Make sure to test on normal and split-disk systems.
"""
from argparse import ArgumentParser
from fnmatch import fnmatch
import gi
gi.require_version('OSTree', '1.0')
from gi.repository import GLib, Gio, OSTree
import os
import pwd
import signal
import stat
import sys
import time
# OSTree.RepoCommitState.PARTIAL compatibility shim.
#
# Older OSTree versions had the GI annotation wrong, and the enum
# value was OSTree.RepoCommitState.REPO_COMMIT_STATE_PARTIAL while
# recent versions have OSTree.RepoCommitState.PARTIAL [1].
#
# The rest of this script only uses the new name, so alias it to the
# old value when it is missing. hasattr() is used instead of scanning
# dir() because dir() is not guaranteed to list every attribute that
# can actually be resolved on an object.
#
# [1] See https://github.com/ostreedev/ostree/pull/1335
if not hasattr(OSTree.RepoCommitState, 'PARTIAL'):
    OSTree.RepoCommitState.PARTIAL = \
        OSTree.RepoCommitState.REPO_COMMIT_STATE_PARTIAL
# ostree_repo_list_objects compatibility shim.
#
# Before ostree-2017.1 the GI annotation for ostree_repo_list_objects
# was wrong, which made it unusable from bindings [1]. Version checking
# was not added until ostree-2017.3, so the affected releases are hard
# to detect directly. The same commit, however, also fixed
# ostree_repo_list_commit_objects_starting_with so that its out_commits
# parameter is marked properly.
#
# The direction of that argument is therefore used as a proxy: when it
# is still reported as an "in" argument, a pure Python list_objects is
# monkey-patched into the Repo class.
#
# 1. https://github.com/ostreedev/ostree/commit/300752e5
_func = OSTree.Repo.list_commit_objects_starting_with
if _func.get_arguments()[1].get_direction() == gi._gi.Direction.IN:
    import glob

    def _list_objects(self, flags, cancellable=None):
        """Fake ostree_repo_list_objects

        Scan the loose objects on disk and return (True, objects),
        where objects maps each serialized object name to a (bas)
        variant. The flags and cancellable arguments are accepted to
        keep the signature compatible but are not used. See
        https://github.com/ostreedev/ostree/blob/master/src/libostree/ostree-repo.c
        for the real implementation.
        """
        repo_root = self.get_path().get_path()
        is_archive = self.get_mode() == OSTree.RepoMode.ARCHIVE_Z2

        # Content objects are named *.filez in archive-z2 repos and
        # *.file in every other mode; the other extension is ignored.
        type_by_ext = {
            '.filez' if is_archive else '.file': OSTree.ObjectType.FILE,
            '.dirtree': OSTree.ObjectType.DIR_TREE,
            '.dirmeta': OSTree.ObjectType.DIR_META,
            '.commit': OSTree.ObjectType.COMMIT,
        }

        found = {}
        # Objects live in the objects directory split after the 2nd
        # character of their sha256sum. E.g.,
        # 8d/2925839245dd91ac9fbfcc0e7a383cddf5145bd7c5bc5de0d46929a3fa5963.file.
        for bucket in glob.iglob(repo_root + '/objects/[a-f0-9][a-f0-9]'):
            prefix = os.path.basename(bucket)
            for filename in os.listdir(bucket):
                stem, suffix = os.path.splitext(filename)
                if len(stem) != 62:
                    # Not the remaining 62 characters of a sha256
                    continue
                kind = type_by_ext.get(suffix)
                if kind is None:
                    # No extension, or not an object type handled here
                    continue

                # The key is the serialized object name and the value
                # is always the same (bas) variant: True plus an empty
                # string array.
                serialized = OSTree.object_name_serialize(prefix + stem,
                                                          kind)
                found[serialized] = GLib.Variant.new_tuple(
                    GLib.Variant('b', True),
                    GLib.Variant('as', [])
                )
        return True, found

    # Override the standard list_objects
    OSTree.Repo.list_objects = _list_objects
def kill_repo_procs(repo_path, sig):
    """Kill all processes with repo open

    Walk /proc to find any process, other than this one, that has
    repo_path or a path beneath it open as a file descriptor, and send
    it signal sig. Each matching process is signalled once.

    Processes that exit while being inspected, or whose descriptors
    cannot be read by this user, are skipped. An error from os.kill
    other than "no such process" is propagated to the caller.
    """
    print('Killing processes with', repo_path, 'open with signal', sig)
    self_pid = os.getpid()
    repo_prefix = repo_path + '/'

    # Errors that mean "this process cannot be inspected": it exited,
    # or it belongs to another user and we lack the privileges.
    skip_errors = (FileNotFoundError, ProcessLookupError, PermissionError)

    for pid in os.listdir('/proc'):
        if not pid.isnumeric():
            continue
        if int(pid) == self_pid:
            continue

        proc_dir = os.path.join('/proc', pid)
        fd_dir = os.path.join(proc_dir, 'fd')

        # The process may have exited
        try:
            proc_fds = os.listdir(fd_dir)
        except skip_errors:
            continue

        for fd in proc_fds:
            # The process may have exited or the file may have been closed
            try:
                fd_path = os.readlink(os.path.join(fd_dir, fd))
            except skip_errors:
                continue

            # Only act if the open file is the repo or a path within it
            if fd_path != repo_path and not fd_path.startswith(repo_prefix):
                continue

            # Try to read the exe file for information, but in some
            # cases (kernel thread), it may not exist
            try:
                pid_exe = os.readlink(os.path.join(proc_dir, 'exe'))
            except OSError:
                pid_exe = ''

            # Kill it and go to the next process
            print('Killing pid', pid, pid_exe, 'with signal', sig)
            try:
                os.kill(int(pid), sig)
            except ProcessLookupError:
                # It exited between the fd scan and the kill; that is
                # the outcome we wanted anyway.
                pass
            break
def pull_commit(repo, remote, checksum, full=False):
    """Pull commit from remote

    Pull the commit identified by checksum from remote into repo with a
    depth of 0. When full is False, only the commit metadata will be
    pulled.

    Any error raised by the pull is propagated to the caller; the
    progress object is finished either way.
    """
    if full:
        flags = OSTree.RepoPullFlags.NONE
    else:
        flags = OSTree.RepoPullFlags.COMMIT_ONLY

    opts = GLib.Variant('a{sv}', {
        'flags': GLib.Variant('i', flags),
        'refs': GLib.Variant('as', (checksum,)),
        'depth': GLib.Variant('i', 0),
    })

    # FIXME: For some reason, pull_with_options cannot be stopped with
    # ^C from the keyboard (SIGINT). This could be a problem in ostree
    # or pygobject, but I suspect it has something to do with what pull
    # does with the main context.
    progress = OSTree.AsyncProgress.new()
    progress.connect('changed',
                     OSTree.Repo.pull_default_console_progress_changed,
                     None)
    try:
        repo.pull_with_options(remote, opts, progress)
    finally:
        # Always finish the progress object, even when the pull fails,
        # so it is not left active behind a propagating exception.
        progress.finish()
def fix_dangling_refs(repo):
    """Update repo refs where the commit is missing

    Every ref in repo whose commit object cannot be loaded gets a
    commit metadata only pull, so that the ref is valid again. Load
    errors other than "not found" are re-raised.
    """
    resolved_repo = os.path.realpath(repo.get_path().get_path())
    print('Fixing refs pointing to missing commits in', resolved_repo)

    _, refs = repo.list_refs()
    for refspec, checksum in refs.items():
        try:
            repo.load_commit(checksum)
        except GLib.Error as err:
            commit_missing = err.matches(Gio.io_error_quark(),
                                         Gio.IOErrorEnum.NOT_FOUND)
            if not commit_missing:
                raise

            # The commit is gone: try to pull its metadata again.
            _, remote, ref = OSTree.parse_refspec(refspec)
            if remote is None:
                # If there's no remote, assume it's an ostree ref and
                # use "eos" as the remote.
                print('No remote for ref', ref, 'assuming "eos"')
                remote = 'eos'

            print('Pulling', checksum, 'commit metadata from', remote,
                  'for', ref)
            pull_commit(repo, remote, checksum)
def _commit_has_missing_objects(repo, checksum, all_objects):
    """Return True if any object reachable from commit checksum is missing"""
    try:
        # If a dirtree is missing, traverse_commit will fail with
        # G_IO_ERROR_NOT_FOUND.
        _, reachable_objects = repo.traverse_commit(checksum, 0)
    except GLib.Error as err:
        if not err.matches(Gio.io_error_quark(),
                           Gio.IOErrorEnum.NOT_FOUND):
            raise
        return True

    # Unfortunately, traverse_commit doesn't check that the leaves
    # (dirmeta and files) exist, so we need to do that manually. In
    # case that behavior ever changes, just check that all the
    # reachable objects exist.
    #
    # https://github.com/ostreedev/ostree/issues/1222
    return any(obj not in all_objects for obj in reachable_objects)


def mark_commits_partial(repo):
    """Mark commits with missing objects as partial

    For every commit object in repo that is not already partial, check
    that all objects reachable from it exist. If any is missing, create
    an empty state/<checksum>.commitpartial file in the repo.
    """
    repo_path = os.path.realpath(repo.get_path().get_path())
    print('Marking commits with missing objects as partial in', repo_path)

    state_dir = os.path.join(repo_path, 'state')
    _, all_objects = repo.list_objects(OSTree.RepoListObjectsFlags.ALL, None)
    for objname in all_objects:
        checksum, objtype = OSTree.object_name_deserialize(objname)
        if objtype != OSTree.ObjectType.COMMIT:
            continue

        _, _, state = repo.load_commit(checksum)
        # The commit state is a set of flags, so test the PARTIAL bit
        # rather than comparing for equality; other bits may be set too.
        if state & OSTree.RepoCommitState.PARTIAL:
            print('Commit', checksum, 'already marked as partial')
            continue

        if _commit_has_missing_objects(repo, checksum, all_objects):
            print('Marking commit', checksum, 'as partial')
            commit_partial_path = os.path.join(state_dir,
                                               checksum + '.commitpartial')
            # Only the existence of the file matters; leave it empty.
            with open(commit_partial_path, 'w'):
                pass
def pull_partial_refs(repo):
    """Try to fully restore any partial referenced commits

    Walk all refs in repo and do a full pull for each remote ref whose
    commit is marked partial. Local refs and Locale refs are skipped.
    """
    # Look for any partial refs and re-pull them.
    _, all_refs = repo.list_refs()
    for refspec, checksum in all_refs.items():
        _, remote, ref = OSTree.parse_refspec(refspec)
        if remote is None:
            # Don't bother pulling local refs. Only the ostree deploys
            # are local, and as long as they're marked partial, they can
            # be updated later.
            continue

        # If this is an app or runtime locale, it's intentionally
        # partial since only the relevant subpaths are pulled. Skip it
        # to not use up extra bandwidth and disk space.
        if fnmatch(ref, '*/*.Locale/*/*'):
            print('Skipping intentionally partial Locale commit',
                  refspec, checksum)
            continue

        _, _, state = repo.load_commit(checksum)
        # The commit state is a set of flags, so test the PARTIAL bit.
        # An equality comparison would skip a partial commit that has
        # any additional state bit set, and it would never be re-pulled.
        if not state & OSTree.RepoCommitState.PARTIAL:
            continue

        # Try to pull the full commit again.
        print('Pulling', checksum, 'commit from', remote, 'for', ref)
        pull_commit(repo, remote, checksum, full=True)
def main():
    """Repair the OSTree repo selected on the command line

    The repo is taken from --repo, from the sysroot given by --sysroot,
    or from the default sysroot when neither is given. Processes that
    have the repo open are killed, the sysroot (if any) is locked, and
    then the three repair passes run in order: fix dangling refs, mark
    incomplete commits as partial, and re-pull partial remote refs.

    Exits with status 1 if the sysroot cannot be locked or the repo's
    tmp/cache directory is missing.
    """
    aparser = ArgumentParser(
        description='Fix broken OSTree repo'
    )
    path_group = aparser.add_mutually_exclusive_group()
    path_group.add_argument('--sysroot', help='path to OSTree sysroot')
    path_group.add_argument('--repo', help='path to OSTree repo')
    args = aparser.parse_args()

    print('WARNING: Do not start App Center while this is running')

    if args.repo is not None:
        # Use a repo directly instead of getting it from the sysroot
        sysroot = None
        repo_file = Gio.File.new_for_path(args.repo)
        repo = OSTree.Repo.new(repo_file)
        repo.open()
    else:
        # Get the repo from the sysroot
        if args.sysroot is None:
            sysroot_file = None
        else:
            sysroot_file = Gio.File.new_for_path(args.sysroot)
        sysroot = OSTree.Sysroot.new(sysroot_file)
        sysroot.load()
        _, repo = sysroot.get_repo()

    # Resolve the full repo path
    repo_path = os.path.realpath(repo.get_path().get_path())

    # Kill once with SIGTERM, then with SIGKILL
    kill_repo_procs(repo_path, signal.SIGTERM)
    time.sleep(1)
    kill_repo_procs(repo_path, signal.SIGKILL)

    # Now lock the sysroot if one is in use
    if sysroot is not None:
        lock_result = sysroot.try_lock()
        # try_lock has an out parameter, so the binding is expected to
        # return a (success, acquired) tuple like the other calls in
        # this script. A non-empty tuple is always truthy, so the
        # acquired flag has to be picked out explicitly. A plain
        # boolean result is still accepted.
        if isinstance(lock_result, tuple):
            acquired = lock_result[-1]
        else:
            acquired = lock_result
        if not acquired:
            print('Could not lock sysroot', sysroot.get_path().get_path(),
                  file=sys.stderr)
            sys.exit(1)

    # In older OSTree, cleaning up after a transaction (e.g., a pull)
    # would delete the tmp/cache directory if it was older than 1 day.
    # That's a problem because it has an open fd for that directory.
    # Update the directory's mtime to current. This is racy because
    # other repo users may have deleted the directory after we opened
    # the repo and before they were killed, so just fail if the
    # directory doesn't exist.
    cache_dir = os.path.join(repo_path, 'tmp', 'cache')
    try:
        os.utime(cache_dir)
    except FileNotFoundError:
        print(cache_dir, 'does not exist - run', sys.argv[0], 'again!',
              file=sys.stderr)
        sys.exit(1)

    # First, fix dangling refs so that refs can be reliably listed again
    fix_dangling_refs(repo)

    # Next, traverse all commits to mark any as partial
    mark_commits_partial(repo)

    # Finally, try to completely pull in any partial referenced commits
    # so there are no longer any missing objects
    pull_partial_refs(repo)

    print('\nSuccess! Try to update the OS and Apps now.')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment