Skip to content

Instantly share code, notes, and snippets.

@jmendeth jmendeth/
Created Jul 15, 2019

What would you like to do?
Quick & dirty utility for diffing filesystems
import json
import os
import re
# Ignores differences on folder mode / ownership
# Collapses "all contents are new" into "whole folder is new"
# Requires both trees to agree on each entry's type (e.g. link vs dir); a mismatch aborts the run
# Compares owners by user / group name, looked up in each system's passwd / group file, instead of by raw UID / GID
def _load_id_table(path):
    """Build an ``{numeric id: name}`` mapping from a colon-separated file.

    Field 0 of every line is taken as the name and field 2 as the integer id,
    which is how both the passwd and the group files below are read.  Blank
    lines and ``#`` comment lines are skipped instead of raising.
    """
    table = {}
    with open(path) as f:
        for line in f:
            if not line.strip() or line.startswith("#"):
                continue
            fields = line.split(":")
            table[int(fields[2])] = fields[0]
    return table


# "new" tables describe the running system, "old" ones the system mounted
# under /other; match_attrs() uses them to compare owners by name.
new_uid_table = _load_id_table("/etc/passwd")
old_uid_table = _load_id_table("/other/etc/passwd")
new_gid_table = _load_id_table("/etc/group")
old_gid_table = _load_id_table("/other/etc/group")

# Tree snapshots previously written by the generator script (process_file).
with open("root-tree.json", "r") as f:
    new = json.load(f)
with open("other-tree.json", "r") as f:
    old = json.load(f)
# Exact tree-relative paths (e.g. "/proc") that are skipped entirely; tested
# with ``relpath in EXCLUDE``.
# NOTE(review): the original path list did not survive in this copy of the
# file -- the literal is closed here as an empty list; re-add one path per
# line between the quotes.
EXCLUDE = set('''
'''.split())

# Regexes that must match the WHOLE tree-relative path (they are applied with
# fullmatch) for the entry to be skipped.
EXCLUDE_PATTERNS = list(map(re.compile, [
    r".*/__pycache__",
    r".*\.pyc",
    r"/usr/src/linux.*",
    r"/etc/rc.\.d",
]))
def match_attrs(new, old):
    """Tell whether two ``attrs`` dicts describe the same mode and owner.

    ``new`` and ``old`` are dicts with "mode", "uid" and "gid" keys.  Owners
    are compared by name: each id is translated through the id table of the
    system it came from (``new_*_table`` / ``old_*_table``).  An id that has
    no entry in its table is compared as the raw number instead of raising
    ``KeyError``, so files owned by removed accounts can still be diffed.

    Returns True when mode, user and group all match.
    """
    def owner(table, ident):
        # Fall back to the numeric id when the account is unknown.
        return table.get(ident, ident)

    return (
        new["mode"] == old["mode"]
        and owner(new_uid_table, new["uid"]) == owner(old_uid_table, old["uid"])
        and owner(new_gid_table, new["gid"]) == owner(old_gid_table, old["gid"])
    )
def accumulate(seen):
    """Reduce the set of diff kinds found among children to a single kind.

    Returns "equal" for an empty set, the sole member when the set holds
    exactly one kind other than "multiple" / "delete", and "multiple" in
    every other case.
    """
    if not seen:
        return "equal"
    mixed = "multiple" in seen or "delete" in seen
    if mixed or len(seen) != 1:
        return "multiple"
    (only_kind,) = seen
    return only_kind
# Marker printed in front of each reported path, keyed by diff kind.
_DIFF_MARKERS = {"add": "+", "change": ":", "delete": "-", "equal": " "}


def process_tree(new, old, relpath="/"):
    """Recursively diff two tree nodes and report differences to ``output``.

    ``new`` / ``old`` are node dicts as produced by the generator script
    (keys "type" plus "files", "link" or "hash", and usually "attrs"), or
    None when the entry does not exist on that side.  ``relpath`` is the
    entry's path relative to the tree root.

    Returns ``{"diff": kind}`` with kind one of "add", "delete", "change",
    "equal" or "multiple", or None when the path is excluded or the node type
    is not one of dir / link / file.

    A directory whose children all share one kind is collapsed into that kind
    and nothing is printed for it; only when the children are mixed
    ("multiple") are the added / changed children written out individually.
    Directory mode and ownership are not compared.
    """
    if relpath in EXCLUDE or any(p.fullmatch(relpath) for p in EXCLUDE_PATTERNS):
        return None
    if not old:
        return {"diff": "add"}
    if not new:
        return {"diff": "delete"}
    # Type changes (e.g. link vs dir) are not supported by this tool.
    assert new["type"] == old["type"]
    kind = new["type"]

    if kind == "dir":
        names = set(new["files"]) | set(old["files"])
        results = []
        seen = set()
        for name in sorted(names):
            child_path = os.path.join(relpath, name)
            result = process_tree(new["files"].get(name), old["files"].get(name), child_path)
            if result is None:
                continue
            results.append((child_path, result["diff"]))
            # Record the kind so accumulate() can summarise this directory;
            # without this every directory would be reported as "equal".
            seen.add(result["diff"])
        gdiff = accumulate(seen)
        if gdiff == "multiple":
            for child_path, child_diff in results:
                # "multiple" children already printed their own contents;
                # equal and deleted entries are not listed.
                if child_diff not in {"multiple", "equal", "delete"}:
                    print(_DIFF_MARKERS[child_diff], child_path, file=output)
        return {"diff": gdiff}

    if kind == "link":
        # Link nodes may lack "attrs" (process_file emits none for links), so
        # attributes are only compared when both sides provide them.
        if "attrs" in new and "attrs" in old:
            same_attrs = match_attrs(new["attrs"], old["attrs"])
        else:
            same_attrs = True
        unchanged = same_attrs and new["link"] == old["link"]
        return {"diff": "equal" if unchanged else "change"}

    if kind == "file":
        unchanged = match_attrs(new["attrs"], old["attrs"]) and new["hash"] == old["hash"]
        return {"diff": "equal" if unchanged else "change"}

    return None


# process_tree() looks up ``output`` as a module global while it runs; the
# with-block guarantees the report is flushed and closed afterwards.
with open("changes.txt", "w") as output:
    process_tree(new, old)
# Generates "root-tree.json" or "other-tree.json"
import os
import hashlib
import json
import sys
def process_file(path, relpath="/"):
    """Build the snapshot node for ``path`` (recursing into directories).

    ``path`` is the real filesystem path; ``relpath`` is the same entry's
    path relative to the scanned root and is what EXCLUDE is matched against.

    Returns a dict tagged by "type":
      * "link": with "link" (the symlink target),
      * "dir":  with "attrs" and "files" (name -> child node),
      * "file": with "attrs" and "hash" (see hash_file),
    where "attrs" holds the lstat "mode", "uid" and "gid".  Returns None for
    excluded paths, for mount points other than the root itself, and (after
    a warning on stderr) for entries that are none of the above types.
    """
    stat = os.lstat(path)
    attrs = {"mode": stat.st_mode, "uid": stat.st_uid, "gid": stat.st_gid}

    # Stay on the starting filesystem and honour the exclusion list; the
    # caller drops None results.
    if (relpath != "/" and os.path.ismount(path)) or relpath in EXCLUDE:
        return None

    # Checked before isdir/isfile because those follow symlinks.
    if os.path.islink(path):
        return {"type": "link", "link": os.readlink(path)}

    if os.path.isdir(path):
        files = {}
        for name in os.listdir(path):
            child = process_file(os.path.join(path, name), os.path.join(relpath, name))
            if child is not None:
                files[name] = child
        return {"type": "dir", "attrs": attrs, "files": files}

    if os.path.isfile(path):
        return {"type": "file", "attrs": attrs, "hash": hash_file(path)}

    print("WARNING: Can't figure out type of: {}".format(path), file=sys.stderr)
    return None
def hash_file(path):
    """Return the hexadecimal MD5 digest of the contents of ``path``.

    The file is read unbuffered into a single reusable 128 KiB buffer, so
    memory use does not grow with the size of the file.
    """
    h = hashlib.md5()
    b = bytearray(128*1024)
    mv = memoryview(b)
    with open(path, 'rb', buffering=0) as f:
        # readinto() returns how many bytes it stored; 0 means end of file
        # and stops the iteration.
        for n in iter(lambda: f.readinto(mv), 0):
            # Only the first n bytes of the buffer belong to this read.
            h.update(mv[:n])
    return h.hexdigest()
# Command-line entry point of the generator: snapshot <root folder> and write
# it as one line of JSON to <output file>.
argv = sys.argv[1:]
if len(argv) != 2:
    print("Usage: <root folder> <output file>", file=sys.stderr)
    # Stop here; continuing would fail on the missing arguments below.
    sys.exit(1)
my_tree = process_file(argv[0])
with open(argv[1], "w") as f:
    f.write(json.dumps(my_tree) + "\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.