public
Created

  • Download Gist
metatar.py
Python
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105
#!/usr/bin/python
 
# Written by Nuutti Kotivuori <naked@iki.fi>
#
# This work is free. You can redistribute it and/or modify it under the
# terms of the Do What The Fuck You Want To Public License, Version 2,
# as published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
 
import sys, yaml, tarfile
 
# TODO: xz support requires python 3.3, but may work
# TODO: detect source compression format and use that in output as well
# TODO: proper command line option parsing
# TODO: ar format parsing and generation, nested metadata
 
# struct format describing seven fixed-width byte-string fields of 16, 12,
# 6, 6, 8, 10 and 2 bytes (60 bytes in total); the leading "=" selects
# standard sizes with no alignment padding.
# NOTE(review): presumably the Unix ar member header (name, mtime, uid, gid,
# mode, size, end marker) -- confirm against the ar format description.
# Nothing in the visible code references this constant yet (see the ar TODO
# above).
AR_HEADER_FMT = '=16s12s6s6s8s10s2s'
 
# XXX: monkey patch tarfile class: GNU tar leaves devmajor and
# devminor as empty if file is not device, while tarfile puts in 7
# zeros (octal). This change makes sure the fields are empty in
# tarfile as well.
def fix_tar():
    """Monkey patch ``tarfile.TarInfo._create_header`` to mimic GNU tar.

    The replacement assembles the header block from the usual fields, but
    leaves ``devmajor`` and ``devminor`` empty unless the member is a
    character or block device, instead of always writing octal numbers.

    The patch is installed as a side effect on the ``tarfile`` module, so it
    affects every archive written by this process afterwards.

    NOTE(review): this is written against the Python 2 ``tarfile``
    internals (``stn(s, length)`` and a header function that receives the
    ``TarInfo`` instance as its first argument). Python 3's ``tarfile`` uses
    different signatures -- confirm before running it there.
    """
    import struct
    from tarfile import (itn, stn, calc_chksums, POSIX_MAGIC, REGTYPE,
                         CHRTYPE, BLKTYPE, BLOCKSIZE)

    def _create_header_fixed(_, info, format):
        """Return a header block. info is a dictionary with file
        information, format must be one of the *_FORMAT constants.
        """
        # Only device nodes carry meaningful major/minor numbers; for
        # everything else the two fields are emitted empty.
        if info.get("type") in (CHRTYPE, BLKTYPE):
            devmajor = itn(info.get("devmajor", 0), 8, format)
            devminor = itn(info.get("devminor", 0), 8, format)
        else:
            devmajor = stn("", 8)
            devminor = stn("", 8)

        parts = [
            stn(info.get("name", ""), 100),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            # Checksum placeholder. It must be exactly 8 bytes wide: the
            # fields before it add up to 148 bytes, and the splice below
            # overwrites the 7 bytes starting 364 bytes before the end of
            # the block (offset 148 when BLOCKSIZE is 512). A shorter
            # placeholder shifts every later field and corrupts the header.
            " " * 8,
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100),
            stn(info.get("magic", POSIX_MAGIC), 8),
            stn(info.get("uname", ""), 32),
            stn(info.get("gname", ""), 32),
            devmajor,
            devminor,
            stn(info.get("prefix", ""), 155),
        ]

        # The "s" struct format pads the joined fields with NULs up to one
        # full block.
        buf = struct.pack("%ds" % BLOCKSIZE, "".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        # Write six octal digits plus a NUL over the placeholder, keeping
        # its eighth byte (a space) untouched.
        buf = buf[:-364] + "%06o\0" % chksum + buf[-357:]
        return buf

    tarfile.TarInfo._create_header = _create_header_fixed


fix_tar()
 
def create_metadata(srctar):
    """Collect the header metadata of every member of a tar archive.

    ``srctar`` is the path of the archive; it is opened as a stream with
    transparent compression detection (mode ``r|*``) and read once from
    start to end.

    Returns a dict with three keys: ``format`` and ``pax_headers`` as
    reported by the opened ``TarFile`` object, and ``members``, a list
    holding one dict of header attributes per archive member, in archive
    order.
    """
    recorded_fields = (
        'name', 'size', 'mtime', 'mode', 'type', 'linkname',
        'uid', 'gid', 'uname', 'gname', 'devmajor', 'devminor',
        'pax_headers',
    )
    with tarfile.open(srctar, mode='r|*') as archive:
        # Iterating the TarFile yields one TarInfo per member.
        entries = [
            dict((field, getattr(entry, field)) for field in recorded_fields)
            for entry in archive
        ]
        return {
            'format': archive.format,
            'pax_headers': archive.pax_headers,
            'members': entries,
        }
 
def apply_metadata(metadata, srctar, dsttar):
    """Write a new tar archive whose headers come from ``metadata``.

    ``metadata`` is a dict shaped like the return value of
    ``create_metadata``: ``format``, ``pax_headers`` and a ``members`` list
    of per-member header dicts. ``srctar`` is the path of the archive that
    supplies the file contents (looked up by member name) and ``dsttar`` is
    the path of the uncompressed archive to create.

    Members are written in the order they appear in ``metadata['members']``.
    Raises ``KeyError`` if a member with a non-zero size is missing from
    ``srctar``.
    """
    # Open the archive named by the srctar argument (not sys.argv) and make
    # sure it is closed again when done.
    with tarfile.open(srctar, 'r:*') as source:
        with tarfile.open(name=dsttar, mode='w|',
                          format=metadata['format'],
                          pax_headers=metadata['pax_headers']) as tar:
            for member in metadata['members']:
                info = tarfile.TarInfo(member['name'])
                info.size = member['size']
                info.mtime = member['mtime']
                info.mode = member['mode']
                info.type = member['type']
                info.linkname = member['linkname']
                info.uid = member['uid']
                info.gid = member['gid']
                info.uname = member['uname']
                info.gname = member['gname']
                info.devmajor = member['devmajor']
                info.devminor = member['devminor']
                info.pax_headers = member['pax_headers']

                # addfile() copies exactly info.size bytes, so a zero-size
                # member needs no data source. Skipping the lookup also
                # avoids the KeyError extractfile() raises for a symlink
                # whose target is not part of the source archive.
                if info.size:
                    data = source.extractfile(member['name'])
                else:
                    data = None
                tar.addfile(info, data)
 
def main():
    """Command line entry point.

    With one argument (a tar path), dump that archive's metadata to stdout
    as YAML. With two arguments (source tar, destination tar), read
    metadata as YAML from stdin and write the destination archive using the
    source archive's file contents. Any other argument count prints a usage
    message to stderr.

    Returns 0 on success and 2 on a usage error.
    """
    argv = sys.argv
    if len(argv) == 2:
        metadata = create_metadata(argv[1])
        print(yaml.dump(metadata, default_flow_style=False))
        return 0
    if len(argv) == 3:
        # SECURITY NOTE: yaml.Loader can construct arbitrary Python objects,
        # so only feed this metadata from a trusted source. The full loader
        # is kept because the dump may contain Python-specific tags that
        # safe_load rejects; the Loader is passed explicitly because recent
        # PyYAML releases no longer accept yaml.load() without one.
        metadata = yaml.load(sys.stdin, Loader=yaml.Loader)
        apply_metadata(metadata, argv[1], argv[2])
        return 0
    sys.stderr.write(
        "usage: metatar.py SRCTAR > metadata.yaml\n"
        "       metatar.py SRCTAR DSTTAR < metadata.yaml\n"
    )
    return 2


if __name__ == '__main__':
    sys.exit(main())

This is awesome. While testing it, I found a related issue that is arguably within the scope of metatar.py: if the tar file is actually a tar.gz file, then the archives being compared might have different timestamp values in their GzipFile objects. It would be nice to have a way to remove or normalize that timestamp.

Additionally, it would be great to have a similar tool for the timestamp in the 'ar' header in the *.deb file. See also http://en.wikipedia.org/wiki/Ar_%28Unix%29#File_format_details

Once we have those things done, we effectively have a script that can take two *.deb files, show these trivial metadata differences, and adjust the metadata so they are the same. This would be huge.

Also... how did you find out about the ReproducibleBuilds stuff? (-:

BTW, I strongly encourage you to "watch" the https://wiki.debian.org/ReproducibleBuilds page!

Please sign in to comment on this gist.

Something went wrong with that request. Please try again.