Skip to content

Instantly share code, notes, and snippets.

Created May 11, 2014 07:13
Show Gist options
  • Save pudquick/f9a4c12a1fba19ae723e to your computer and use it in GitHub Desktop.
Save pudquick/f9a4c12a1fba19ae723e to your computer and use it in GitHub Desktop.
This Python project retrieves the PackageInfo metadata from flat package (.pkg) files over HTTP without downloading the entire file, provided the web server hosting it supports partial file transfers (HTTP byte ranges).
# Skip to the end to see what this can do.
# Flat packages are xar files with a particular structure
# We're looking for the PackageInfo file within the xar file
import urllib2, ctypes, zlib
import xml.etree.ElementTree as ET
class SimpleObj(object):
    """Empty attribute container.

    Instances carry no behaviour of their own; callers attach whatever
    attributes they need (for example ``heap_start`` and ``contents`` for
    a table of contents) and pass the object around as a lightweight record.
    """
    pass
class XarHeader(ctypes.BigEndianStructure):
    """Binary layout of the fixed header at the start of a xar archive.

    The structure is big-endian and packed (no padding), so its 28 bytes
    map one-to-one onto the first 28 bytes of the file and it can be
    populated directly with ``XarHeader.from_buffer_copy(raw_bytes)``.
    """
    _pack_ = 1
    _fields_ = [
        # File signature
        ('magic', ctypes.c_char * 4),
        # Length of the header in bytes; the TOC begins at this offset
        ('size', ctypes.c_uint16),
        # Format version
        ('version', ctypes.c_uint16),
        # Length of the zlib-compressed TOC as stored in the file
        ('toc_length_compressed', ctypes.c_uint64),
        # Length of the TOC once decompressed
        ('toc_length_uncompressed', ctypes.c_uint64),
        # Checksum algorithm identifier (the checksum itself is ignored here)
        ('chksum_alg', ctypes.c_uint32),
    ]
def request_bytes(url, start=0, length=1):
    """Fetch a slice of the resource at ``url`` using an HTTP Range request.

    Parameters:
        url:    location of the remote file.
        start:  offset of the first byte to fetch (0-based).
        length: number of bytes to ask for.

    Returns the raw body of the response.
    """
    req = urllib2.Request(url)
    # The end position of a byte range is inclusive, hence the -1
    req.add_header('Range', 'bytes=%s-%s' % (start, start + length - 1))
    f = urllib2.urlopen(req)
    try:
        data = f.read()
    finally:
        # Release the connection even if reading the body fails
        f.close()
    return data
def retrieve_TOC_offset(url):
    """Locate the compressed table of contents of the xar file at ``url``.

    Only the fixed-size header is downloaded.

    Returns a ``(offset, length)`` tuple: the position of the compressed
    TOC within the file and its length in bytes.

    Raises ValueError if the data at ``url`` does not start with a xar
    header.
    """
    # Header is a minimum 28 byte structure
    # Beyond that (up to 64 bytes total) is a checksum we're ignoring
    # Grab just the fixed part from the beginning
    header_raw = request_bytes(url, length=ctypes.sizeof(XarHeader))
    # Parse it as a xar header
    header = XarHeader.from_buffer_copy(header_raw)
    # Refuse to interpret the length fields of something that is not a xar
    if header.magic != b'xar!':
        raise ValueError('Not a xar archive (bad magic %r): %s'
                         % (header.magic, url))
    # Use the header length field to calculate TOC offset
    # Use the header toc_length_compressed field for TOC length
    return (header.size, header.toc_length_compressed)
def retrieve_TOC(url):
    """Download and decompress the table of contents of the xar at ``url``.

    Returns an object with two attributes:
        contents:   the decompressed TOC XML.
        heap_start: file offset of the first byte after the compressed
                    TOC, i.e. where the xar heap begins.
    """
    # TOC is a zlib compressed XML(-ish) structure
    # Location within the xar is within the header
    TOC_start, TOC_length = retrieve_TOC_offset(url)
    compressed_TOC = request_bytes(url, start=TOC_start, length=TOC_length)
    # Decompress raw zlib compressed stream
    decompressor = zlib.decompressobj()
    # Return TOC XML
    # You should really check out the raw XML - lots of fun stuff in it!
    TOC = SimpleObj()
    # The heap starts immediately after the compressed TOC
    TOC.heap_start = TOC_start + TOC_length
    TOC.contents = decompressor.decompress(compressed_TOC)
    return TOC
def retrieve_PackageInfo_offsets(url):
    """Find where each PackageInfo file is stored inside the xar at ``url``.

    Returns an object with two attributes:
        TOC:     the table of contents object from ``retrieve_TOC``.
        offsets: dict mapping a package name to an
                 ``(encoding, offset, length)`` tuple, where ``offset`` is
                 relative to the start of the xar heap. For a single
                 (non-nested) package the only key is ``None``.
    """
    toc = retrieve_TOC(url)
    toc_root = ET.fromstring(toc.contents)
    root_files = toc_root.find('toc').findall('file')
    root_names = [x.find('name').text for x in root_files]
    info_files = []
    if 'PackageInfo' in root_names:
        # Single package
        for name, entry in zip(root_names, root_files):
            if name == 'PackageInfo':
                # Name of the .pkg is None, there's only the file itself
                info_files.append((None, entry))
    else:
        # Multiple packages, need to find PackageInfo for each
        for file_name, sub_pkg in zip(root_names, root_files):
            if not file_name.lower().endswith('.pkg'):
                continue
            for x in sub_pkg.findall('file'):
                if x.find('name').text == 'PackageInfo':
                    # Remember name of the sub-pkg
                    info_files.append((file_name, x))
    # Now process the info_files
    # Data: package name: encoding, offset, length
    PackageInfo = SimpleObj()
    PackageInfo.TOC = toc
    PackageInfo.offsets = dict()
    for pkg_name, i_file in info_files:
        data = i_file.find('data')
        encoding = data.find('encoding').attrib['style']
        offset = int(data.find('offset').text)
        length = int(data.find('length').text)
        PackageInfo.offsets[pkg_name] = (encoding, offset, length)
    return PackageInfo
def retrieve_PackageInfo(url, pkg_name=None):
    """Download and decompress PackageInfo file(s) from the xar at ``url``.

    Parameters:
        url:      location of the flat package.
        pkg_name: which PackageInfo to fetch:
                  * None (default): for when there's one package
                  * single string: one sub-package to pull information about
                  * list of strings: multiple sub-packages to pull

    Returns the decompressed PackageInfo contents when ``pkg_name`` is not
    a list, otherwise a list of contents in the same order as ``pkg_name``.

    Raises KeyError if a requested package is not present, and Exception
    if a PackageInfo is stored with an unsupported encoding.
    """
    # All of the PackageInfo offsets are retrieved
    PackageInfo = retrieve_PackageInfo_offsets(url)
    multiple = isinstance(pkg_name, list)
    if multiple:
        packages = pkg_name
    else:
        packages = [pkg_name]
    results = []
    for a_pkg in packages:
        encoding, offset, length = PackageInfo.offsets[a_pkg]
        # Offset is stored from the beginning of the XAR heap
        offset += PackageInfo.TOC.heap_start
        if encoding not in ['application/x-gzip', 'application/zlib']:
            raise Exception('Only gzip/zlib decompression support so far (add more!)')
        compressed_PackageInfo = request_bytes(url, start=offset, length=length)
        decompressor = zlib.decompressobj()
        results.append(decompressor.decompress(compressed_PackageInfo))
    if not multiple:
        return results[0]
    return results
# >>> retrieve_PackageInfo_offsets(SAMPLE_URL).offsets.keys()
# ['r.pkg', 'tcltk8.pkg', 'r-1.pkg']
# ( v-- 5875 *bytes* of data + headers for 4 HTTP requests - for a 68 MB file)
# >>> for x in retrieve_PackageInfo(SAMPLE_URL, ['r.pkg', 'r-1.pkg']):
# ... print '\n\n' + x
# <pkg-info format-version="2" identifier="org.r-project.R.mavericks.fw.pkg" version="30165387" install-location="/Library/Frameworks" auth="root">
# <payload installKBytes="89740" numberOfFiles="3149"/>
# <scripts>
# <postinstall file="./postinstall"/>
# </scripts>
# <bundle id="org.r-project.R-framework" CFBundleIdentifier="org.r-project.R-framework" path="./R.framework" CFBundleVersion="3.1.0">
# <bundle id="org.r-project.R-framework" CFBundleIdentifier="org.r-project.R-framework" path="./Versions/3.1" CFBundleVersion="3.1.0">
# <bundle id="" CFBundleIdentifier="" path="./Resources/lib/libR.dylib.dSYM" CFBundleVersion="3.1.0"/>
# <bundle id="" CFBundleIdentifier="" path="./Resources/lib/libRblas.dylib.dSYM" CFBundleVersion="3.1.0"/>
# <bundle id="" CFBundleIdentifier="" path="./Resources/lib/libRlapack.dylib.dSYM" CFBundleVersion="3.1.0"/>
# </bundle>
# </bundle>
# <bundle-version>
# <bundle id="org.r-project.R-framework"/>
# <bundle id="org.r-project.R-framework"/>
# <bundle id=""/>
# <bundle id=""/>
# <bundle id=""/>
# </bundle-version>
# </pkg-info>
# <pkg-info format-version="2" identifier="org.r-project.R.mavericks.GUI.pkg" version="1.64" install-location="/Applications" auth="root">
# <payload installKBytes="3888" numberOfFiles="137"/>
# <bundle id="org.R-project.R" CFBundleIdentifier="org.R-project.R" path="./" CFBundleVersion="6734"/>
# <bundle-version>
# <bundle id="org.R-project.R"/>
# </bundle-version>
# </pkg-info>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment