Skip to content

Instantly share code, notes, and snippets.

@pombredanne
Last active April 25, 2025 17:12
Show Gist options
  • Save pombredanne/18d69ff95a44ffcab04af2f9d8879860 to your computer and use it in GitHub Desktop.
Save pombredanne/18d69ff95a44ffcab04af2f9d8879860 to your computer and use it in GitHub Desktop.
#
# Copyright (c) nexB Inc. and others.
# http://nexb.com and https://github.com/nexB/debut/
# SPDX-License-Identifier: Apache-2.0
from collections import defaultdict
import gzip
"""
Utilities to parse a Debian Contents index file.
These index files are used by apt-file, for instance.
See https://wiki.debian.org/DebianRepository/Format#A.22Contents.22_indices
for format details.
For example files see:
- http://ftp.de.debian.org/debian/dists/Debian10.6/main/Contents-amd64.gz
- http://archive.ubuntu.com/ubuntu/dists/focal/Contents-i386.gz
See also https://salsa.debian.org/debian-irc-team/judd/-/blob/master/supybot/plugins/Judd/debcontents/contents_file.py
"""
def parse_contents(location, has_header=True):
    """
    Return a mapping of {path: [list of packages]} and a mapping of
    {package: [list of paths]} from parsing a Debian Contents file at
    ``location``.

    ``location`` is the path to a gzip-compressed, UTF-8 encoded Contents
    index.

    If ``has_header`` is True, the file is expected to have a header narrative
    and a FILE/LOCATION columns header before the table starts in earnest:
    every line up to and including that header row is skipped. If False, the
    table is expected to start on the first line.

    Each table row is a path followed by whitespace and a comma-separated list
    of qualified package names. The path may itself contain whitespace: only
    the last whitespace-separated column is treated as the package list.
    Lines that do not have these two columns (such as blank lines) are
    ignored, and so are empty package names.

    Raise a ValueError if ``has_header`` is True and no FILE/LOCATION header
    row is found.

    See https://wiki.debian.org/DebianRepository/Format#A.22Contents.22_indices
    for format details.
    """
    packages_by_path = defaultdict(list)
    paths_by_package = defaultdict(list)

    # Keep track if we are now in the table proper, e.g. after the
    # FILE LOCATION header (this is the case for Ubuntu). If we have no
    # header (like in Debian) we start right away.
    in_table = not has_header

    # Decode once at the I/O boundary. newline='\n' keeps the line splitting
    # on "\n" only, as when iterating over the raw bytes.
    with gzip.open(location, 'rt', encoding='utf-8', newline='\n') as lines:
        for line in lines:
            # Split on the last run of whitespace (spaces or tabs): package
            # names cannot contain whitespace, but paths can.
            columns = line.strip().rsplit(None, 1)
            if len(columns) != 2:
                # Blank or single-column line: not a valid table row.
                continue
            path, packages = columns

            if not in_table:
                # "The first row of the table SHOULD have the columns "FILE"
                # and "LOCATION"". This is the spec and used to be true for
                # Debian. But nowadays only Ubuntu does this.
                if path == 'FILE' and packages == 'LOCATION':
                    in_table = True
                continue

            for qualified_name in packages.split(','):
                # "A list of qualified package names, separated by comma. A
                # qualified package name has the form
                # [[$AREA/]$SECTION/]$NAME, where $AREA is the archive area,
                # $SECTION the package section, and $NAME the name of the
                # package."
                # NOTE: we ignore the area and section for now
                _, _, package_name = qualified_name.rpartition('/')
                if not package_name:
                    # e.g. a stray trailing comma in the packages list
                    continue
                packages_by_path[path].append(package_name)
                paths_by_package[package_name].append(path)

    if not in_table:
        raise ValueError('Invalid Content files without FILE/LOCATION header.')
    return packages_by_path, paths_by_package
if __name__ == '__main__':
    # Command line entry point: parse the gzipped Contents index given as the
    # first argument and print timing and count statistics.
    import sys
    import time

    # Keep the ``try`` limited to reading the argument so that the usage help
    # is only shown for a missing argument and not for unrelated parsing
    # errors. The error is still re-raised after printing the help.
    try:
        location = sys.argv[1]
    except IndexError:
        print('Parse a Debian Contents files and print stats.')
        print('Usage: contents <path to a Gzipped Debian Contents index>')
        print('For example, download this file: http://ftp.de.debian.org/debian/dists/Debian10.6/main/Contents-amd64.gz')
        raise

    # perf_counter() is a monotonic clock meant for measuring elapsed time.
    start = time.perf_counter()
    # NOTE: has_header=False parses from the first line on; a file that starts
    # with a narrative and a FILE/LOCATION header needs has_header=True.
    packages_by_path, paths_by_package = parse_contents(location, has_header=False)
    duration = time.perf_counter() - start
    print(f'Parsing completed in {duration} seconds.')

    names_count = len(paths_by_package)
    paths_count = len(packages_by_path)
    print(f'Found {names_count} package names with {paths_count} paths.')
@pombredanne
Copy link
Author

And some stats:


$ wget http://archive.ubuntu.com/ubuntu/dists/focal/Contents-i386.gz
$ python contents.py Contents-i386.gz 
Parsing completed in 17.560503482818604 seconds.
Found 34235 package names with 5485301 paths.

$ wget http://ftp.de.debian.org/debian/dists/Debian10.6/main/Contents-amd64.gz
$ python contents.py Contents-amd64.gz 
Parsing completed in 21.503568410873413 seconds.
Found 56882 package names with 6091815 paths.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment