Last active
April 25, 2025 17:12
-
-
Save pombredanne/18d69ff95a44ffcab04af2f9d8879860 to your computer and use it in GitHub Desktop.
Parse Debian Contents indices as specced at https://wiki.debian.org/DebianRepository/Format#A.22Contents.22_indices
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Copyright (c) nexB Inc. and others.
# http://nexb.com and https://github.com/nexB/debut/
# SPDX-License-Identifier: Apache-2.0
from collections import defaultdict
import gzip
"""
Utilities to parse a Debian Contents index file.
These are used by apt-file for instance
See https://wiki.debian.org/DebianRepository/Format#A.22Contents.22_indices
for format details.
For example files see:
- http://ftp.de.debian.org/debian/dists/Debian10.6/main/Contents-amd64.gz
- http://archive.ubuntu.com/ubuntu/dists/focal/Contents-i386.gz
See also https://salsa.debian.org/debian-irc-team/judd/-/blob/master/supybot/plugins/Judd/debcontents/contents_file.py
"""
def parse_contents(location, has_header=True):
    """
    Return a two-tuple ``(packages_by_path, paths_by_package)`` where
    ``packages_by_path`` is a mapping of {path: [list of package names]} and
    ``paths_by_package`` is a mapping of {package name: [list of paths]},
    obtained by parsing the Gzipped Debian Contents index file at ``location``.

    If ``has_header`` is True, the file is expected to have a header narrative
    and a "FILE LOCATION" columns header line before the table starts in
    earnest (this is the case for Ubuntu indices). Raise a ValueError if
    ``has_header`` is True and no such columns header line is found.

    See https://wiki.debian.org/DebianRepository/Format#A.22Contents.22_indices
    for format details.
    """
    packages_by_path = defaultdict(list)
    paths_by_package = defaultdict(list)
    with gzip.GzipFile(location) as lines:
        # ``in_table`` tracks whether we are past the header and in the table
        # proper (i.e. after the "FILE LOCATION" line). Without a header
        # (like in Debian nowadays) we start in the table right away.
        in_table = not has_header
        for line in lines:
            line = line.decode('utf-8').strip()
            if not line:
                # skip blank lines: they carry no path/packages data
                continue
            # A path may contain spaces; the packages list is the last
            # whitespace-separated field, so split on the last space only.
            left, _, right = line.rpartition(' ')
            path = left.strip()
            packages = right.strip()
            if not in_table:
                # "The first row of the table SHOULD have the columns "FILE"
                # and "LOCATION"". This is the spec and used to be true for
                # Debian. But nowadays only Ubuntu does this.
                if path == 'FILE' and packages == 'LOCATION':
                    in_table = True
                continue
            # "A list of qualified package names, separated by comma. A
            # qualified package name has the form [[$AREA/]$SECTION/]$NAME,
            # where $AREA is the archive area, $SECTION the package section,
            # and $NAME the name of the package."
            # NOTE: we ignore the area and section for now and keep only the
            # trailing $NAME segment.
            for qualified_name in packages.split(','):
                _, _, package_name = qualified_name.rpartition('/')
                packages_by_path[path].append(package_name)
                paths_by_package[package_name].append(path)
    if not in_table:
        # ValueError is a subclass of Exception, so callers catching the
        # previously-raised bare Exception still work.
        raise ValueError('Invalid Contents file without FILE/LOCATION header.')
    return packages_by_path, paths_by_package
if __name__ == '__main__':
    # Simple CLI driver: parse the Contents index given as the first command
    # line argument, then report timing and basic stats. On any failure,
    # print usage help before re-raising the original error.
    import sys
    import time

    try:
        contents_location = sys.argv[1]
        started = time.time()
        packages_by_path, paths_by_package = parse_contents(
            contents_location, has_header=False)
        duration = time.time() - started
        print(f'Parsing completed in {duration} seconds.')
        print(
            f'Found {len(paths_by_package)} package names '
            f'with {len(packages_by_path)} paths.'
        )
    except Exception:
        for usage_line in (
            'Parse a Debian Contents files and print stats.',
            'Usage: contents <path to a Gzipped Debian Contents index>',
            'For example, download this file: http://ftp.de.debian.org/debian/dists/Debian10.6/main/Contents-amd64.gz',
        ):
            print(usage_line)
        raise
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
And some stats: