Skip to content

Instantly share code, notes, and snippets.

@agateau
Created November 17, 2021 08:24
Show Gist options
  • Save agateau/07c68d35a20ef08a3719ac4af1321708 to your computer and use it in GitHub Desktop.
Save agateau/07c68d35a20ef08a3719ac4af1321708 to your computer and use it in GitHub Desktop.
Scrape packages.ubuntu.com to list packages containing a given file
#!/usr/bin/env python3
"""
Scrape packages.ubuntu.com to list packages containing a given file.
"""
import argparse
import subprocess
import sys
import urllib.parse
import urllib.request

try:
    from bs4 import BeautifulSoup
except ImportError:
    print("ERROR: Install beautifulsoup4 with `pip install beautifulsoup4`.")
    sys.exit(1)
# Query template for the packages.ubuntu.com contents search.  The
# %(suite)s and %(keywords)s placeholders are filled in with the % operator.
PKG_URL = (
    "http://packages.ubuntu.com/search"
    "?lang=en"
    "&suite=%(suite)s"
    "&arch=any"
    "&mode=exactfilename"
    "&searchon=contents"
    "&keywords=%(keywords)s"
)
def parse_html(html):
    """Print one ``path: package`` line per result row found in *html*.

    The first ``<tr>`` in the document is treated as the table header and
    skipped.  For each following row, the text of the first ``<td>`` is
    printed as the path and the stripped text of the second ``<td>`` as the
    package list.  Rows with fewer than two ``<td>`` cells are ignored.

    Args:
        html: Markup of a search result page, as a string.

    Raises:
        Exception: whatever ``BeautifulSoup`` raised while parsing; the
            offending markup is printed to stdout before re-raising.
    """
    try:
        soup = BeautifulSoup(html, features="html.parser")
    except Exception:
        # Dump the markup so a parser failure can be diagnosed, then let the
        # original exception propagate unchanged.
        banner = "*" * 40
        print(banner)
        print("Failed to parse this HTML:")
        print(html)
        print(banner)
        raise

    for row in soup.find_all("tr")[1:]:
        # Select the <td> children explicitly instead of indexing
        # row.contents: positions in .contents shift with the whitespace
        # text nodes between cells, and a short row would raise IndexError.
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue
        path = cells[0].get_text()
        pkg = cells[1].get_text().strip()
        print("{}: {}".format(path, pkg))
def find_current_distro():
    """Return the codename printed by ``lsb_release -c -s``.

    Returns:
        The command's standard output with surrounding whitespace removed.

    Raises:
        subprocess.CalledProcessError: if ``lsb_release`` exits with a
            non-zero status.
        OSError: if the ``lsb_release`` executable cannot be started.
    """
    # check=True: a failed lookup must surface as an error instead of
    # silently producing an empty codename.
    result = subprocess.run(
        ["lsb_release", "-c", "-s"],
        capture_output=True,
        text=True,
        check=True,
    )
    return result.stdout.strip()
def main():
    """Parse the command line, fetch the search page and print the matches.

    The suite searched is the value of ``-d/--distro`` when given, otherwise
    the result of ``find_current_distro()``.  The page is downloaded from
    ``PKG_URL`` and handed to ``parse_html()``, which prints the results.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-d", "--distro", metavar="DISTRO",
        help="Search for packages in DISTRO instead of the default one")
    parser.add_argument("filename")
    args = parser.parse_args()

    distro = find_current_distro() if args.distro is None else args.distro

    # Percent-encode both values before placing them in the query string:
    # an unescaped "+" (as in "libstdc++.so.6") is read as a space, and
    # "&" or "#" would cut the keyword short.
    url = PKG_URL % dict(
        suite=urllib.parse.quote(distro, safe=""),
        keywords=urllib.parse.quote(args.filename, safe=""),
    )
    with urllib.request.urlopen(url) as fl:
        html = fl.read().decode("utf-8")
    parse_html(html)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment