Created
November 17, 2021 08:24
-
-
Save agateau/07c68d35a20ef08a3719ac4af1321708 to your computer and use it in GitHub Desktop.
Scrape packages.ubuntu.com to list packages containing a given file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Scrape packages.ubuntu.com to list packages containing a given file. | |
""" | |
import argparse | |
import subprocess | |
import sys | |
import urllib.request | |
try:
    from bs4 import BeautifulSoup
except ImportError:
    # sys.exit() with a string argument writes the message to stderr and exits
    # with status 1, so the hint does not get mixed into the regular output.
    sys.exit("ERROR: Install beautifulsoup4 with `pip install beautifulsoup4`.")

# Search URL template; the ``suite`` and ``keywords`` placeholders are filled
# in with the ``%`` operator and a mapping.
PKG_URL = (
    "http://packages.ubuntu.com/search?lang=en&suite=%(suite)s&arch=any"
    "&mode=exactfilename&searchon=contents&keywords=%(keywords)s"
)
def parse_html(html):
    """Print a "path: package" line for every result row found in *html*.

    The first ``<tr>`` of the document is treated as the table header and
    skipped. For each remaining row, the text of the first cell is used as the
    path and the stripped text of the second cell as the package name(s).
    Rows with fewer than two ``<td>`` cells are ignored.

    Args:
        html: Markup of the search result page, as a string.

    Raises:
        Exception: Whatever BeautifulSoup raised while parsing; the offending
            markup is printed to stdout first to ease debugging.
    """
    try:
        soup = BeautifulSoup(html, features="html.parser")
    except Exception:
        print("*" * 40)
        print("Failed to parse this HTML:")
        print(html)
        print("*" * 40)
        # Bare raise re-raises the active exception with its traceback intact.
        raise

    for row in soup.find_all("tr")[1:]:
        # Select the cells by tag rather than by position in ``row.contents``:
        # positional indexing only works when whitespace text nodes happen to
        # sit between the tags.
        cells = row.find_all("td", recursive=False)
        if len(cells) < 2:
            continue
        path = cells[0].get_text()
        pkg = cells[1].get_text().strip()
        print("{}: {}".format(path, pkg))
def find_current_distro():
    """Return the stripped standard output of ``lsb_release -c -s``.

    The exit status of the command is not checked, so the returned string may
    be empty if the command printed nothing on stdout.
    """
    command = ["lsb_release", "-c", "-s"]
    completed = subprocess.run(command, capture_output=True, text=True)
    codename = completed.stdout.strip()
    return codename
def main():
    """Parse the command line, fetch the search results and print the matches.

    The suite to search is taken from ``-d/--distro`` when given, otherwise
    from ``find_current_distro()``. The resulting page is downloaded and handed
    to ``parse_html()``, which prints the results.
    """
    # Function-scope import keeps this block self-contained without touching
    # the module's import section.
    from urllib.parse import quote

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "-d", "--distro", metavar="DISTRO",
        help="Search for packages in DISTRO instead of the default one")
    parser.add_argument("filename")
    args = parser.parse_args()

    distro = find_current_distro() if args.distro is None else args.distro

    # Percent-encode the user-supplied values: characters such as "+", "&",
    # "#" or spaces would otherwise change the meaning of the query string or
    # make the URL invalid. quote() leaves "/" untouched, so ordinary paths are
    # sent exactly as before.
    url = PKG_URL % dict(suite=quote(distro), keywords=quote(args.filename))

    with urllib.request.urlopen(url) as response:
        # Honour the charset declared by the server, defaulting to UTF-8.
        charset = response.headers.get_content_charset() or "utf-8"
        html = response.read().decode(charset)
    parse_html(html)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment