Created
July 11, 2021 14:32
-
-
Save jspeed-meyers/4675e3c84ca5a93ec99473d6a7587320 to your computer and use it in GitHub Desktop.
Scrape anaconda package lists.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape anaconda package names and corresponding github links. | |
User provides a URL to the anaconda page that contains the package information | |
for one particular python version, e.g. 3.9, and for one particular platform, | |
e.g. linux 64. This program then extracts all the package names and associated | |
links (provided by Anaconda) for each package. This data is then exported | |
to a csv. | |
""" | |
import csv | |
from bs4 import BeautifulSoup | |
import requests | |
URL = "https://docs.anaconda.com/anaconda/packages/py3.9_linux-64/" | |
if __name__ == "__main__": | |
# retrieve specified URL | |
page = requests.get(URL) | |
# convert html to BeautifulSoup object to make navigation easy | |
soup = BeautifulSoup(page.content, "html.parser") | |
# filter down to the package table | |
table = soup.find(lambda tag: tag.name == "table") | |
# identify all table cells with a link | |
pkgs = table.find_all(lambda tag: "href" in tag.attrs) | |
# create name for csv file that contains the exported results | |
python_version_and_platform = URL.split("/")[-2] | |
output_name = "packages_name_links_" + python_version_and_platform + ".csv" | |
# write results to csv file | |
with open(output_name, "w", newline="") as csvfile: | |
# create csv file | |
fieldnames = ["package_name", "anaconda_link"] | |
writer = csv.DictWriter(csvfile, fieldnames=fieldnames) | |
# create header row of csv file | |
writer.writerow( | |
{"package_name": "package_name", "anaconda_link": "anaconda_link"} | |
) | |
# loop through each package, writing out the package name and the URL link | |
# provided by anaconda. | |
for pkg in pkgs: | |
writer.writerow({"package_name": pkg.string, "anaconda_link": pkg["href"]}) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment