Last active
January 10, 2022 20:13
-
-
Save jspeed-meyers/32b1fccd7a55046c12f2ebc8b32b2a33 to your computer and use it in GitHub Desktop.
Scrape GitHub links from Quantum Open Source Fund (qosf) projects page
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrape quantum open source fund package links | |
Identify and store in CSV all GitHub links associated with quantum open source | |
fund projects. Projects without a GitHub link will not be included. | |
NOTE: User has to do a little manual cleaning after running this script. | |
""" | |
import csv | |
from bs4 import BeautifulSoup | |
import requests | |
URL = "https://qosf.org/project_list/"
# Seconds to wait on the server before giving up. requests applies no timeout
# by default, so without this a stalled connection would hang the script.
REQUEST_TIMEOUT_SECONDS = 30

if __name__ == "__main__":
    # Retrieve the project list page. Fail loudly on an HTTP error status so
    # that an error page is never parsed as if it were the project list.
    page = requests.get(URL, timeout=REQUEST_TIMEOUT_SECONDS)
    page.raise_for_status()

    # Convert the HTML to a BeautifulSoup object to make navigation easy.
    soup = BeautifulSoup(page.content, "html.parser")

    # Identify every element whose href contains "github.com" (this excludes
    # other GitHub domains such as github.io).
    pkgs = soup.find_all(
        lambda tag: "href" in tag.attrs and "github.com" in tag["href"]
    )

    # Name of the CSV file that receives the exported results.
    OUTPUT_CSV = "qosf_github_links.csv"

    # newline="" lets the csv module control line endings itself.
    with open(OUTPUT_CSV, "w", encoding="utf-8", newline="") as csvfile:
        fieldnames = ["package_name", "github_link"]
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()

        # Write each link only once, keeping the first occurrence in page
        # order. A set gives constant-time membership checks.
        seen_links = set()
        for pkg in pkgs:
            link = pkg["href"]
            if link in seen_links:
                continue
            seen_links.add(link)

            # Tag.string is None when the anchor wraps nested markup with
            # several children; fall back to the combined text so the
            # package name column is not left blank in that case.
            package_name = pkg.string
            if package_name is None:
                package_name = pkg.get_text(" ", strip=True)

            writer.writerow({"package_name": package_name, "github_link": link})
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Minor edits: (1) make github link list unique and (2) only include links that use github.com rather than other github domains, e.g. github.io