Created
February 27, 2024 12:59
-
-
Save ThePyProgrammer/c69bcca827c9509486256b081090abc3 to your computer and use it in GitHub Desktop.
Track down all Devpost Hackathon Projects via Participant List (when project gallery isn't released)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time | |
from selenium import webdriver | |
from selenium.webdriver.chrome.service import Service as ChromeService | |
from bs4 import BeautifulSoup | |
from tqdm import tqdm | |
import pandas as pd | |
# Launch a Chrome browser session through Selenium.
driver = webdriver.Chrome()

# Point the browser at the hackathon's participant listing.
HACKATHON_URL = "https://hack4good-2024.devpost.com"  # placeholder hackathon; swap in your own
URL = f"{HACKATHON_URL}/participants"
driver.get(URL)

# The participant list lazy-loads as you scroll, so repeatedly jump to the
# bottom of the page to force more entries to render. Raise or lower the
# pass count until the full list is loaded for your hackathon's size.
scroll_count = 15
for _scroll_pass in tqdm(range(scroll_count)):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the newly revealed batch of participants time to load
# Parse the fully-scrolled participants page and pull out every participant card.
soup = BeautifulSoup(driver.page_source, 'html.parser')
search_results = soup.find_all('div', class_='participant')

# Build a list of {name, url} dicts for each participant who has at least
# one project attached to their profile.
users = []
for res in search_results:
    # The submission count lives in a <strong> inside the
    # "participant-software-count" element; guard both lookups so one
    # malformed card cannot crash the whole crawl.
    count_tag = res.find(class_="participant-software-count")
    if count_tag is None or count_tag.strong is None:
        continue  # card without a software count — nothing to follow
    project_count = int(count_tag.strong.get_text())
    name = res.h5.get_text() if res.h5 is not None else "<unnamed participant>"
    if project_count == 0:
        print(name, "had no projects, skipping")
        continue
    try:
        # First anchor on the card is the participant's profile link.
        url = res.a.attrs['href']
        users.append({"name": name, "url": url})
    except (AttributeError, KeyError) as e:
        # Best effort: a card without a usable profile link is reported and skipped.
        print(name, e)
# Visit each participant's profile and harvest the URL of every project in
# their gallery. A set de-duplicates projects shared by multiple teammates
# who all appear in the participant list.
project_set = set()
for user in tqdm(users):
    driver.get(user["url"])
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for project in soup.find_all("div", class_="gallery-item"):
        link = project.a
        # Skip gallery tiles without an anchor instead of crashing on them.
        if link is not None and "href" in link.attrs:
            project_set.add(link.attrs["href"])
    time.sleep(0.5)  # throttle requests to be polite to Devpost
# De-duplicated list of candidate project URLs gathered from the profiles.
project_urls = list(project_set)

# Visit each project page and keep only the projects submitted to THIS
# hackathon — a participant's gallery lists all of their Devpost projects,
# not just the ones entered here.
projects = []
for i, url in enumerate(tqdm(project_urls)):
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # A submitted project's page links back to the hackathon it was entered in.
    if soup.find("a", href=f"{HACKATHON_URL}/") is not None:
        title_tag = soup.find("h1", id="app-title")
        desc_tag = soup.find("p", class_="large")
        if title_tag is None or desc_tag is None:
            # Malformed or unexpected page layout: report and move on rather
            # than letting one page abort the whole crawl.
            print(i, "skipping malformed project page:", url)
        else:
            title = title_tag.get_text()
            description = desc_tag.get_text().strip()
            projects.append({
                "title": title,
                "description": description,
                "url": url,
            })
            print(i, title, url)
    time.sleep(0.5)  # throttle requests between page loads

# Stores all the data to an excel file for easy access :)
pd.DataFrame(projects).to_excel("projects.xlsx", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment