Skip to content

Instantly share code, notes, and snippets.

@Scub3d
Created March 8, 2018 01:59
Show Gist options
  • Save Scub3d/f565f893eaa582e29ad205cdb2a37c6e to your computer and use it in GitHub Desktop.
Save Scub3d/f565f893eaa582e29ad205cdb2a37c6e to your computer and use it in GitHub Desktop.
mlh.io hackathon scraper
import sys, os, uuid
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
from dateutil.parser import parse
def decodeEmail(e):
    """Decode an XOR-obfuscated hex string such as a ``data-cfemail`` value.

    The first two hex digits of ``e`` are the XOR key. Every following pair
    of hex digits encodes one character, recovered by XOR-ing the pair's
    value with the key.

    Args:
        e: Hex-encoded string; the first byte is the key, the rest is payload.

    Returns:
        The decoded text. A string holding only the key decodes to ``""``.

    Raises:
        ValueError: If the key or any payload pair cannot be parsed as
            hexadecimal (this includes an empty ``e``).
    """
    k = int(e[:2], 16)
    # The stop bound of len(e) - 1 means a dangling, unpaired trailing digit
    # is ignored rather than decoded as a half-byte.
    return "".join(
        chr(int(e[i:i + 2], 16) ^ k) for i in range(2, len(e) - 1, 2)
    )
def getHackathons():
    """Scrape the mlh.io season pages and collect one record per hackathon.

    Each season URL is downloaded and parsed with BeautifulSoup. Every
    ``<h3 itemprop="name">`` element is treated as one hackathon, and the
    remaining fields are read from elements located relative to that heading.
    Each record is printed as it is built (as before) and is now also
    collected into the returned list.

    Returns:
        list: One dict per hackathon with the keys ``location``, ``name``,
        ``dateString``, ``date`` (parsed start date), ``url``, ``id`` (a fresh
        random UUID string), ``logoURL`` and ``splashURL``.

    Raises:
        urllib.error.URLError: If a season page cannot be fetched.
        ValueError: If a date string cannot be parsed, or an obfuscated name
            is not valid hex.
    """
    urls = [
        "http://mlh.io/seasons/na-2018/events",
        "http://mlh.io/seasons/na-2017/events",
        "http://mlh.io/seasons/eu-2018/events",
        "http://mlh.io/seasons/eu-2017/events",
        "http://mlh.io/seasons/s2016/events",
        "http://mlh.io/seasons/s2015/events",
        "http://mlh.io/seasons/s2014/events",
        "http://mlh.io/seasons/f2013/events",
        "http://mlh.io/seasons/f2014/events",
        "http://mlh.io/seasons/f2015/events",
    ]
    years = ["2018", "2017", "2016", "2015", "2014", "2013"]
    # Only needed for the most recent seasons, which MLH labels a year ahead:
    # an Aug-Dec event listed under one of these seasons is dated to the
    # previous calendar year.
    earlyMonths = {"Aug", "Sep", "Oct", "Nov", "Dec"}
    newYears = {"2018", "2017"}

    hackathons = []
    for url in urls:
        req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
        # Close the HTTP response deterministically instead of leaking it.
        with urlopen(req) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'html.parser')

        # The season year depends only on the URL, so resolve it once per
        # page: it is the first known year that appears in the season slug.
        season = url.split("seasons/")[1].split("/events")[0]
        seasonYear = next((year for year in years if year in season), None)

        for hackathonName in soup.find_all("h3", attrs={'itemprop': 'name'}):
            container = hackathonName.parent.parent
            # First child of the next <p>; presumably the date text without
            # a year (e.g. "Sep 8th - 10th") -- confirm against the live page.
            dateText = container.find_next("p").contents[0]

            if seasonYear is None:
                hackathonYear = "N/A"
            elif seasonYear in newYears and dateText.split(" ")[0] in earlyMonths:
                hackathonYear = str(int(seasonYear) - 1)
            else:
                hackathonYear = seasonYear

            # The next two <span> elements are joined as "first, second";
            # presumably city and state/country -- confirm against the page.
            firstSpan = container.find_next("span")
            secondSpan = firstSpan.find_next("span")

            hackathon = {}
            hackathon["location"] = firstSpan.contents[0] + ", " + secondSpan.contents[0]

            # get_text() always returns a str, whereas .string is None when
            # the heading has more than one child, which made the membership
            # test raise TypeError.
            label = hackathonName.get_text()
            obfuscated = hackathonName.find("span", attrs={"data-cfemail": True})
            if "protected" in label and "email" in label and obfuscated is not None:
                hackathon["name"] = decodeEmail(obfuscated["data-cfemail"])
            else:
                hackathon["name"] = label

            hackathon["dateString"] = dateText + ", " + hackathonYear
            # For a range such as "Sep 8th - 10th" only the start day is
            # parsed; without a "-" the whole date text is used.
            startText = dateText.split("-")[0].strip()
            hackathon["date"] = parse(startText + ", " + hackathonYear)

            hackathon["url"] = hackathonName.find_previous("a", href=True)["href"]
            hackathon["id"] = str(uuid.uuid4())
            # The nearest preceding <img> is used as the logo and the one
            # before it as the splash image.
            logo = hackathonName.find_previous("img")
            hackathon["logoURL"] = logo["src"]
            hackathon["splashURL"] = logo.find_previous("img")["src"]

            print(hackathon)
            hackathons.append(hackathon)

    return hackathons
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    getHackathons()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment