Last active
March 21, 2016 12:49
-
-
Save williballenthin/235050aa55691f01a2f3 to your computer and use it in GitHub Desktop.
Fetch BlackHat, Defcon, and BsidesLV schedules and create a consolidated list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
requirements: | |
- requests | |
- unicodecsv | |
- beautifulsoup4 | |
""" | |
import re
import sys
import functools
from collections import namedtuple
from collections import defaultdict

import requests
import unicodecsv
from bs4 import BeautifulSoup
# --- DEF CON 23: schedule page URLs and the HTML markers used to walk them ---
DC_BASE_URL = "https://www.defcon.org/html/defcon-23/"
DC_URL = "https://www.defcon.org/html/defcon-23/dc-23-schedule.html"
DCL_URL = "https://www.defcon.org/html/defcon-23/dc-23-demo-labs-schedule.html"
DC_DAY_HEADING = "h2"      # one <h2 id="Thursday|Friday|..."> per day
DC_TIME_HEADING = "h3"     # one <h3 class="time"> per time slot
DC_ROW_HEADING = "ul"      # each slot's talks live in <ul class="scheduleRow">
DC_SCHEDULE_CLASS = "scheduleRow"
# --- Black Hat US 2015: CSS classes on the briefings/arsenal schedule pages ---
BH_BASE_URL = "https://www.blackhat.com/us-15/schedule/"
BH_TIME_CLASS = "schedule-time"
BH_TRACKS_CLASS = "schedule-tracks"
BH_TRACK_CLASS = "schedule-track-col"
BH_TITLE_CLASS = "schedule-briefing-title"
BH_PRESENTER_CLASS = "schedule-briefing-speaker"
BH_VENUE_CLASS = "schedule-briefing-room"
# --- BSides LV 2015 (sched.org printable view): table cell classes ---
BS_BASE_URL = "https://bsideslv2015.sched.org"
BS_TIME_CLASS = "time"
BS_TITLE_CLASS = "title"
BS_VENUE_CLASS = "venue"
BS_SPEAKER_CLASS = "sched-role-list"
# One talk: conference tag (e.g. "DC", "BH", "BS"), day name, start time,
# track/venue name, title, presenter(s), and detail-page URL (may be "").
Presentation = namedtuple("Presentation",
        ["conference", "day", "time", "track", "title", "presenter", "url"])
def dc_parse_row(tag, heading, day, time, url_base):
    """Parse one DEF CON schedule row (a scheduleRow <ul>) into Presentations.

    tag: conference label for the output rows (e.g. "DC", "DC-L").
    heading: the <ul class="scheduleRow"> tag; each <li> is one track cell.
    day, time: strings already extracted from the enclosing headings.
    url_base: prefix joined onto relative talk links.
    Returns a list of Presentation tuples.
    """
    results = []
    for track in heading.find_all("li"):
        ps = track.find_all("p")
        if len(ps) < 2:
            # Not a talk cell: needs at least a title <p> and a presenter <p>.
            continue
        track_name = track.h4.text
        title = ps[0].text
        presenter = ps[1].text
        # Reuse the anchors already found instead of querying the tree again.
        anchors = track.find_all("a")
        url = url_base + anchors[0].attrs["href"] if anchors else ""
        results.append(Presentation(tag, day, time, track_name, title,
                                    presenter, url))
    return results
def dc_parse_time(tag, heading, day, time, url_base):
    """Collect Presentations from every schedule row following a time heading.

    Walks siblings of the <h3> time heading until the next day or time
    heading, parsing each <ul class="scheduleRow"> encountered.
    """
    stop_names = {DC_DAY_HEADING, DC_TIME_HEADING}
    presentations = []
    for node in heading.next_siblings:
        if node.name in stop_names:
            break
        is_schedule_row = (node.name == DC_ROW_HEADING
                           and DC_SCHEDULE_CLASS in node.attrs.get("class", []))
        if is_schedule_row:
            presentations.extend(dc_parse_row(tag, node, day, time, url_base))
    return presentations
def dc_parse_day(tag, heading, day, url_base):
    """Collect Presentations for every time slot under one day heading.

    Walks siblings of the day <h2> until the next day heading, dispatching
    each <h3 class="time"> slot heading to dc_parse_time().
    """
    collected = []
    for node in heading.next_siblings:
        if node.name == DC_DAY_HEADING:
            break
        if node.name != DC_TIME_HEADING:
            continue
        if "time" not in node.attrs.get("class", []):
            continue
        # Heading text looks like "10:00 - 10:45"; keep only the start time.
        start_time = node.text.partition("-")[0]
        collected.extend(dc_parse_time(tag, node, day, start_time, url_base))
    return collected
def get_dc(tag, url, url_base):
    """Fetch a DEF CON schedule page and parse every recognized day section.

    Only <h2> headings whose id is an expected day name are parsed; other
    <h2>s on the page are ignored.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    known_days = {"Thursday", "Friday", "Saturday", "Sunday"}
    talks = []
    for day_heading in soup.find_all(DC_DAY_HEADING):
        day = day_heading.attrs.get("id")
        if day in known_days:
            talks.extend(dc_parse_day(tag, day_heading, day, url_base))
    return talks
def bh_parse_track(tag, heading, day, time):
    """Parse every Black Hat track column between a time heading and the next.

    Walks siblings of the schedule-time div until the next schedule-time div,
    collecting one Presentation per track column that contains at least two
    anchors (title link plus at least one speaker link).
    """
    results = []
    for sibling in heading.next_siblings:
        if not hasattr(sibling, "attrs"):
            continue  # skip NavigableStrings between tags
        classes = sibling.attrs.get("class", [])
        if BH_TIME_CLASS in classes:
            break  # reached the next time slot
        if BH_TRACKS_CLASS not in classes:
            continue
        for track in sibling.children:
            if not hasattr(track, "find_all"):
                continue
            if len(track.find_all("a")) < 2:
                # Needs at least the title anchor and one speaker anchor.
                continue
            url = BH_BASE_URL + track.a.attrs["href"]
            title = track.find_all("a", class_=BH_TITLE_CLASS)[0].text
            presenters = [a.text
                          for a in track.find_all("a", class_=BH_PRESENTER_CLASS)]
            # First venue span is the room/track name; the second span (the
            # slot time) was previously read into an unused local — dropped.
            track_name = track.find_all("span", class_=BH_VENUE_CLASS)[0].text
            results.append(Presentation(tag, day, time, track_name, title,
                                        ", ".join(presenters), url))
    return results
def bh_parse_time(tag, heading, day, time):
    """Return the Presentations for one Black Hat time slot.

    bh_parse_track() itself walks *every* schedule-tracks container between
    `heading` and the next schedule-time div, so it must be invoked at most
    once per slot. The previous version called it once per tracks container
    found, which duplicated every talk whenever a slot had more than one
    container; now we call it a single time as soon as a container is seen.
    """
    for sibling in heading.next_siblings:
        if not hasattr(sibling, "attrs"):
            continue
        classes = sibling.attrs.get("class", [])
        if BH_TIME_CLASS in classes:
            break  # next slot reached without finding any tracks
        if BH_TRACKS_CLASS in classes:
            return bh_parse_track(tag, heading, day, time)
    return []
def get_bh(tag, day, url):
    """Fetch one Black Hat schedule page and parse all of its time slots.

    tag: conference label ("BH" briefings, "BH-A" arsenal).
    day: day name applied to every talk on the page.
    """
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    talks = []
    for time_div in soup.find_all("div", class_=BH_TIME_CLASS):
        talks.extend(bh_parse_time(tag, time_div, day, time_div.text))
    return talks
def bsides_parse_row(heading, day):
    """Parse one BSides schedule table row into a 0- or 1-element list.

    Rows without an <em> element are headers/filler and yield []. The start
    time is the leading digits/colons of the time cell.
    """
    if not heading.find_all("em"):
        return []
    raw_time = heading.find_all("td", class_=BS_TIME_CLASS)[0].text.strip()
    time_match = re.match(r"[0-9:]+", raw_time)
    if time_match is None:
        # Defensive: a speaker row whose time cell doesn't start with a
        # time previously raised AttributeError on .group(0).
        return []
    time = time_match.group(0)
    title = heading.find_all("td", class_=BS_TITLE_CLASS)[0].text.partition("\r")[0].strip()
    venue = heading.find_all("div", class_=BS_VENUE_CLASS)[0].text.strip()
    presenter = heading.find_all("em", class_=BS_SPEAKER_CLASS)[0].text.strip().replace("Speakers: ", "")
    url = BS_BASE_URL + heading.find_all("a")[0].attrs["href"]
    return [Presentation("BS", day, time, venue, title, presenter, url)]
def get_bsides():
    """Fetch the BSides LV printable schedule and parse every talk row.

    The print view is one big table: day-separator rows contain an <h2>
    and update the running day; all other rows are handed to
    bsides_parse_row() under that day.
    """
    page = requests.get("https://bsideslv2015.sched.org/print?iframe=no&w=i:100;&sidebar=no&bg=dark")
    soup = BeautifulSoup(page.text, "html.parser")
    # The first <h2> on the page names the first day.
    current_day = soup.h2.text
    # sorry: hack — normalize the full heading text to just the day name.
    if "Tuesday" in current_day:
        current_day = "Tuesday"
    talks = []
    for row in soup.table.find_all("tr"):
        if len(row.find_all("h2")) > 0:
            # Day-separator row: update the running day and skip it.
            current_day = row.text
            # sorry: hack
            if "Wednesday" in current_day:
                current_day = "Wednesday"
            continue
        talks.extend(bsides_parse_row(row, current_day))
    return talks
def main():
    """Fetch BlackHat, DEF CON, and BSides LV schedules and print a
    consolidated, pipe-delimited CSV to stdout, grouped by day then by
    start time.

    Requires `import sys` at module scope (the original only imported it
    inside the __main__ guard, so main() broke when imported elsewhere).
    """
    results = []
    results.extend(get_dc("DC", DC_URL, DC_URL))
    results.extend(get_dc("DC-L", DCL_URL, DCL_URL))
    results.extend(get_bh("BH", "Wednesday", "https://www.blackhat.com/us-15/schedule/briefings-5.html"))
    results.extend(get_bh("BH", "Thursday", "https://www.blackhat.com/us-15/schedule/briefings-6.html"))
    results.extend(get_bh("BH-A", "Wednesday", "https://www.blackhat.com/us-15/schedule/arsenal-5.html"))
    results.extend(get_bh("BH-A", "Thursday", "https://www.blackhat.com/us-15/schedule/arsenal-6.html"))
    results.extend(get_bsides())
    # Mapping[day, Mapping[time, Set[Presentation]]] — a set (the old comment
    # said List) so identical rows scraped twice are deduplicated.
    days = defaultdict(functools.partial(defaultdict, set))
    for result in results:
        days[result.day][result.time].add(result)
    cwriter = unicodecsv.writer(sys.stdout, delimiter='|',
                                quoting=unicodecsv.QUOTE_MINIMAL,
                                encoding="utf-8")
    for dayname in ["Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]:
        times = days[dayname]
        for timename in sorted(times.keys()):
            # Sort the set so output order is deterministic run-to-run
            # (namedtuples compare field-by-field).
            for presentation in sorted(times[timename]):
                cwriter.writerow([
                    dayname,
                    timename,
                    presentation.conference,
                    presentation.track,
                    presentation.title,
                    presentation.presenter,
                    presentation.url])
if __name__ == "__main__":
    import sys  # main() writes to sys.stdout
    # main() takes no parameters; the old `main(*sys.argv[1:])` raised
    # TypeError whenever any command-line argument was supplied.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment