Skip to content

Instantly share code, notes, and snippets.

@williballenthin
Last active March 21, 2016 12:49
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 3 You must be signed in to fork a gist
  • Save williballenthin/235050aa55691f01a2f3 to your computer and use it in GitHub Desktop.
Save williballenthin/235050aa55691f01a2f3 to your computer and use it in GitHub Desktop.
Fetch BlackHat, Defcon, and BsidesLV schedules and create a consolidated list
"""
requirements:
- requests
- unicodecsv
- beautifulsoup4
"""
import re
import functools
from collections import namedtuple
from collections import defaultdict
import requests
import unicodecsv
from bs4 import BeautifulSoup
DC_BASE_URL = "https://www.defcon.org/html/defcon-23/"
DC_URL = "https://www.defcon.org/html/defcon-23/dc-23-schedule.html"
DCL_URL = "https://www.defcon.org/html/defcon-23/dc-23-demo-labs-schedule.html"
DC_DAY_HEADING = "h2"
DC_TIME_HEADING = "h3"
DC_ROW_HEADING = "ul"
DC_SCHEDULE_CLASS = "scheduleRow"
BH_BASE_URL = "https://www.blackhat.com/us-15/schedule/"
BH_TIME_CLASS = "schedule-time"
BH_TRACKS_CLASS = "schedule-tracks"
BH_TRACK_CLASS = "schedule-track-col"
BH_TITLE_CLASS = "schedule-briefing-title"
BH_PRESENTER_CLASS = "schedule-briefing-speaker"
BH_VENUE_CLASS = "schedule-briefing-room"
BS_BASE_URL = "https://bsideslv2015.sched.org"
BS_TIME_CLASS = "time"
BS_TITLE_CLASS = "title"
BS_VENUE_CLASS = "venue"
BS_SPEAKER_CLASS = "sched-role-list"
Presentation = namedtuple("Presentation",
["conference", "day", "time", "track", "title", "presenter", "url"])
def dc_parse_row(tag, heading, day, time, url_base):
results = []
for track in heading.find_all("li"):
ps = track.find_all("p")
if len(ps) < 2:
continue
track_name = track.h4.text
title = ps[0].text
presenter = ps[1].text
a = track.find_all("a")
if len(a) == 0:
url = ""
else:
url = url_base + track.find_all("a")[0].attrs["href"]
p = Presentation(tag, day, time, track_name, title, presenter, url)
results.append(p)
return results
def dc_parse_time(tag, heading, day, time, url_base):
results = []
for sibling in heading.next_siblings:
if sibling.name in set([DC_DAY_HEADING, DC_TIME_HEADING]):
break
if sibling.name != DC_ROW_HEADING:
continue
if DC_SCHEDULE_CLASS not in sibling.attrs.get("class", []):
continue
results.extend(dc_parse_row(tag, sibling, day, time, url_base))
return results
def dc_parse_day(tag, heading, day, url_base):
results = []
for sibling in heading.next_siblings:
if sibling.name == DC_DAY_HEADING:
break
if sibling.name == DC_TIME_HEADING:
if "time" not in sibling.attrs.get("class", []):
continue
time = sibling.text.partition("-")[0]
results.extend(dc_parse_time(tag, sibling, day, time, url_base))
return results
def get_dc(tag, url, url_base):
r = requests.get(url)
doc = BeautifulSoup(r.text, "html.parser")
results = []
for h2 in doc.find_all(DC_DAY_HEADING):
day = h2.attrs.get("id")
if day not in set(["Thursday", "Friday", "Saturday", "Sunday"]):
continue
results.extend(dc_parse_day(tag, h2, day, url_base))
return results
def bh_parse_track(tag, heading, day, time):
results = []
for sibling in heading.next_siblings:
if not hasattr(sibling, "attrs"):
continue
if BH_TIME_CLASS in sibling.attrs.get("class", []):
break
if BH_TRACKS_CLASS not in sibling.attrs.get("class", []):
continue
for track in sibling.children:
if not hasattr(track, "find_all"):
continue
if len(track.find_all("a")) < 2:
continue
url = BH_BASE_URL + track.a.attrs["href"]
title = track.find_all("a", class_=BH_TITLE_CLASS)[0].text
presenters = []
for a in track.find_all("a", class_=BH_PRESENTER_CLASS):
presenters.append(a.text)
track_name = track.find_all("span", class_=BH_VENUE_CLASS)[0].text
time2 = track.find_all("span", class_=BH_VENUE_CLASS)[1].text
p = Presentation(tag, day, time, track_name, title, ", ".join(presenters), url)
results.append(p)
return results
def bh_parse_time(tag, heading, day, time):
results = []
for sibling in heading.next_siblings:
if not hasattr(sibling, "attrs"):
continue
if BH_TIME_CLASS in sibling.attrs.get("class", []):
break
if BH_TRACKS_CLASS in sibling.attrs.get("class", []):
results.extend(bh_parse_track(tag, heading, day, time))
return results
def get_bh(tag, day, url):
results = []
r = requests.get(url)
doc = BeautifulSoup(r.text, "html.parser")
for div in doc.find_all("div", class_=BH_TIME_CLASS):
time = div.text
results.extend(bh_parse_time(tag, div, day, time))
return results
def bsides_parse_row(heading, day):
if not heading.find_all("em"):
return []
time = re.match("[0-9:]+", heading.find_all("td", class_=BS_TIME_CLASS)[0].text.strip()).group(0)
title = heading.find_all("td", class_=BS_TITLE_CLASS)[0].text.partition("\r")[0].strip()
venue = heading.find_all("div", class_=BS_VENUE_CLASS)[0].text.strip()
presenter = heading.find_all("em", class_=BS_SPEAKER_CLASS)[0].text.strip().replace("Speakers: ", "")
url = BS_BASE_URL + heading.find_all("a")[0].attrs["href"]
p = Presentation("BS", day, time, venue, title, presenter, url)
return [p]
def get_bsides():
results = []
r = requests.get("https://bsideslv2015.sched.org/print?iframe=no&w=i:100;&sidebar=no&bg=dark")
doc = BeautifulSoup(r.text, "html.parser")
day = doc.h2.text # first h2 is first day
# sorry: hack
if "Tuesday" in day:
day = "Tuesday"
for tr in doc.table.find_all("tr"):
if len(tr.find_all("h2")) > 0:
day = tr.text
# sorry: hack
if "Wednesday" in day:
day = "Wednesday"
continue
results.extend(bsides_parse_row(tr, day))
return results
def main():
results = []
results.extend(get_dc("DC", DC_URL, DC_URL))
results.extend(get_dc("DC-L", DCL_URL, DCL_URL))
results.extend(get_bh("BH", "Wednesday", "https://www.blackhat.com/us-15/schedule/briefings-5.html"))
results.extend(get_bh("BH", "Thursday", "https://www.blackhat.com/us-15/schedule/briefings-6.html"))
results.extend(get_bh("BH-A", "Wednesday", "https://www.blackhat.com/us-15/schedule/arsenal-5.html"))
results.extend(get_bh("BH-A", "Thursday", "https://www.blackhat.com/us-15/schedule/arsenal-6.html"))
results.extend(get_bsides())
# Mapping[string:day, Mapping[string:time, List[Presentation]]]
days = defaultdict(functools.partial(defaultdict, set))
for result in results:
day = days[result.day]
time = day[result.time]
time.add(result)
cwriter = unicodecsv.writer(sys.stdout, delimiter='|', quoting=unicodecsv.QUOTE_MINIMAL, encoding="utf-8")
for dayname in ["Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]:
times = days[dayname]
for timename in sorted(times.keys()):
presentations = times[timename]
for presentation in presentations:
cwriter.writerow([
dayname,
timename,
presentation.conference,
presentation.track,
presentation.title,
presentation.presenter,
presentation.url])
if __name__ == "__main__":
import sys
main(*sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment