Last active
August 29, 2015 19:19
-
-
Save jefftriplett/496b0b6ff6c8999be3a4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
Quick attempt to extract speakers and talks from every DjangoCon US.

To install requirements:

    pip install click grab unicodecsv

Goals:

#. Extract speakers and talks from every DjangoCon US. (DONE)
#. Store extracted information in a useful format (mostly DONE)
#. Process information into an app
#. Expand for DjangoCon EU + AU
"""
import os

import click
import unicodecsv
from grab import Grab
def write_csv(filename, rows):
    """Write talk records to ``filename`` as a fully-quoted CSV file.

    A header row with the column names is written first, followed by one
    line per item in ``rows``.  Values are looked up with ``dict.get``, so a
    key that is missing from a record is passed to the writer as ``None``.

    Args:
        filename: Path of the CSV file to create or overwrite.
        rows: Iterable of dicts keyed by the column names listed below.
    """
    columns = [
        'conference',
        'year',
        'start_time',
        'speaker',
        'talk_name',
        'talk_url',
        'video_url',
        'slides_url',
    ]

    # Callers write into 'csv/...'; create that directory on first use
    # instead of failing when it does not exist yet.
    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    # unicodecsv encodes each cell itself and writes bytes, so the file has
    # to be opened in binary mode rather than text mode.
    with open(filename, 'wb') as f:
        writer = unicodecsv.writer(f, quoting=unicodecsv.QUOTE_ALL)
        writer.writerow(columns)
        for row in rows:
            writer.writerow([row.get(column) for column in columns])
# Root click group: it does no work itself and only serves as the parent
# that sub-commands are registered on.
@click.group()
def cli():
    return None
@cli.command()
@click.pass_context
def fetch_all(ctx):
    """Run every per-year fetch command, oldest conference first."""
    # The fetch_* names are click Command objects (they are wrapped by
    # @cli.command()).  Calling one directly would start a nested CLI run
    # that re-parses sys.argv and exits, so they are dispatched through the
    # current context instead.
    commands = (
        fetch_2008,
        fetch_2009,
        fetch_2010,
        fetch_2011,
        fetch_2012,
        fetch_2013,
        fetch_2014,
        fetch_2015,
    )
    for command in commands:
        ctx.invoke(command)
@cli.command()
def fetch_2008():
    """Scrape the DjangoCon US 2008 program into csv/djangocon-us-2008.csv."""
    url = 'http://web.archive.org/web/20090217112815id_/http://djangocon.org/program'
    grab = Grab()
    grab.go(url)

    # Shape of the rows being parsed:
    #
    #   <tr class="odd first">
    #     <td>9:00am - 9:50am</td>
    #     <td>Building 40</td>
    #     <td>-</td>
    #     <td>Doors Open/Registration</td>
    #   </tr>
    #   <tr class="even">
    #     <td>10:00am - 10:10am</td>
    #     <td>Track 1</td>
    #     <td>Robert Lofthouse</td>
    #     <td>Keynote: Chairman's Opening Statement</td>
    #   </tr>

    data = []
    for row in grab.doc('//table/tbody/tr'):
        click.echo(click.style(row.text(), fg='blue'))

        cells = row.select('td')
        # Cells 0, 2 and 3 are read below; skip rows that do not have them
        # rather than failing on the index lookup.
        if len(cells) < 4:
            continue

        start_time = cells[0].text()
        speaker = cells[2].text()
        talk_name = cells[3].text()

        click.echo(click.style(cells.text(), fg='yellow'))
        click.echo(u'start_time: {0}'.format(start_time))
        click.echo(u'speaker: {0}'.format(speaker))
        click.echo(u'talk name: {0}'.format(talk_name))
        click.echo()

        data.append({
            'conference': 'djangocon-us',
            'year': '2008',
            'start_time': start_time,
            'speaker': speaker,
            'talk_name': talk_name,
            'talk_url': '',
        })

    write_csv('csv/djangocon-us-2008.csv', data)
@cli.command()
def fetch_2009():
    """Scrape the DjangoCon US 2009 schedule into csv/djangocon-us-2009.csv."""
    url = 'http://web.archive.org/web/20100428054846id_/http://www.djangocon.org/2009/conference/schedule/'
    grab = Grab()
    grab.go(url)

    # Shape of the rows being parsed (copied from the archived page):
    #
    #   <tr>
    #     <td class="time">8:00 - 18:00</td>
    #     <td class="talk colspan="2"><p>Registration opens for the rest of the day.</p></td>
    #   </tr>
    #   <tr class="even">
    #     <td class="time">9:00</td>
    #     <td class="talk" colspan="2"><p>Chairman's Opening Remarks & Introduction from the Fake Jacob Kaplan-Moss</p></td>
    #   </tr>

    data = []
    for row in grab.doc('//table/tbody/tr'):
        # Echo each row once (the original printed it twice).
        click.echo(click.style(row.text(), fg='blue'))

        cells = row.select('td')
        # .text() on the selection is used as the row's start time; rows
        # without any cell get an empty start time.
        start_time = cells.text() if cells.exists() else ''

        # Every cell after the first one is treated as a talk slot.
        for talk in cells[1:]:
            speaking_info = talk.select('p')
            # Only cells with at least two <p> elements are recorded: the
            # first is used as the talk name, the second as the speaker.
            if len(speaking_info) < 2:
                continue

            talk_name = speaking_info[0].text()
            speaker = speaking_info[1].text()

            click.echo(click.style(talk.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo()

            data.append({
                'conference': 'djangocon-us',
                'year': '2009',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': '',
            })

    write_csv('csv/djangocon-us-2009.csv', data)
@cli.command()
def fetch_2010():
    # Scrapes the archived DjangoCon US 2010 schedule table and writes one
    # record per talk cell to csv/djangocon-us-2010.csv.
    grab = Grab()
    grab.go('http://web.archive.org/web/20101005140539id_/http://djangocon.us/schedule/')

    records = []
    for row in grab.doc('//table/tr'):
        all_cells = row.select('td')
        start_time = all_cells.text() if all_cells.exists() else ''

        click.echo(click.style(row.text(), fg='blue'))

        # Every cell after the first one is treated as a talk slot.
        for cell in row.select('td')[1:]:
            # NOTE(review): `cell` is already a <td>, so this child-<td>
            # lookup may never match -- confirm against the archived markup.
            speaker_nodes = cell.select('td[contains(@class, "speaker")]')
            speaker = speaker_nodes.text().strip() if speaker_nodes.exists() else ''

            link = cell.select('a')
            if not link.exists():
                talk_name, talk_url = '', ''
            else:
                talk_name = link.text()
                talk_url = 'http://web.archive.org/web/20101005140539id_/http://djangocon.us/{0}'.format(link.attr('href', ''))

            click.echo(click.style(cell.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            record = {
                'conference': 'djangocon-us',
                'year': '2010',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            }
            records.append(record)

    write_csv('csv/djangocon-us-2010.csv', records)
@cli.command()
def fetch_2011():
    # Scrapes the DjangoCon US 2011 schedule table and writes one record per
    # talk cell to csv/djangocon-us-2011.csv.
    grab = Grab()
    grab.go('http://2011.djangocon.us/schedule/')

    records = []
    for row in grab.doc('//table/tr'):
        time_cell = row.select('td[contains(@class, "time")]')
        start_time = time_cell.text() if time_cell.exists() else ''

        click.echo(click.style(row.text(), fg='blue'))

        # Every cell after the first one is treated as a talk slot.
        for cell in row.select('td')[1:]:
            speaker_nodes = cell.select('div[contains(@class, "speaker")]')
            speaker = speaker_nodes.text().strip() if speaker_nodes.exists() else ''

            link = cell.select('div[contains(@class, "title")]/a')
            if not link.exists():
                talk_name, talk_url = '', ''
            else:
                talk_name = link.text()
                talk_url = 'http://2011.djangocon.us/{0}'.format(link.attr('href', ''))

            click.echo(click.style(cell.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            record = {
                'conference': 'djangocon-us',
                'year': '2011',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            }
            records.append(record)

    write_csv('csv/djangocon-us-2011.csv', records)
@cli.command()
def fetch_2012():
    # Scrapes the DjangoCon US 2012 schedule table and writes one record per
    # talk cell to csv/djangocon-us-2012.csv.
    grab = Grab()
    grab.go('http://2012.djangocon.us/schedule/')

    records = []
    for row in grab.doc('//table/tr'):
        time_cell = row.select('td[contains(@class, "time")]')
        start_time = time_cell.text() if time_cell.exists() else ''

        click.echo(click.style(row.text(), fg='blue'))

        # Every cell after the first one is treated as a talk slot.
        for cell in row.select('td')[1:]:
            speaker_nodes = cell.select('div[contains(@class, "speaker")]')
            speaker = speaker_nodes.text().strip() if speaker_nodes.exists() else ''

            link = cell.select('div[contains(@class, "title")]/a')
            if not link.exists():
                talk_name, talk_url = '', ''
            else:
                talk_name = link.text()
                talk_url = 'http://2012.djangocon.us/{0}'.format(link.attr('href', ''))

            click.echo(click.style(cell.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            record = {
                'conference': 'djangocon-us',
                'year': '2012',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            }
            records.append(record)

    write_csv('csv/djangocon-us-2012.csv', records)
@cli.command()
def fetch_2013():
    """Scrape the DjangoCon US 2013 schedule into csv/djangocon-us-2013.csv."""
    url = 'https://web.archive.org/web/20131022134635id_/http://djangocon.us/schedule/'
    grab = Grab()
    grab.go(url)

    data = []
    for row in grab.doc('//table/tbody/tr'):
        time_cell = row.select('td[contains(@class, "time")]')
        start_time = time_cell.text() if time_cell.exists() else ''

        click.echo(click.style(row.text(), fg='blue'))

        # Every cell after the first one is treated as a talk slot.
        for talk in row.select('td')[1:]:
            speaker_nodes = talk.select('span[contains(@class, "speaker")]')
            speaker = speaker_nodes.text().strip() if speaker_nodes.exists() else ''

            anchor = talk.select('span[contains(@class, "title")]/a')
            if anchor.exists():
                talk_name = anchor.text()
                # Link into the same archive snapshot the schedule was read
                # from; this previously pointed at the 2012 site.
                talk_url = 'https://web.archive.org/web/20131022134635id_/http://djangocon.us/{0}'.format(anchor.attr('href', ''))
            else:
                talk_name = ''
                talk_url = ''

            click.echo(click.style(talk.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            data.append({
                'conference': 'djangocon-us',
                'year': '2013',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            })

    write_csv('csv/djangocon-us-2013.csv', data)
@cli.command()
def fetch_2014():
    """Scrape the DjangoCon US 2014 schedule into csv/djangocon-us-2014.csv."""
    url = 'http://2014.djangocon.us/schedule/'
    g = Grab()
    g.go(url)

    data = []
    for talk in g.doc('//table/tbody/tr'):
        # All lookups below run against the row's <td> cells as a single
        # selection, so one record is produced per table row.
        cells = talk.select('td')
        anchor = cells.select('p/a')
        start_time = cells.select('p').text()

        speaker_nodes = cells.select('p[contains(@class, "speaker")]')
        speaker = speaker_nodes.text() if speaker_nodes.exists() else ''

        # click.echo replaces the Python-2-only print statement and matches
        # the output style used by the other fetch commands.
        click.echo(cells.text())

        if anchor.exists():
            talk_name = anchor.text()
            # Build the link on the 2014 site; this previously pointed at
            # the 2015 domain.
            talk_url = 'http://2014.djangocon.us/{0}'.format(
                anchor.attr('href', '')
            )
        else:
            talk_name = ''
            talk_url = ''

        click.echo(click.style(talk.text(), fg='blue'))
        click.echo(u'start_time: {0}'.format(start_time))
        click.echo(u'speaker: {0}'.format(speaker))
        click.echo(u'talk name: {0}'.format(talk_name))
        click.echo(u'talk url: {0}'.format(talk_url))
        click.echo()

        data.append({
            'conference': 'djangocon-us',
            'year': '2014',
            'start_time': start_time,
            'speaker': speaker,
            'talk_name': talk_name,
            'talk_url': talk_url,
        })

    write_csv('csv/djangocon-us-2014.csv', data)
@cli.command()
def fetch_2015():
    """Scrape the DjangoCon US 2015 sessions into csv/djangocon-us-2015.csv."""
    url = 'http://2015.djangocon.us/schedule/general-sessions/'
    g = Grab()
    g.go(url)

    data = []
    for talk in g.doc('//table/tbody/tr'):
        # One record is produced per <td> cell of the row.
        for cell in talk.select('td'):
            anchor = cell.select('p[contains(@class, "start-time")]/a')
            start_time = cell.select('p').text()

            speaker_nodes = cell.select('p[contains(@class, "speaker")]')
            speaker = speaker_nodes.text() if speaker_nodes.exists() else ''

            # click.echo replaces the Python-2-only print statement and
            # matches the output style used by the other fetch commands.
            click.echo(cell.text())

            if anchor.exists():
                talk_name = anchor.text()
                talk_url = 'https://2015.djangocon.us{0}'.format(
                    anchor.attr('href', '')
                )
            else:
                talk_name = ''
                talk_url = ''

            click.echo(click.style(talk.text(), fg='blue'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            data.append({
                'conference': 'djangocon-us',
                'year': '2015',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            })

    write_csv('csv/djangocon-us-2015.csv', data)
# Run the click command group only when executed as a script.
if '__main__' == __name__:
    cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment