Skip to content

Instantly share code, notes, and snippets.

@jefftriplett
Last active August 29, 2015 19:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jefftriplett/496b0b6ff6c8999be3a4 to your computer and use it in GitHub Desktop.
Save jefftriplett/496b0b6ff6c8999be3a4 to your computer and use it in GitHub Desktop.
"""
Quick attempt to extract speakers and talks from every DjangoCon US.
To install requirements:
pip install click grab unicodecsv
Goals:
#. Extract speakers and talks from every DjangoCon US. (DONE)
#. Store extracted information in a useful format (mostly DONE)
#. Process information into an app
#. Expand for DjangoCon EU + AU
"""
import click
import unicodecsv
from grab import Grab
def write_csv(filename, rows):
    """Write scraped talk records to ``filename`` as a fully quoted CSV.

    Args:
        filename: Destination path. Its parent directory is created when it
            does not exist yet.
        rows: Iterable of dicts keyed by the column names below. A key that
            is absent from a dict is passed to the writer as ``None``.
    """
    import os

    columns = [
        'conference',
        'year',
        'start_time',
        'speaker',
        'talk_name',
        'talk_url',
        'video_url',
        'slides_url',
    ]

    # The fetch commands all write below "csv/", which may not exist yet.
    directory = os.path.dirname(filename)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)

    # unicodecsv encodes each row itself and writes bytes, so the file must
    # be opened in binary mode.
    with open(filename, 'wb') as f:
        writer = unicodecsv.writer(f, quoting=unicodecsv.QUOTE_ALL)
        writer.writerow(columns)
        for row in rows:
            writer.writerow([row.get(column) for column in columns])
@click.group()
def cli():
    """Extract DjangoCon US speakers and talks into per-year CSV files."""
@cli.command()
@click.pass_context
def fetch_all(ctx):
    """Run every per-year fetch command, oldest conference first."""
    yearly_commands = (
        fetch_2008,
        fetch_2009,
        fetch_2010,
        fetch_2011,
        fetch_2012,
        fetch_2013,
        fetch_2014,
        fetch_2015,
    )
    for command in yearly_commands:
        # Each fetch_* name is a click Command, not a plain function. Calling
        # it directly would re-parse sys.argv and exit the process, so the
        # callback is run through the current context instead.
        ctx.invoke(command)
@cli.command()
def fetch_2008():
    """Scrape the archived DjangoCon US 2008 program into a CSV file."""
    url = 'http://web.archive.org/web/20090217112815id_/http://djangocon.org/program'
    grab = Grab()
    grab.go(url)

    # Sample of the markup being parsed (time, track, speaker, talk name):
    #
    #   <tr class="odd first">
    #     <td>9:00am - 9:50am</td>
    #     <td>Building 40</td>
    #     <td>-</td>
    #     <td>Doors Open/Registration</td>
    #   </tr>
    #   <tr class="even">
    #     <td>10:00am - 10:10am</td>
    #     <td>Track 1</td>
    #     <td>Robert Lofthouse</td>
    #     <td>Keynote: Chairman's Opening Statement</td>
    #   </tr>

    data = []
    for row in grab.doc('//table/tbody/tr'):
        click.echo(click.style(row.text(), fg='blue'))

        cells = row.select('td')
        # Cells 0, 2 and 3 are read below, so a row with fewer than four
        # <td> elements (including none at all) is skipped rather than
        # allowed to raise IndexError.
        if len(cells) < 4:
            continue

        start_time = cells[0].text()
        speaker = cells[2].text()
        talk_name = cells[3].text()

        click.echo(click.style(cells.text(), fg='yellow'))
        click.echo(u'start_time: {0}'.format(start_time))
        click.echo(u'speaker: {0}'.format(speaker))
        click.echo(u'talk name: {0}'.format(talk_name))
        click.echo()

        data.append({
            'conference': 'djangocon-us',
            'year': '2008',
            'start_time': start_time,
            'speaker': speaker,
            'talk_name': talk_name,
            # The 2008 program has no per-talk links to record.
            'talk_url': '',
        })

    write_csv('csv/djangocon-us-2008.csv', data)
@cli.command()
def fetch_2009():
    """Scrape the archived DjangoCon US 2009 schedule into a CSV file."""
    url = 'http://web.archive.org/web/20100428054846id_/http://www.djangocon.org/2009/conference/schedule/'
    grab = Grab()
    grab.go(url)

    # Sample of the markup being parsed:
    #
    #   <tr>
    #     <td class="time">8:00 - 18:00</td>
    #     <td class="talk" colspan="2"><p>Registration opens for the rest of the day.</p></td>
    #   </tr>
    #   <tr class="even">
    #     <td class="time">9:00</td>
    #     <td class="talk" colspan="2"><p>Chairman's Opening Remarks ...</p></td>
    #   </tr>

    data = []
    for row in grab.doc('//table/tbody/tr'):
        # Echo the raw row once for progress output.
        click.echo(click.style(row.text(), fg='blue'))

        cells = row.select('td')
        # text() on the whole selection is used as the start time
        # (presumably the first cell, i.e. the "time" column -- confirm
        # against the selection library).
        start_time = cells.text() if cells.exists() else ''

        # Every cell after the first one is treated as a talk slot.
        for talk in cells[1:]:
            paragraphs = talk.select('p')
            # A talk is only recorded when the cell has at least two <p>
            # elements: the first is the talk name, the second the speaker.
            if len(paragraphs) < 2:
                continue

            talk_name = paragraphs[0].text()
            speaker = paragraphs[1].text()

            click.echo(click.style(talk.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo()

            data.append({
                'conference': 'djangocon-us',
                'year': '2009',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': '',
            })

    write_csv('csv/djangocon-us-2009.csv', data)
@cli.command()
def fetch_2010():
    # Scrapes the archived DjangoCon US 2010 schedule table and writes one
    # CSV record per talk cell (every <td> after the first in each row).
    schedule_url = 'http://web.archive.org/web/20101005140539id_/http://djangocon.us/schedule/'
    page = Grab()
    page.go(schedule_url)

    records = []
    for table_row in page.doc('//table/tr'):
        cells = table_row.select('td')
        start_time = cells.text() if cells.exists() else ''

        click.echo(click.style(table_row.text(), fg='blue'))

        for cell in cells[1:]:
            speaker_node = cell.select('td[contains(@class, "speaker")]')
            speaker = speaker_node.text().strip() if speaker_node.exists() else ''

            # Talk name and link both come from the cell's anchor, if any.
            link = cell.select('a')
            talk_name = ''
            talk_url = ''
            if link.exists():
                talk_name = link.text()
                talk_url = 'http://web.archive.org/web/20101005140539id_/http://djangocon.us/{0}'.format(link.attr('href', ''))

            click.echo(click.style(cell.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            records.append({
                'conference': 'djangocon-us',
                'year': '2010',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            })

    write_csv('csv/djangocon-us-2010.csv', records)
@cli.command()
def fetch_2011():
    # Scrapes the DjangoCon US 2011 schedule table and writes one CSV record
    # per talk cell (every <td> after the first in each row).
    page = Grab()
    page.go('http://2011.djangocon.us/schedule/')

    records = []
    for table_row in page.doc('//table/tr'):
        time_cell = table_row.select('td[contains(@class, "time")]')
        if not time_cell.exists():
            start_time = ''
        else:
            start_time = time_cell.text()

        click.echo(click.style(table_row.text(), fg='blue'))

        for cell in table_row.select('td')[1:]:
            speaker_div = cell.select('div[contains(@class, "speaker")]')
            if not speaker_div.exists():
                speaker = ''
            else:
                speaker = speaker_div.text().strip()

            # The talk title is a link inside the "title" div.
            title_link = cell.select('div[contains(@class, "title")]/a')
            if not title_link.exists():
                talk_name = ''
                talk_url = ''
            else:
                talk_name = title_link.text()
                href = title_link.attr('href', '')
                talk_url = 'http://2011.djangocon.us/{0}'.format(href)

            click.echo(click.style(cell.text(), fg='yellow'))
            for line in (
                u'start_time: {0}'.format(start_time),
                u'speaker: {0}'.format(speaker),
                u'talk name: {0}'.format(talk_name),
                u'talk url: {0}'.format(talk_url),
            ):
                click.echo(line)
            click.echo()

            records.append({
                'conference': 'djangocon-us',
                'year': '2011',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            })

    write_csv('csv/djangocon-us-2011.csv', records)
@cli.command()
def fetch_2012():
    # Scrapes the DjangoCon US 2012 schedule table and writes one CSV record
    # per talk cell (every <td> after the first in each row).
    schedule_url = 'http://2012.djangocon.us/schedule/'
    browser = Grab()
    browser.go(schedule_url)

    collected = []
    for tr in browser.doc('//table/tr'):
        time_sel = tr.select('td[contains(@class, "time")]')
        start_time = time_sel.text() if time_sel.exists() else ''

        click.echo(click.style(tr.text(), fg='blue'))

        slots = tr.select('td')[1:]
        for slot in slots:
            speaker_sel = slot.select('div[contains(@class, "speaker")]')
            speaker = speaker_sel.text().strip() if speaker_sel.exists() else ''

            # Default to blanks; filled in only when the title link exists.
            talk_name, talk_url = '', ''
            title_sel = slot.select('div[contains(@class, "title")]/a')
            if title_sel.exists():
                talk_name = title_sel.text()
                talk_url = 'http://2012.djangocon.us/{0}'.format(title_sel.attr('href', ''))

            click.echo(click.style(slot.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            record = {
                'conference': 'djangocon-us',
                'year': '2012',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            }
            collected.append(record)

    write_csv('csv/djangocon-us-2012.csv', collected)
@cli.command()
def fetch_2013():
    """Scrape the archived DjangoCon US 2013 schedule into a CSV file."""
    # Talk links are resolved against the same archive snapshot the schedule
    # is fetched from, not against another year's site.
    site_root = 'https://web.archive.org/web/20131022134635id_/http://djangocon.us'
    url = '{0}/schedule/'.format(site_root)
    grab = Grab()
    grab.go(url)

    data = []
    for row in grab.doc('//table/tbody/tr'):
        time_cell = row.select('td[contains(@class, "time")]')
        start_time = time_cell.text() if time_cell.exists() else ''

        click.echo(click.style(row.text(), fg='blue'))

        # Every cell after the first one is treated as a talk slot.
        for talk in row.select('td')[1:]:
            speaker = talk.select('span[contains(@class, "speaker")]')
            speaker = speaker.text().strip() if speaker.exists() else ''

            anchor = talk.select('span[contains(@class, "title")]/a')
            if anchor.exists():
                talk_name = anchor.text()
                talk_url = '{0}/{1}'.format(site_root, anchor.attr('href', ''))
            else:
                talk_name = ''
                talk_url = ''

            click.echo(click.style(talk.text(), fg='yellow'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            data.append({
                'conference': 'djangocon-us',
                'year': '2013',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            })

    write_csv('csv/djangocon-us-2013.csv', data)
@cli.command()
def fetch_2014():
    """Scrape the DjangoCon US 2014 schedule into a CSV file."""
    # Talk links are resolved against the 2014 site the schedule comes from.
    site_root = 'http://2014.djangocon.us'
    url = '{0}/schedule/'.format(site_root)
    g = Grab()
    g.go(url)

    data = []
    for talk in g.doc('//table/tbody/tr'):
        # One record per table row; all lookups run over the row's cells.
        cells = talk.select('td')
        anchor = cells.select('p/a')
        start_time = cells.select('p').text()

        speaker = cells.select('p[contains(@class, "speaker")]')
        speaker = speaker.text() if speaker.exists() else ''

        # click.echo is used instead of a print statement so the module
        # parses on Python 3 and matches the rest of the file.
        click.echo(cells.text())

        if anchor.exists():
            talk_name = anchor.text()
            talk_url = '{0}/{1}'.format(site_root, anchor.attr('href', ''))
        else:
            talk_name = ''
            talk_url = ''

        click.echo(click.style(talk.text(), fg='blue'))
        click.echo(u'start_time: {0}'.format(start_time))
        click.echo(u'speaker: {0}'.format(speaker))
        click.echo(u'talk name: {0}'.format(talk_name))
        click.echo(u'talk url: {0}'.format(talk_url))
        click.echo()

        data.append({
            'conference': 'djangocon-us',
            'year': '2014',
            'start_time': start_time,
            'speaker': speaker,
            'talk_name': talk_name,
            'talk_url': talk_url,
        })

    write_csv('csv/djangocon-us-2014.csv', data)
@cli.command()
def fetch_2015():
    """Scrape the DjangoCon US 2015 general-sessions schedule into a CSV file."""
    url = 'http://2015.djangocon.us/schedule/general-sessions/'
    g = Grab()
    g.go(url)

    data = []
    for talk in g.doc('//table/tbody/tr'):
        # One record per cell of each table row.
        for cell in talk.select('td'):
            anchor = cell.select('p[contains(@class, "start-time")]/a')
            start_time = cell.select('p').text()

            speaker = cell.select('p[contains(@class, "speaker")]')
            speaker = speaker.text() if speaker.exists() else ''

            # click.echo is used instead of a print statement so the module
            # parses on Python 3 and matches the rest of the file.
            click.echo(cell.text())

            if anchor.exists():
                talk_name = anchor.text()
                # No separator is added here, unlike the other years'
                # URLs (presumably hrefs are root-relative -- confirm).
                talk_url = 'https://2015.djangocon.us{0}'.format(
                    anchor.attr('href', '')
                )
            else:
                talk_name = ''
                talk_url = ''

            click.echo(click.style(talk.text(), fg='blue'))
            click.echo(u'start_time: {0}'.format(start_time))
            click.echo(u'speaker: {0}'.format(speaker))
            click.echo(u'talk name: {0}'.format(talk_name))
            click.echo(u'talk url: {0}'.format(talk_url))
            click.echo()

            data.append({
                'conference': 'djangocon-us',
                'year': '2015',
                'start_time': start_time,
                'speaker': speaker,
                'talk_name': talk_name,
                'talk_url': talk_url,
            })

    write_csv('csv/djangocon-us-2015.csv', data)
# Dispatch to the click command group only when run as a script.
if __name__ == "__main__":
    cli()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment