Skip to content

Instantly share code, notes, and snippets.

@derekm
Last active August 29, 2015 13:57
Show Gist options
  • Save derekm/9755742 to your computer and use it in GitHub Desktop.
Save derekm/9755742 to your computer and use it in GitHub Desktop.
import re
import sys
import urllib2

import bs4
# Walk TED's "quick list" of talks page by page and print each talk's
# transcript to stdout: first the transcript URL, then one line per
# transcript paragraph.  Diagnostics go to stderr so stdout stays clean.
root = "http://www.ted.com"


def _fetch(path):
    """Return the raw response body for the page at ``root + path``.

    Raises whatever ``urllib2.urlopen`` raises (e.g. ``urllib2.URLError``)
    when the request fails.
    """
    response = urllib2.urlopen(urllib2.Request(root + path))
    try:
        return response.read()
    finally:
        # Release the connection deterministically instead of relying on
        # garbage collection.
        response.close()


html = _fetch("/talks/quick-list")
while html:
    talks = bs4.BeautifulSoup(html)

    for a in talks.select('td > a[href^="/talks/"]'):
        path = a['href'] + '/transcript'
        url = root + path
        try:
            # A separate variable keeps the listing page's markup (and the
            # loop condition) from being clobbered by each talk page.
            talk_html = _fetch(path)
            print(url)
            talk = bs4.BeautifulSoup(talk_html)
            for para in talk.select('span.talk-transcript__para__text'):
                fragments = para.select('span.talk-transcript__fragment')
                # Flatten each fragment onto a single line, then join the
                # fragments of one paragraph with spaces.
                print(' '.join(
                    [re.sub('\n', ' ', i.text.strip()) for i in fragments]))
        except Exception as err:
            # Best effort: one bad talk must not abort the whole crawl, but
            # report it.  ``Exception`` (not a bare ``except``) lets
            # KeyboardInterrupt / SystemExit stop the script.
            sys.stderr.write("skipping %r: %r\n" % (url, err))

    # Follow the first <li class="next"> link; stop when there is none.
    next_items = talks('li', 'next')
    next_link = next_items[0].a if next_items else None
    next_href = next_link.get('href') if next_link is not None else None
    if not next_href:
        break
    try:
        html = _fetch(next_href)
    except Exception as err:
        # The original also stopped paginating on any failure here; keep
        # that, but say why.
        sys.stderr.write(
            "stopping: cannot fetch %r: %r\n" % (root + next_href, err))
        break
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment