# derekm/get-talks.py

## get-talks.py
import re
import sys
import urllib2

import bs4

# Scrape TED talk transcripts: walk the paginated "quick list" of talks and,
# for each talk, print its transcript URL followed by one line per transcript
# paragraph.  Targets Python 2 (it relies on urllib2).

root = "http://www.ted.com"

# Parser handed to BeautifulSoup explicitly, so parsing does not depend on
# which optional parser libraries happen to be installed.
_PARSER = "html.parser"


def _fetch(url):
    """Return the response body of a GET request for *url*.

    Errors raised by urllib2 (e.g. urllib2.URLError) propagate to the caller.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails.
        response.close()


def _write_line(stream, text):
    """Write *text* followed by a newline to *stream*, encoded as UTF-8.

    By default Python 2 falls back to ASCII when stdout is not a terminal,
    so printing unicode text directly would raise UnicodeEncodeError on the
    first non-ASCII character.  Encoding explicitly avoids that.
    """
    stream.write(text.encode('utf-8') + '\n')


def _print_transcript(url):
    """Fetch the transcript page at *url* and print it to stdout.

    The URL is printed first (only once the fetch has succeeded), then each
    transcript paragraph on its own line, with its fragments stripped,
    newlines replaced by spaces, and joined with single spaces.
    """
    transcript_html = _fetch(url)
    _write_line(sys.stdout, url)
    page = bs4.BeautifulSoup(transcript_html, _PARSER)
    for para in page.select('span.talk-transcript__para__text'):
        fragments = para.select('span.talk-transcript__fragment')
        _write_line(sys.stdout, u' '.join(
            fragment.text.strip().replace(u'\n', u' ')
            for fragment in fragments))


def main():
    """Print the transcript of every talk reachable from the quick list.

    A failure on a single talk is reported on stderr and that talk is
    skipped.  The walk ends when a listing page has no "next" link or the
    next listing page cannot be fetched.
    """
    listing_html = _fetch(root + "/talks/quick-list")
    while listing_html:
        talks = bs4.BeautifulSoup(listing_html, _PARSER)

        for link in talks.select('td > a[href^="/talks/"]'):
            url = root + link['href'] + '/transcript'
            try:
                _print_transcript(url)
            except Exception as exc:
                # Best effort: one bad talk must not abort the whole run.
                # Catching Exception (not a bare except) lets Ctrl-C through.
                _write_line(sys.stderr, u"skipping %s: %r" % (url, exc))

        try:
            next_href = talks('li', 'next')[0].a['href']
        except (IndexError, TypeError, KeyError):
            # No usable "next" link on this page: this was the last page.
            break

        try:
            listing_html = _fetch(root + next_href)
        except Exception as exc:
            _write_line(sys.stderr,
                        u"stopping at %s: %r" % (root + next_href, exc))
            break


if __name__ == "__main__":
    main()