Skip to content

Instantly share code, notes, and snippets.

@pipwilson
Created April 24, 2012 20:17
Show Gist options
  • Save pipwilson/2483345 to your computer and use it in GitHub Desktop.
Save pipwilson/2483345 to your computer and use it in GitHub Desktop.
TED scraping info
# from https://github.com/plexinc-plugins/TED-Talks.bundle/blob/master/Contents/Code/__init__.py
import re, datetime
from string import ascii_uppercase
###################################################################################################
# Title passed to the prefix handler and used as the default container title1.
PLUGIN_TITLE = 'TED Talks'

# Site root; relative hrefs scraped from pages are appended to this.
TED_BASE = 'http://www.ted.com'

# Page and feed locations, all rooted at TED_BASE.
# TED_TALKS_FILTER takes (tag id, ordering key); TED_SPEAKERS takes a page number.
TED_TALKS_FILTER = TED_BASE + '/talks/browse.json?tagid=%s&orderedby=%s'
TED_THEMES = TED_BASE + '/themes/atoz'
TED_TAGS = TED_BASE + '/talks/tags'
TED_SPEAKERS = TED_BASE + '/speakers/atoz/page/%d'

# Namespace map used when querying media:* elements in theme RSS feeds.
MEDIA_NS = dict(media='http://search.yahoo.com/mrss/')

# Bundled resource names for the default background art and icon.
ART_DEFAULT = 'art-default.jpg'
ICON_DEFAULT = 'icon-default.jpg'
###################################################################################################
def Start():
    """Register the plugin route and view groups, then set shared defaults.

    Installs VideoMainMenu under the "/video/ted" prefix, declares the
    "InfoList" and "List" view groups, and assigns class-level defaults on
    ObjectContainer / DirectoryObject plus the HTTP cache time and User-Agent.
    """
    Plugin.AddPrefixHandler("/video/ted", VideoMainMenu, PLUGIN_TITLE, ICON_DEFAULT, ART_DEFAULT)

    # Each view group's name doubles as its view mode.
    for group in ("InfoList", "List"):
        Plugin.AddViewGroup(group, viewMode=group, mediaType="items")

    # Defaults applied to every container / directory unless overridden.
    ObjectContainer.art = R(ART_DEFAULT)
    ObjectContainer.title1 = PLUGIN_TITLE
    ObjectContainer.view_group = "InfoList"
    DirectoryObject.thumb = R(ICON_DEFAULT)

    # HTTP settings for all requests made through the HTTP object.
    HTTP.CacheTime = CACHE_1DAY
    HTTP.Headers['User-Agent'] = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:10.0.2) "
        "Gecko/20100101 Firefox/10.0.2"
    )
####################################################################################################
def VideoMainMenu():
    """Build the top-level menu: one directory per way of browsing talks.

    Returns an ObjectContainer whose entries call back into FrontPageList,
    ThemeList, TagsList and SpeakersAZ, each receiving its own title as
    ``name``.
    """
    sections = (
        (FrontPageList, "Front Page"),
        (ThemeList, "Themes"),
        (TagsList, "Tags"),
        (SpeakersAZ, "Speakers"),
    )
    menu = ObjectContainer(view_group="List")
    for handler, label in sections:
        menu.add(DirectoryObject(key=Callback(handler, name=label), title=label))
    return menu
####################################################################################################
def SpeakersAZ(name):
    """Build an A-Z index; each letter opens SpeakersList for that letter.

    ``name`` is used as the container's secondary title.
    """
    index = ObjectContainer(title2=name, view_group="List")
    # ascii_uppercase is already an iterable of the 26 letters in order.
    for letter in ascii_uppercase:
        index.add(DirectoryObject(key=Callback(SpeakersList, char=letter), title=letter))
    return index
####################################################################################################
def SpeakersList(char, page=1):
    """List the speakers filed under the heading ``char`` in the A-Z index.

    Starting at ``page``, every index page is fetched and scanned for an
    ``<h3>`` whose text equals ``char``; the links in the ``<ul>`` following
    that heading become directory entries titled "Last, First". Paging stops
    after the first page that carries no ``a.next`` link.

    Fixes over the previous loop: the starting page is fetched only once, and
    it is scanned even when it has no "next" link (previously a single-page
    index, or a call with ``page`` set to the last page, produced no entries).

    Returns a MessageContainer instead when no speaker was found.
    """
    oc = ObjectContainer(title2=char, view_group="List")
    heading_query = '//h3[text()="' + char + '"]/following-sibling::ul'

    current = page
    while True:
        content = HTML.ElementFromURL(TED_SPEAKERS % (current), cacheTime=CACHE_1WEEK)

        letter_list = content.xpath(heading_query)
        # Only trust the match when exactly one list follows the heading.
        if len(letter_list) == 1:
            for speaker in letter_list[0].xpath('./li/a'):
                # "First Rest Of Name" -> "Rest Of Name, First"; a single-word
                # name has nothing to swap, and strip() removes any stray ", ".
                name_parts = speaker.text.split(" ", 1)
                name_parts.reverse()
                speaker_name = ", ".join(name_parts).strip(", ")
                url = TED_BASE + speaker.get('href')
                oc.add(DirectoryObject(
                    key=Callback(SpeakerTalks, name=speaker_name, url=url),
                    title=speaker_name,
                    thumb=Callback(Photo, url=url)))

        # Decide whether to continue only after the current page was scanned.
        if len(content.xpath('//a[@class="next"]')) == 0:
            break
        current = current + 1

    if len(oc) == 0:
        return MessageContainer("Empty", "There aren't any speakers whose name starts with " + char)
    return oc
####################################################################################################
def SpeakerTalks(name, url):
    """List the talks shown on a speaker's page.

    ``url`` is the speaker page; each ``dl.box.clearfix`` element on it is
    read as one talk. The talk's ``<em>`` text is split on " Posted: " into a
    timecode (left part) and a posting date (right part). The second ``<img>``
    in the entry supplies the thumbnail.

    Returns a MessageContainer instead when the page yields no talks.
    """
    listing = ObjectContainer(title2=name)

    for entry in HTML.ElementFromURL(url).xpath('//dl[@class="box clearfix"]'):
        title = entry.xpath('.//h4/a')[0].text
        talk_url = TED_BASE + entry.xpath('.//h4/a')[0].get('href')

        # "<timecode> Posted: <date>"
        meta = entry.xpath('.//em')[0].text_content().split(" Posted: ")
        duration = CalculateDuration(meta[0])
        date = Datetime.ParseDate(meta[1]).date()

        thumb = entry.xpath('.//img')[1].get('src')
        listing.add(VideoClipObject(
            url=talk_url,
            title=title,
            originally_available_at=date,
            duration=duration,
            thumb=Callback(Thumb, url=thumb)))

    if len(listing) == 0:
        return MessageContainer("Empty", "This category is empty")
    return listing
####################################################################################################
def FrontPageList(name):
    """List the front-page topic filters; each entry opens FrontPageSort.

    ``name`` is used as the container's secondary title.
    """
    # (title, tag id) pairs; "All" passes None, i.e. no tag id.
    topics = (
        ("Technology", 20),
        ("Entertainment", 25),
        ("Design", 26),
        ("Business", 21),
        ("Science", 24),
        ("Global issues", 28),
        ("All", None),
    )
    menu = ObjectContainer(title2=name, view_group="List")
    for label, tag_id in topics:
        menu.add(DirectoryObject(key=Callback(FrontPageSort, name=label, id=tag_id), title=label))
    return menu
####################################################################################################
def FrontPageSort(name, id):
    """List the available orderings for one front-page topic.

    ``id`` is the topic's tag id, or None for "All", in which case the
    ``tagid`` query parameter is sent empty. Every entry opens GetTalks with
    TED_TALKS_FILTER filled in with that tag id and an ordering key.

    Fix: the "... beautiful" entry used to format the raw ``id`` rather than
    its string form, so for "All" it requested ``tagid=None``.
    """
    # (title, ordering key) in display order.
    orderings = (
        ("Newest releases", "NEWEST"),
        ("Most languages", "MOSTTRANSLATED"),
        ("Most emailed this week", "MOSTEMAILED"),
        ("Most comments this week", "MOSTDISCUSSED"),
        # NOTE(review): "JAW-DRAPPING" looks like a typo for "JAW-DROPPING",
        # but it is sent to the server verbatim and may be the key the site
        # expects -- confirm against the live endpoint before changing it.
        ("Rated jaw-dropping", "JAW-DRAPPING"),
        ("... persuasive", "PERSUASIVE"),
        ("... courageous", "COURAGEOUS"),
        ("... ingenious", "INGENIOUS"),
        ("... fascinating", "FASCINATING"),
        ("... inspiring", "INSPIRING"),
        ("... beautiful", "BEAUTIFUL"),
        ("... funny", "FUNNY"),
        ("... informative", "INFORMATIVE"),
    )

    # None means "no tag filter": leave the tagid parameter empty.
    tag_id = '' if id is None else str(id)

    oc = ObjectContainer(title2=name, view_group="List")
    for title, order_key in orderings:
        talks_url = TED_TALKS_FILTER % (tag_id, order_key)
        oc.add(DirectoryObject(key=Callback(GetTalks, name=title, url=talks_url), title=title))
    return oc
####################################################################################################
def ThemeList(name):
    """List the themes found on the TED_THEMES page.

    Every anchor inside ``div.box.themes`` that wraps an image becomes a
    directory opening Theme; its thumbnail is the image's ``src`` with the
    132x99 suffix swapped for the 291x218 one.
    """
    menu = ObjectContainer(title2=name, view_group="List")
    page = HTML.ElementFromURL(TED_THEMES)

    for anchor in page.xpath('//div[@class="box themes"]//a/img/parent::a'):
        # Best effort: any failure while reading or adding one theme just
        # skips that theme.
        try:
            label = anchor.get('title')
            theme_url = TED_BASE + anchor.get('href')
            image_src = anchor.xpath('./img')[0].get('src')
            thumb = image_src.replace('_132x99.jpg', '_291x218.jpg')
            menu.add(DirectoryObject(
                key=Callback(Theme, name=label, url=theme_url),
                title=label,
                thumb=Callback(Thumb, url=thumb)))
        except:
            pass

    return menu
####################################################################################################
def Theme(name, url):
    """List the talks of one theme, read from the theme page's RSS feed.

    The feed address is the ``href`` of the first ``<link rel="alternate">``
    on the page at ``url``. Any failure while locating or loading the feed
    returns an "Error" MessageContainer; a feed without items returns an
    "Empty" one.
    """
    listing = ObjectContainer(title2=name)

    try:
        feed_link = HTML.ElementFromURL(url).xpath('//link[@rel="alternate"]')[0]
        feed = XML.ElementFromURL(feed_link.get('href'), errors='ignore')
    except:
        return MessageContainer("Error", "The link for this entry appears to be broken")

    for entry in feed.xpath("//item"):
        title = entry.xpath('./title')[0].text
        talk_url = entry.xpath('./link')[0].text
        summary = String.StripTags(entry.xpath('./description')[0].text)
        date = Datetime.ParseDate(entry.xpath('./pubDate')[0].text).date()

        # The thumbnail is optional: fall back to None when it cannot be read.
        try:
            thumbnail = entry.xpath('./media:group/media:thumbnail', namespaces=MEDIA_NS)[0]
            thumb = thumbnail.get('url').replace('_132x99.jpg', '_291x218.jpg')
        except:
            thumb = None

        listing.add(VideoClipObject(
            url=talk_url,
            title=title,
            originally_available_at=date,
            summary=summary,
            thumb=Callback(Thumb, url=thumb)))

    if len(listing) == 0:
        return MessageContainer("Empty", "This category is empty")
    return listing
####################################################################################################
def TagsList(name):
    """List every tag linked from the TED_TAGS page.

    Each anchor under ``div#maincontent`` becomes a directory opening Tag,
    titled with the anchor text and pointing at its href made absolute.
    """
    menu = ObjectContainer(title2=name, view_group="List")
    for anchor in HTML.ElementFromURL(TED_TAGS).xpath('//div[@id="maincontent"]//a'):
        label = anchor.text
        tag_url = TED_BASE + anchor.get('href')
        menu.add(DirectoryObject(key=Callback(Tag, name=label, url=tag_url), title=label))
    return menu
####################################################################################################
def Tag(name, url):
    """List the talks on one page of a tag, with previous/next page entries.

    ``url`` is the tag page to show. Every ``dl.clearfix`` element is read as
    a talk (title and link from the first anchor under ``<dd>``, thumbnail
    from the "Talk image" ``<img>`` under ``<dt>``). "Previous Page" is placed
    before the talks and "Next Page" after them, each only when the page's
    pagination block contains the corresponding link.

    Fixes: a page without a pagination block used to raise inside a blanket
    ``try``/``except: pass`` and so lost all of its talks (single-page tags
    appeared empty); pagination is now optional. The page is also fetched
    once instead of twice, and a malformed entry is skipped on its own rather
    than silently aborting the rest of the list.

    Returns a MessageContainer instead when nothing could be listed.
    """
    oc = ObjectContainer(title2=name)
    current_page = HTML.ElementFromURL(url)

    pagers = current_page.xpath("//div[@class='pagination clearfix']")

    def pager_url(css_class):
        # Absolute URL of the pager link with this class, or None if absent.
        if len(pagers) == 0:
            return None
        links = pagers[0].xpath(".//a[@class='" + css_class + "']")
        if len(links) == 0 or not links[0].get('href'):
            return None
        return TED_BASE + links[0].get('href')

    previous_url = pager_url('previous')
    if previous_url:
        oc.add(DirectoryObject(key=Callback(Tag, name=name, url=previous_url), title="Previous Page"))

    for item in current_page.xpath("//dl[@class='clearfix']"):
        links = item.xpath('./dd//a')
        if len(links) == 0 or not links[0].get('href'):
            # Not a usable talk entry; skip just this one.
            continue
        title = links[0].text
        talk_url = TED_BASE + links[0].get('href')

        # The thumbnail is optional; Thumb() is passed None when it is missing.
        thumb = None
        images = item.xpath('./dt//img[@alt="Talk image"]')
        if len(images) > 0 and images[0].get('src'):
            thumb = images[0].get('src').replace('_160x120.jpg', '_291x218.jpg')

        # The tag page provides no summary or date for a talk.
        oc.add(VideoClipObject(
            url=talk_url,
            title=title,
            originally_available_at=None,
            summary=None,
            thumb=Callback(Thumb, url=thumb)))

    next_url = pager_url('next')
    if next_url:
        oc.add(DirectoryObject(key=Callback(Tag, name=name, url=next_url), title="Next Page"))

    if len(oc) == 0:
        return MessageContainer("Empty", "This category is empty")
    return oc
####################################################################################################
def GetTalks(name, url):
    """List the talks returned by a browse.json query.

    ``url`` is a filled-in TED_TALKS_FILTER address. The talks are read from
    the ``'main'`` member of the JSON response; each one supplies a title,
    post date, blurb, duration timecode, image base URL and link.

    Fixes: when the generic date parser fails, the "%b %Y" fallback now also
    yields a ``date`` (it used to hand back a ``datetime``, unlike the primary
    branch), and a value that neither parser accepts results in no date
    rather than an exception that aborts the whole listing.

    Returns a MessageContainer instead when the response holds no talks.
    """
    oc = ObjectContainer(title2=name)
    talks = JSON.ObjectFromURL(url)['main']

    for talk in talks:
        info = talks[str(talk)]
        title = info['tTitle']

        # Post date: try the generic parser first, then the "Mon YYYY" form.
        posted = info['talkpDate']
        try:
            date = Datetime.ParseDate(posted).date()
        except Exception:
            try:
                date = datetime.datetime.strptime(posted, "%b %Y").date()
            except (TypeError, ValueError):
                date = None

        # Lead the summary with the alternative title only when it differs.
        if info['altTitle'] != info['tTitle']:
            summary = String.StripTags(info['altTitle'] + '\n\n' + info['blurb'])
        else:
            summary = String.StripTags(info['blurb'])

        duration = CalculateDuration(info['talkDuration'])
        thumb = str(info['image']) + "_240x180.jpg"
        talk_url = TED_BASE + info['talkLink']

        oc.add(VideoClipObject(
            url=talk_url,
            title=title,
            originally_available_at=date,
            duration=duration,
            summary=summary,
            thumb=Callback(Thumb, url=thumb)))

    if len(oc) == 0:
        return MessageContainer("Empty", "This category is empty")
    return oc
####################################################################################################
def Photo(url):
    """Serve the image advertised by the page at ``url``.

    The image address is the ``href`` of the page's first
    ``<link rel="image_src">``; its bytes are returned as image/jpeg. Any
    failure along the way redirects to the default icon instead.
    """
    try:
        image_link = HTML.ElementFromURL(url).xpath('//link[@rel="image_src"]')[0]
        image_bytes = HTTP.Request(image_link.get('href'), cacheTime=CACHE_1MONTH).content
        return DataObject(image_bytes, 'image/jpeg')
    except:
        # Best effort: whatever went wrong, show the default icon.
        return Redirect(R(ICON_DEFAULT))
####################################################################################################
def Thumb(url):
    """Serve the image at ``url`` as image/jpeg, or the default icon.

    The default icon is used when ``url`` is empty/None or when the download
    fails for any reason.
    """
    if not url:
        return Redirect(R(ICON_DEFAULT))

    try:
        image_bytes = HTTP.Request(url, cacheTime=CACHE_1MONTH).content
        return DataObject(image_bytes, 'image/jpeg')
    except:
        # Best effort: whatever went wrong, show the default icon.
        return Redirect(R(ICON_DEFAULT))
####################################################################################################
def CalculateDuration(timecode):
    """Convert the first ``[h:]m:ss`` timecode found in ``timecode`` to ms.

    ``timecode`` may contain surrounding text; the first timecode in it is
    used. Returns 0 when ``timecode`` is empty/None or contains no timecode
    (previously this raised AttributeError on the failed match).

    An optional leading hours field is honoured, so "1:02:03" is one hour,
    two minutes and three seconds rather than "1:02". Minute counts of any
    length are accepted, so "105:12" is 105 minutes rather than "05:12".
    """
    if not timecode:
        return 0

    found = re.search(r'(?:([0-9]+):)?([0-9]+):([0-9]{2})', timecode)
    if found is None:
        return 0

    hours = int(found.group(1) or 0)  # group 1 is None for plain m:ss
    minutes = int(found.group(2))
    seconds = int(found.group(3))
    return ((hours * 60 + minutes) * 60 + seconds) * 1000
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment