Skip to content

Instantly share code, notes, and snippets.

Last active December 19, 2015 21:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tfmorris/6022576 to your computer and use it in GitHub Desktop.
Save tfmorris/6022576 to your computer and use it in GitHub Desktop.
BBC Desert Island Discs scraper for the current/old ScraperWiki platform. Until ScraperWiki shuts down, the original is hosted there (the link is missing from this copy).
# Scrape BBC Desert Island Discs data including songs, books, and luxury item, if available, for the celebrity "castaways"
# based on original work by Francis Irving with the following changes by Tom Morris July 2012:
# - updated to current BBC page format
# - switched from BeautifulSoup to lxml
# - updated deprecated database calls
# - restructured to run as a single integrated process and not rescrape data it already extracted
import scraperwiki
import scraperwiki.apiwrapper
import lxml.html
from datetime import datetime
# NOTE(review): SITE is an empty string in this copy, which makes BASE a bare
# path rather than a fetchable URL. Presumably it should hold the scheme and
# host of the BBC site - confirm against the original scraper.
SITE = ''
BASE = SITE + '/radio4/features/desert-island-discs/find-a-castaway'

# `past` holds the (date, guest) pairs that already have a 'url' record in the
# datastore. process_guest() writes that record last, so its presence marks a
# broadcast as completely scraped and lets later runs skip it.
if scraperwiki.sqlite.show_tables():
    # scraperwiki.sqlite.select() takes the query text that follows "SELECT".
    past = [(row['date'], row['guest'])
            for row in scraperwiki.sqlite.select("* from swdata WHERE type == 'url'")]
else:
    # No tables yet (first run): nothing has been scraped.
    past = []
print('Database contains %d past entries' % len(past))
def process_guest(date, name, occupation, url):
# Scrape one castaway broadcast: fetch the detail page at `url`, then build
# records for each track choice, the book and the luxury item.
#
# Parameters:
#   date       - broadcast date; process_index_page() passes it as 'YYYY-MM-DD'
#   name       - guest name as shown on the index page
#   occupation - not referenced in the lines visible here
#   url        - detail page URL; the text after its last '#' is used as the
#                id of the broadcast <div>
# Returns False when (date, name) is already in `past`, otherwise True at the end.
#
# NOTE(review): this copy has lost its indentation and appears to be missing
# fragments (record contents, the calls that persist `rec`, some else branches
# and selector strings). The notes below mark each spot; confirm against the
# original scraper before relying on this function.
if (date,name) in past:
# print 'Skipping %s %s' % (date,name)
return False
html = scraperwiki.scrape(url).decode("utf-8")
root = lxml.html.fromstring(html)
intro = root.cssselect('div#castaway_intro h1')
# Check for unexpected page format
# NOTE(review): cssselect() returns a list, so `intro == None` is presumably
# never true; an empty-list check may have been intended - confirm.
if intro == None:
print "skipping, no <div id='castaway_intro'>, page format has changed? ",url
# Denormalized schema, but that's a little easier for consumers
# Old schema - Pass1: date date_scraped url guest
# Pass2: date_scraped guest title url date type performer
# old record types: record keep_record book luxury
# NOTE(review): the record literal below has no value after 'date_scraped' and
# the closing brace is followed by the tail of a call (a key list and `rec`).
# Presumably a datastore save keyed on date/guest/type/title - confirm.
rec = {'date_scraped' :,
}["date", "guest", "type", "title"], rec)
castaway = intro[0].text_content()
# Compare the name on the detail page with the one from the index page.
if not castaway == name:
print 'Mismatched names between index (%s) and detail page (%s)' % (name,castaway)
# NOTE(review): same truncation as above - only the 'date' field of this record
# and the tail of the consuming call are visible.
rec = {'date':date,
}["date", "guest", "type", "title"], rec)
# TODO It would be more efficient to only fetch the page once for all broadcasts,
# but we sacrifice a small amount of efficiency for the rare case to better fit with our control flow
broadcast_id = url.split('#')[-1]
broadcast = root.cssselect('div#'+broadcast_id)
# The first broadcast doesn't appear to be tagged with an ID (even though the URL references it)
# so default to using the first broadcast that we find
# NOTE(review): the fallback lookup described above is not visible here; as
# shown, an empty `broadcast` list reaches `broadcast[0]` - confirm.
if not broadcast:
#print 'Failed to find broadcast by ID. Using default'
broadcast = broadcast[0]
# Track choices
for choice in broadcast.cssselect('div.castaway-choice'):
text = choice.cssselect('div.text')[0]
# `num` is read but not used in the lines visible here.
num = text.cssselect('p.number')[0].text_content()
#print lxml.html.tostring(text)
# Sanity check number?
keep = text.cssselect('p.track_keep') # Only present if it's their favorite track
artist = text.cssselect('h4')[0].text_content()
# extract artist musicbrainz id if available
link = text.cssselect('h4 a') # need to parse link attribute url
# The id is taken as the last path segment of the link's href.
# NOTE(review): presumably an `else:` preceded `mb_id = None` - confirm.
if link:
mb_id = link[0].attrib['href'].split('/')[-1]
mb_id = None
track = text.cssselect('p.track_choice')[0].text_content()
composer = text.cssselect('p.composer')[0].text_content() # not necessarily the composer
principal = 'artist'
# The p.composer text may carry a 'Composer: ' or an 'Artist: ' prefix; the
# prefix is stripped and the values are assigned to composer/artist accordingly.
# NOTE(review): the branch structure (else clauses) is not recoverable from
# this copy - confirm which assignments belong to which branch.
if composer:
if composer.startswith('Composer: '):
composer = composer.split('Composer: ')[1]
principal = 'composer'
tmp = composer
composer = artist
if tmp.startswith('Artist: '):
artist = tmp.split('Artist: ')[1]
artist = tmp
# Record type is 'record_keep' when the p.track_keep marker is present.
rec.update({'type': 'record_keep' if keep else 'record',
'title' : track,
'performer' : artist,
'composer' : composer,
'principal' : principal,
'mb_id' : mb_id,
})["date", "guest", "type", "title"], rec)
# Clear music specific fields
# NOTE(review): the closing `})` of this update is not visible.
rec.update({'performer' : None,
'composer' : None,
'principal' : None,
'mb_id' : None,
# NOTE(review): the selector string is empty here; presumably it named the
# book element - confirm.
book = broadcast.cssselect('')
if book:
title = book[0].cssselect('h5.book_choice')[0].text_content()
rec.update({'type': 'book',
'title' : title,
})["date", "guest", "type", "title"], rec)
# NOTE(review): the selector string is empty here as well; presumably it named
# the luxury item element - confirm.
luxury = broadcast.cssselect('')
if luxury:
item = luxury[0].cssselect('h5.luxury_item_choice')[0].text_content()
rec.update({'type': 'luxury',
'title' : item,
})["date", "guest", "type", "title"], rec)
# URL record must be written last because it's the key we use to determine record is complete
# NOTE(review): record contents and the consuming call are truncated here too.
rec = {'date':date,
}["date", "guest", "type", "title"], rec)
return True
def process_index_page(pg):
    """Process every castaway entry found on one parsed index page.

    For each entry, the guest link, broadcast date and occupation are read
    from the entry's ``div.text`` element and passed to ``process_guest``.
    An entry that lacks the text div, the guest link or the date is reported
    and skipped rather than aborting the whole page.

    Args:
        pg: parsed HTML tree of an index page (anything with ``cssselect``).

    Returns:
        The number of entries for which ``process_guest`` returned a true
        value, i.e. broadcasts newly processed on this run.
    """
    # NOTE(review): this selector, and the one used for the broadcast date
    # below, are empty strings in this copy. They are kept exactly as found;
    # confirm the intended selectors against the original scraper.
    items = pg.cssselect('')
    count = 0
    for item in items:
        text = item.cssselect('div.text')
        if not text:
            print('Unable to process item - no text div')
            continue  # nothing to index into; move on to the next entry
        text = text[0]

        guest = text.cssselect('h4 a')
        if not guest:
            print('Unabled to find guest name')
            continue
        guest = guest[0]
        guest_url = SITE + guest.attrib['href']
        guest_name = guest.text_content()

        date = text.cssselect('')
        if not date:
            print('Unable to find broadcast date for guest "%s"' % guest_name)
            continue
        # The date is the second '|'-separated field of the element's text.
        date = date[0].text_content().split('|')[1].strip()
        # Convert date to ISO format
        date = datetime.strptime(date, '%d %b %Y').strftime('%Y-%m-%d')

        # The occupation, when present, is the second <p> of the text div.
        paragraphs = text.cssselect('p')
        if len(paragraphs) > 1:
            occupation = paragraphs[1].text_content()
        else:
            occupation = ''

        if process_guest(date, guest_name, occupation, guest_url):
            count += 1
    print('Processed %d of %d shows' % (count, len(items)))
    return count
def fetch_index_page(page_num):
    """Fetch and parse one numbered page of the castaway index.

    Args:
        page_num: index page number; the page is requested from
            ``BASE + '/page/' + str(page_num)``.

    Returns:
        The page parsed with ``lxml.html.fromstring``.
    """
    print('Fetching index page %d' % page_num)
    # Decode explicitly as UTF-8, as main() and process_guest() do for the
    # pages they fetch, so lxml is handed text instead of raw bytes whose
    # encoding it would have to guess.
    page_html = scraperwiki.scrape(BASE + '/page/' + str(page_num)).decode("utf-8")
    return lxml.html.fromstring(page_html)
def main():
    """Scrape the whole castaway index, starting from its first page.

    The first index page supplies the episode total and the number of index
    pages; it is processed directly, after which pages 2 through the last
    are fetched and processed in order. Progress and the final count of newly
    processed entries are printed.
    """
    first_page = lxml.html.fromstring(scraperwiki.scrape(BASE).decode("utf-8"))

    # The leading word of the "search found" text is the episode total.
    found_text = first_page.cssselect('p#search-found span')[0].text_content()
    episode_count = int(found_text.split(' ')[0])
    print('%d total episodes' % episode_count)

    # The page count is read from the second-to-last pager link.
    pager_links = first_page.cssselect('ul.pages li a')
    last_index_page = int(pager_links[-2].text_content())
    print('%d index pages' % last_index_page)

    # Page 1 is already in hand, so fetching starts at page 2.
    count = process_index_page(first_page)
    count += sum(process_index_page(fetch_index_page(number))
                 for number in range(2, last_index_page + 1))
    print('Processed %d new entries' % count)
def test():
    """Manual smoke test: repeat appearances, then a late index page.

    Runs ``process_guest`` for two broadcasts by the same guest and prints
    each result, then fetches index page 96.

    NOTE(review): the URL argument of both ``process_guest`` calls is an empty
    string in this copy, so there is no detail page to fetch - confirm the
    intended castaway URLs.
    """
    # Test multiple appearances
    appearances = (
        ('1980-12-20', 'Arthur Askey', 'Comedian, Music hall', ''),
        ('1942-04-02', 'Arthur Askey', 'Comedian', ''),
    )
    for appearance in appearances:
        print(process_guest(*appearance))

    # Test index pages without dates
    page = fetch_index_page(96)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment