This script scrapes Wikipedia's page listing all American TV shows, and outputs the titles and the years in 2 separate files, surrounded with <item> tags. Written for the sync adapter sample app introduced in my blog: http://www.udinic.com
#!/usr/bin/env python
"""
tv_shows_scraper.py
Creates 2 files, TV show titles and years, surrounded with <item> tags.
Usage: python tv_shows_scraper.py
"""
__author__ = 'Udi Cohen <udi@udinic.com>'
__license__ = "Apache 2.0"
__copyright__ = 'Copyright 2013 Udi Cohen'
from lxml import etree
from lxml.html import fromstring, tostring
import codecs
import re
import requests
url = 'http://en.wikipedia.org/wiki/List_of_American_television_series'
# Trick to overcome wikipedia restrictions
read = requests.get(url, headers={'User-Agent' : "Udinic Browser"}).text
tree = etree.HTML(read)
udi = tree.xpath("//*[@id=\"mw-content-text\"]/ul[*]/li[*]")
names = codecs.open('tvshows_name.txt', mode='wb', encoding='utf-8')
years = codecs.open('tvshows_years.txt', mode='wb', encoding='utf-8')
# Years appear on the page as 4-digit numbers; we only need the first match per entry.
year_regexp = re.compile(r'\d\d\d\d')
for show in udi:
    # The title is the text of the innermost first child of the <li> (typically the title link).
    name_fragments = show
    while len(name_fragments.getchildren()) > 0:
        name_fragments = name_fragments.getchildren()[0]
    name = name_fragments.text

    # The years live in the <li>'s own text (e.g. a span like 1994-2004); take the first 4-digit match.
    year = "0"
    for split in show.xpath("text()"):
        year_patterns = year_regexp.findall(split)
        if len(year_patterns) > 0:
            year = year_patterns[0]
            break

    names.write("<item>" + name + "</item>\n")
    years.write("<item>" + year + "</item>\n")
names.close()
years.close()
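
For a quick sanity check of the output (the sync adapter sample app is the real consumer), a minimal standalone sketch like the one below reads the two files the script just wrote and pairs each title with its year. Only the file names come from the script above; the <item> parsing regex and the read_items helper are just illustrative.

import codecs
import re

item_regexp = re.compile(r'<item>(.*)</item>')

def read_items(path):
    # Pull the text out of each <item>...</item> line, skipping anything malformed.
    with codecs.open(path, mode='r', encoding='utf-8') as f:
        return [m.group(1) for m in (item_regexp.search(line) for line in f) if m]

names = read_items('tvshows_name.txt')
years = read_items('tvshows_years.txt')

# The scraper writes both files in the same order, so zip() pairs title with year.
for name, year in zip(names, years):
    print("%s (%s)" % (name, year))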