# Created July 22, 2013
# This script scrapes Wikipedia's page listing all American TV shows and
# outputs the titles and the years to 2 separate files, each entry surrounded
# with <item> tags. Written for the sync adapter sample app introduced on my
# blog.
#!/usr/bin/env python
# Creates 2 files, TV show titles and years, each entry surrounded with <item> tags.
# Usage: python <script>.py
__author__ = 'Udi Cohen <>'
__license__ = "Apache 2.0"
__copyright__ = 'Copyright 2013 Udi Cohen'
from lxml import etree
from lxml.html import fromstring, tostring
import codecs
import re
import requests
# NOTE(review): the page address is empty in this copy of the script; it must
# be set to the Wikipedia list page to scrape before the script can run.
url = ''

# Matches any run of four digits; used to pick a year out of free text.
year_regexp = re.compile(r'\d\d\d\d')


def extract_name(show):
    """Return the text of the first, deepest descendant of *show*.

    Starting at the list item, keep stepping into the first child until an
    element without children is reached, and return that element's ``text``.
    The result is None when that leaf element carries no text.
    """
    node = show
    # len(element) / element[0] replace the deprecated getchildren() call.
    while len(node) > 0:
        node = node[0]
    return node.text


def extract_year(fragments):
    """Return a four-digit year found in *fragments*, or "0" if there is none.

    *fragments* is an iterable of strings (the list item's direct text
    nodes). Every fragment is searched for four-digit runs; when several
    fragments contain one, the first run of the LAST such fragment is
    returned, which is what the original loop computed (it never stopped at
    the first hit).
    """
    year = "0"
    for fragment in fragments:
        matches = year_regexp.findall(fragment)
        if matches:
            year = matches[0]
    return year


def main():
    """Scrape the show list and write the names and years files.

    Writes ``tvshows_name.txt`` and ``tvshows_years.txt`` in the current
    directory, UTF-8 encoded, with one ``<item>...</item>`` entry per show.
    Entry N of one file belongs to the same show as entry N of the other.
    """
    # Trick to overcome wikipedia restrictions: send a custom User-Agent
    read = requests.get(url, headers={'User-Agent': "Udinic Browser"}).text
    tree = etree.HTML(read)
    shows = tree.xpath("//*[@id=\"mw-content-text\"]/ul[*]/li[*]")

    # Context managers make sure both files are flushed and closed even if
    # parsing one of the items raises.
    with codecs.open('tvshows_name.txt', mode='wb', encoding='utf-8') as names, \
            codecs.open('tvshows_years.txt', mode='wb', encoding='utf-8') as years:
        for show in shows:
            # A leaf without text yields None; write an empty item instead of
            # skipping so the two files stay aligned line by line.
            name = extract_name(show) or ""
            year = extract_year(show.xpath("text()"))
            # NOTE(review): one entry per line is assumed; the header only
            # says each value is surrounded with <item> tags.
            names.write("<item>%s</item>\n" % name)
            years.write("<item>%s</item>\n" % year)


if __name__ == "__main__":
    main()
