Skip to content

Instantly share code, notes, and snippets.

@Udinic
Created July 22, 2013 01:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Udinic/6050766 to your computer and use it in GitHub Desktop.
Save Udinic/6050766 to your computer and use it in GitHub Desktop.
This script scrapes Wikipedia's page for all American TV shows, and outputs the titles and the years in 2 separate files, surrounded with <item> tags. Written for the sync adapter sample app introduced in my blog: http://www.udinic.com
#!/usr/bin/env python
"""
tv_shows_scraper.py
Creates 2 files, TV shows titles and year, surrounded with <item> tags.
Usage: python tv_shows_scraper.py
"""
__author__ = 'Udi Cohen <udi@udinic.com>'
__license__ = "Apache 2.0"
__copyright__ = 'Copyright 2013 Udi Cohen'
from lxml import etree
from lxml.html import fromstring, tostring
import codecs
import re
import requests
# Script body: download the Wikipedia list page, walk every list entry under
# the main content element, and write each show's title and year to two
# parallel files (one <item>...</item> line per show in each file).
url = 'http://en.wikipedia.org/wiki/List_of_American_television_series'

# Trick to overcome wikipedia restrictions: send an explicit User-Agent.
# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it
# were the show list.
response = requests.get(url, headers={'User-Agent': "Udinic Browser"},
                        timeout=30)
response.raise_for_status()
read = response.text

tree = etree.HTML(read)

# Every <li> inside a <ul> that is a direct child of the content element.
udi = tree.xpath("//*[@id=\"mw-content-text\"]/ul[*]/li[*]")

# First run of four consecutive digits in an entry's own text is the year.
year_regexp = re.compile(r'\d\d\d\d')

# NOTE(review): titles are written verbatim between the <item> tags; if the
# output is consumed as XML, characters such as '&' may need escaping -
# confirm against the consumer.
# The with-block guarantees both files are closed even if an entry raises.
with codecs.open('tvshows_name.txt', mode='wb', encoding='utf-8') as names, \
        codecs.open('tvshows_years.txt', mode='wb', encoding='utf-8') as years:
    for show in udi:
        # Descend through first children until a leaf element is reached;
        # the title is that leaf's text.
        name_fragments = show
        while len(name_fragments) > 0:
            name_fragments = name_fragments[0]
        name = name_fragments.text

        # A leaf without text would make the string concatenation below
        # fail; skip the entry in both files so they stay line-aligned.
        if name is None:
            continue

        # Default to "0" when no year is found in the entry's direct text
        # nodes (text inside child elements is not searched).
        year = "0"
        for split in show.xpath("text()"):
            year_match = year_regexp.search(split)
            if year_match:
                year = year_match.group(0)
                break

        names.write("<item>" + name + "</item>\n")
        years.write("<item>" + year + "</item>\n")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment