This script scrapes Wikipedia's page listing all American TV shows and outputs the titles and the years into two separate files, each entry surrounded with <item> tags. Written for the sync adapter sample app introduced in my blog: http://www.udinic.com
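For example, each output line holds one entry wrapped in <item> tags; the titles and years below are illustrative, not taken from the actual page:

tvshows_name.txt:
<item>Breaking Bad</item>
<item>The Wire</item>

tvshows_years.txt:
<item>2008</item>
<item>2002</item>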
#!/usr/bin/env python
"""
tv_shows_scraper.py

Creates two files, one with TV show titles and one with their years,
each entry surrounded with <item> tags.

Usage: python tv_shows_scraper.py
"""
__author__ = 'Udi Cohen <udi@udinic.com>'
__license__ = "Apache 2.0"
__copyright__ = 'Copyright 2013 Udi Cohen'

from lxml import etree
import codecs
import re
import requests
url = 'http://en.wikipedia.org/wiki/List_of_American_television_series'

# Set a custom User-Agent to keep Wikipedia from rejecting scripted requests
read = requests.get(url, headers={'User-Agent': "Udinic Browser"}).text
tree = etree.HTML(read)

# Every list item in the article body is one show entry
shows = tree.xpath('//*[@id="mw-content-text"]/ul[*]/li[*]')

names = codecs.open('tvshows_name.txt', mode='wb', encoding='utf-8')
years = codecs.open('tvshows_years.txt', mode='wb', encoding='utf-8')

# Matches a 4-digit year
year_regexp = re.compile(r'\d{4}')
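# An assumption about the page markup as of 2013: each <li> looks roughly
# like <li><i><a>Title</a></i> (2008-2013)</li>, so the title sits in the
# innermost child element and the years in the <li>'s own text nodes.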
for show in shows:
    # Walk down to the innermost first child to reach the title text
    name_fragments = show
    while len(name_fragments) > 0:
        name_fragments = name_fragments[0]
    name = name_fragments.text

    # Take the first 4-digit number found in the <li>'s direct text nodes
    year = "0"
    for split in show.xpath("text()"):
        year_patterns = year_regexp.findall(split)
        if len(year_patterns) > 0:
            year = year_patterns[0]
            break

    names.write("<item>" + name + "</item>\n")
    years.write("<item>" + year + "</item>\n")
names.close()
years.close()
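Running the script writes tvshows_name.txt and tvshows_years.txt to the current working directory. The year defaults to "0" when no 4-digit number is found in an entry, so the two files always stay line-aligned with each other.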