This script scrapes Wikipedia's page listing all American TV shows and outputs the titles and the years into two separate files, each entry surrounded with <item> tags. Written for the sync adapter sample app introduced in my blog: http://www.udinic.com
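For example, each output line holds one entry wrapped in <item> tags; the titles and years below are illustrative, not taken from the actual page:

tvshows_name.txt:
<item>Breaking Bad</item>
<item>The Wire</item>

tvshows_years.txt:
<item>2008</item>
<item>2002</item>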
#!/usr/bin/env python
"""
tv_shows_scraper.py

Creates two files, one with TV show titles and one with their years,
each entry surrounded with <item> tags.

Usage: python tv_shows_scraper.py
"""
__author__ = 'Udi Cohen <udi@udinic.com>'
__license__ = "Apache 2.0"
__copyright__ = 'Copyright 2013 Udi Cohen'

from lxml import etree
import codecs
import re
import requests
url = 'http://en.wikipedia.org/wiki/List_of_American_television_series'

# Set a custom User-Agent to keep Wikipedia from rejecting scripted requests
read = requests.get(url, headers={'User-Agent': "Udinic Browser"}).text
tree = etree.HTML(read)

# Every list item in the article body is one show entry
shows = tree.xpath('//*[@id="mw-content-text"]/ul[*]/li[*]')

names = codecs.open('tvshows_name.txt', mode='wb', encoding='utf-8')
years = codecs.open('tvshows_years.txt', mode='wb', encoding='utf-8')

# Matches a 4-digit year
year_regexp = re.compile(r'\d{4}')
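# An assumption about the page markup as of 2013: each <li> looks roughly
# like <li><i><a>Title</a></i> (2008-2013)</li>, so the title sits in the
# innermost child element and the years in the <li>'s own text nodes.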
for show in shows:
    # Walk down to the innermost first child to reach the title text
    name_fragments = show
    while len(name_fragments) > 0:
        name_fragments = name_fragments[0]
    name = name_fragments.text

    # Take the first 4-digit number found in the <li>'s direct text nodes
    year = "0"
    for split in show.xpath("text()"):
        year_patterns = year_regexp.findall(split)
        if len(year_patterns) > 0:
            year = year_patterns[0]
            break

    names.write("<item>" + name + "</item>\n")
    years.write("<item>" + year + "</item>\n")
names.close()
years.close()
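Running the script writes tvshows_name.txt and tvshows_years.txt to the current working directory. The year defaults to "0" when no 4-digit number is found in an entry, so the two files always stay line-aligned with each other.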