# jmoggr/Metacritic_Scraper_ProofOfConcept.py

## Metacritic_Scraper_ProofOfConcept.py
import lxml.html
import urllib.request

from urllib.error import URLError

# Header set passed to every urllib Request built by get_page().
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}

# Site root, and the TV search template whose "{}" is filled with the
# percent-encoded show title via str.format().
metacritic_url = "http://metacritic.com"
metacritic_tv_search_url = "http://metacritic.com/search/tv/{}/results"

# Show titles looked up when the file is run as a script.
tv_shows = [
    "Firefly",
    "Game of Thrones",
    "Silicon Valley",
    "Doctor Who",
    "Better Off Ted",
]

def get_page(url):
    """Fetch *url* and return the raw response body.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Absolute URL to request.

    Returns:
        The response body as ``bytes``, or ``None`` when opening the URL
        raised ``URLError`` (the error is printed before returning).
    """
    req = urllib.request.Request(url, None, headers)

    try:
        response = urllib.request.urlopen(req)
    except URLError as e:
        print(e)
        return None

    # Close the connection once the body has been read, even if read() raises.
    with response:
        return response.read()

if __name__ == "__main__":
    # Local import keeps this block self-contained; quote() percent-encodes
    # spaces as %20 (as before) and any other reserved characters in a title.
    from urllib.parse import quote

    for tv_show in tv_shows:
        url = metacritic_tv_search_url.format(quote(tv_show, safe=''))
        content = get_page(url)

        if content is None:
            print("No page: " + tv_show + " | " + url)
            continue

        # From the results page, get a link to the specific page for the first result
        # The link is found using a unique path through the html
        document_tree = lxml.html.document_fromstring(content)
        links = document_tree.xpath("//ul[@class='search_results module']/li[@class='result first_result']//a/@href")

        # xpath() yields a list of matches; indexing an empty list would raise
        # IndexError, so test for emptiness before taking the first entry.
        if not links:
            print("No link: " + tv_show + " | " + url)
            continue

        url = metacritic_url + links[0]
        content = get_page(url)

        # get_page() returns None on a failed request; do not try to parse that.
        if content is None:
            print("No page: " + tv_show + " | " + url)
            continue

        # From the specific page get the rating from a unique path through the html
        document_tree = lxml.html.document_fromstring(content)
        ratings = document_tree.xpath("//*[@itemprop='ratingValue']/text()")

        if not ratings:
            print("No rating: " + tv_show + " | " + url)
            continue

        print(tv_show + ": " + ratings[0])
	import lxml.html
	import urllib.request

	from urllib.error import URLError

	# Header set passed to every urllib Request built by get_page().
	user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
	headers = {"User-Agent": user_agent}

	# Site root, and the TV search template whose "{}" is filled with the
	# percent-encoded show title via str.format().
	metacritic_url = "http://metacritic.com"
	metacritic_tv_search_url = "http://metacritic.com/search/tv/{}/results"

	# Show titles looked up when the file is run as a script.
	tv_shows = [
		"Firefly",
		"Game of Thrones",
		"Silicon Valley",
		"Doctor Who",
		"Better Off Ted",
	]

	def get_page(url):
		"""Fetch *url* and return the raw response body.

		The request is sent with the ``headers`` dict defined alongside
		this function.

		Args:
			url: Absolute URL to request.

		Returns:
			The response body as ``bytes``, or ``None`` when opening the
			URL raised ``URLError`` (the error is printed before returning).
		"""
		req = urllib.request.Request(url, None, headers)

		try:
			response = urllib.request.urlopen(req)
		except URLError as e:
			print(e)
			return None

		# Close the connection once the body has been read, even if read() raises.
		with response:
			return response.read()

	if __name__ == "__main__":
		# Local import keeps this block self-contained; quote() percent-encodes
		# spaces as %20 (as before) and any other reserved characters in a title.
		from urllib.parse import quote

		for tv_show in tv_shows:
			url = metacritic_tv_search_url.format(quote(tv_show, safe=''))
			content = get_page(url)

			if content is None:
				print("No page: " + tv_show + " | " + url)
				continue

			# From the results page, get a link to the specific page for the first result
			# The link is found using a unique path through the html
			document_tree = lxml.html.document_fromstring(content)
			links = document_tree.xpath("//ul[@class='search_results module']/li[@class='result first_result']//a/@href")

			# xpath() yields a list of matches; indexing an empty list would raise
			# IndexError, so test for emptiness before taking the first entry.
			if not links:
				print("No link: " + tv_show + " | " + url)
				continue

			url = metacritic_url + links[0]
			content = get_page(url)

			# get_page() returns None on a failed request; do not try to parse that.
			if content is None:
				print("No page: " + tv_show + " | " + url)
				continue

			# From the specific page get the rating from a unique path through the html
			document_tree = lxml.html.document_fromstring(content)
			ratings = document_tree.xpath("//*[@itemprop='ratingValue']/text()")

			if not ratings:
				print("No rating: " + tv_show + " | " + url)
				continue

			print(tv_show + ": " + ratings[0])