Skip to content

Instantly share code, notes, and snippets.

@jmoggr
Last active December 9, 2015 20:19
Show Gist options
  • Save jmoggr/532ee35e6e07b78c6e8f to your computer and use it in GitHub Desktop.
Save jmoggr/532ee35e6e07b78c6e8f to your computer and use it in GitHub Desktop.
Metacritic_Scraper_ProofOfConcept
import lxml.html
import urllib.request
from urllib.error import URLError
# Browser-style User-Agent string, sent as a header with every request.
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = {'User-Agent': user_agent}

# Site root, and the TV search URL template whose "{}" takes the show title.
metacritic_url = "http://metacritic.com"
metacritic_tv_search_url = metacritic_url + "/search/tv/{}/results"

# Show titles to look up.
tv_shows = [
    "Firefly",
    "Game of Thrones",
    "Silicon Valley",
    "Doctor Who",
    "Better Off Ted",
]
def get_page(url):
    """Fetch *url* and return the raw response body.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: Absolute URL to request.

    Returns:
        The response body as ``bytes``, or ``None`` if opening the URL
        raised ``URLError`` (the error is printed before returning).
    """
    req = urllib.request.Request(url, None, headers)
    try:
        response = urllib.request.urlopen(req)
    except URLError as e:
        print(e)
        return None
    # Read inside a context manager so the connection is always closed,
    # even if read() itself raises.
    with response:
        return response.read()
if __name__ == "__main__":
    # Imported here so this script block brings its own dependency into scope.
    from urllib.parse import quote

    # For each show: search Metacritic, follow the first result, print its rating.
    for tv_show in tv_shows:
        # Percent-encode the whole title (not just spaces) so characters such
        # as '&', '?' or '/' cannot corrupt the search URL path.
        url = metacritic_tv_search_url.format(quote(tv_show, safe=''))
        content = get_page(url)
        # get_page returns None on failure; an empty body cannot be parsed either.
        if not content:
            print("No page: " + tv_show + " | " + url)
            continue

        # From the results page, get a link to the specific page for the first result
        # The link is found using a unique path through the html
        document_tree = lxml.html.document_fromstring(content)
        links = document_tree.xpath("//ul[@class='search_results module']/li[@class='result first_result']//a/@href")
        # xpath() returns a list; check for emptiness before indexing so a
        # search with no results is reported instead of raising IndexError.
        if not links:
            print("No link: " + tv_show + " | " + url)
            continue

        url = metacritic_url + links[0]
        content = get_page(url)
        # The show page can fail to load too; skip it rather than parsing None.
        if not content:
            print("No page: " + tv_show + " | " + url)
            continue

        # From the specific page get the rating from a unique path through the html
        document_tree = lxml.html.document_fromstring(content)
        ratings = document_tree.xpath("//*[@itemprop='ratingValue']/text()")
        if not ratings:
            print("No rating: " + tv_show + " | " + url)
            continue

        print(tv_show + ": " + ratings[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment