Skip to content

Instantly share code, notes, and snippets.

@csytan
Created May 11, 2009 06:39
Show Gist options
  • Save csytan/109890 to your computer and use it in GitHub Desktop.
Save csytan/109890 to your computer and use it in GitHub Desktop.
imdb.py
from lib import BeautifulSoup
import urllib2
from urlparse import urljoin
import difflib
import re
def get_title_name(self):
    """
    Fetch this title's IMDb page and return the text of its <title> tag.

    Reads ``self.title_id`` and appends it to ``http://imdb.com/title/``
    to build the page URL.

    Returns the ``.string`` of the first ``<title>`` element, or ``None``
    when the page contains no ``<title>`` element. Errors raised by
    ``urllib2.urlopen`` are not caught here and propagate to the caller.
    """
    response = urllib2.urlopen("http://imdb.com/title/" + self.title_id)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    title_tags = BeautifulSoup.BeautifulSoup(html)('title')
    if not title_tags:
        # No <title> on the page: report "not found" instead of IndexError.
        return None
    return title_tags[0].string
def get_imdb_rating(self):
    """
    Fetch this title's IMDb page and return its user rating as a float.

    Reads ``self.title_id`` and appends it to ``http://imdb.com/title/``
    to build the page URL. The rating is taken from the first ``<b>``
    inside a ``<div class="general rating">`` whose first child is not
    the ``'User Rating:'`` label; that text is expected to look like
    ``"7.1/10"``.

    Returns the number before the slash as a float, or ``None`` when no
    such element is found. Errors raised by ``urllib2.urlopen`` are not
    caught here; ``float()`` raises ``ValueError`` if the text before the
    slash is not numeric.
    """
    response = urllib2.urlopen("http://imdb.com/title/" + self.title_id)
    try:
        html = response.read()
    finally:
        # Release the connection even if the read fails.
        response.close()
    soup = BeautifulSoup.BeautifulSoup(html)
    for div in soup('div'):
        if dict(div.attrs).get('class') != 'general rating':
            continue
        for b in div('b'):
            if not b.contents:
                # Empty <b></b>: nothing to parse.
                continue
            text = b.contents[0]
            if text == 'User Rating:':
                # This <b> is the label, not the value.
                continue
            # Take everything before the slash. rstrip("/10") must not be
            # used here: it strips any trailing '/', '1' or '0' characters,
            # turning "7.1/10" into "7." and "10/10" into "".
            return float(text.split("/", 1)[0])
    return None
def search_imdb(self, search_string):
    """
    Search IMDb's title search and return the id of the best match.

    If IMDb redirects straight to a title page (exact match), the
    ``tt...`` id is taken from the redirected URL. Otherwise every
    ``<a>`` on the results page that has an ``href`` containing
    ``/title/`` and no ``onclick`` attribute is compared with
    ``search_string`` using ``difflib.SequenceMatcher``; the link with
    the highest ratio wins, provided that ratio is above 0.80.

    Returns the matched id string, or ``None`` if the search page could
    not be opened or no link is similar enough.
    """
    import urllib  # local import: only needed for quote_plus below

    # Percent-encode the query so characters such as '&', '#' or '+' in a
    # title cannot corrupt the URL. quote_plus still maps spaces to '+'.
    # Unicode input is encoded to UTF-8 first because quote_plus works on
    # byte strings.
    if isinstance(search_string, unicode):
        raw_query = search_string.encode("utf-8")
    else:
        raw_query = search_string
    url = "http://imdb.com/find?s=tt&q=" + urllib.quote_plus(raw_query)

    try:
        page = urllib2.urlopen(url)
    except Exception:
        # Best-effort lookup: any failure to open the page means "no
        # result". Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit through. The URL is printed because `page` is
        # not bound when urlopen fails.
        print("Could not open %s" % url)
        return None

    try:
        # if redirected to imdb page (exact match)
        if 'imdb.com/title/' in page.url:
            match = re.search(r"imdb\.com/title/(tt\d+)/?", page.url)
            if match:
                return match.group(1)
            # No tt-id in the URL: fall through and parse the page.
        html = page.read()
    finally:
        # Release the connection on every path, including early return.
        page.close()

    soup = BeautifulSoup.BeautifulSoup(html)
    matcher = difflib.SequenceMatcher()
    best_ratio = 0
    best_title = None
    for link in soup('a'):
        if 'href' not in dict(link.attrs) or link.has_key('onclick'):
            continue
        href = link['href']
        if "/title/" not in href or not link.contents:
            # Not a title link, or an empty anchor with no text to compare.
            continue
        matcher.set_seqs(search_string, link.contents[0])
        ratio = matcher.ratio()
        if ratio > best_ratio:
            best_ratio = ratio
            best_title = href.replace('/title/', '').replace('/', '')

    if best_ratio > 0.80:
        return best_title
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment