Skip to content

Instantly share code, notes, and snippets.

@jayrambhia
Last active April 15, 2023 07:57
Show Gist options
  • Star 8 You must be signed in to star a gist
  • Fork 6 You must be signed in to fork a gist
  • Save jayrambhia/1678382 to your computer and use it in GitHub Desktop.
Save jayrambhia/1678382 to your computer and use it in GitHub Desktop.
Fetch movie information from IMDB using Python!
'''
Author : Jay Rambhia
Git : https://github.com/jayrambhia
gist : https://gist.github.com/jayrambhia
'''
import urllib2
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import re
def getunicode(soup):
    """Return the text contained in ``soup`` with a few HTML entities decoded.

    ``soup`` is either a text node (a unicode string, which includes string
    subclasses) or an object exposing a ``contents`` sequence of child
    nodes; children are walked recursively and their text concatenated in
    order.

    Args:
        soup: A unicode string, an object with a ``contents`` attribute, or
            ``None`` (the value a failed ``find`` lookup yields).

    Returns:
        The text of ``soup`` and all of its descendants, with ``&#39;``,
        ``&quot;`` and ``&nbsp;`` replaced by ``'``, ``"`` and a plain space.
        An empty string when ``soup`` is ``None`` or has no children.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` so the helper
    # also works on interpreters where text is unicode by default.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # A lookup that matched nothing hands us None; treat it as "no text"
    # instead of failing on the ``contents`` attribute access below.
    if soup is None:
        return ''

    if isinstance(soup, text_type):
        # The parser leaves these entities undecoded in text nodes.
        for entity, char in (('&#39;', "'"), ('&quot;', '"'), ('&nbsp;', ' ')):
            soup = soup.replace(entity, char)
        return soup

    children = soup.contents
    if not children:
        return ''
    return ''.join(getunicode(child) for child in children)
def main(proxies=None):
    """Prompt for a movie name and print details scraped from IMDb.

    Builds an IMDb search URL from the entered name, follows the first link
    on the results page whose URL matches ``/title/tt``, parses that page and
    prints the title, rating, release date, certificate, genres, actors and
    description.

    Args:
        proxies: Optional mapping of URL scheme to proxy URL, for example
            ``{'http': 'http://user:password@host:3128'}``. It is passed to
            ``Browser.set_proxies``; when omitted, no proxy is configured
            explicitly.

    NOTE(review): a reader of this script reported that IMDb's robots.txt
    disallows ``/find``; if the browser object honours robots.txt the
    ``open`` call below may be refused -- confirm before relying on this.
    """
    # Function-local import: only this block needs quote_plus (Python 2 path).
    from urllib import quote_plus

    def text_of(node, default=''):
        # ``find`` yields None when the page lacks the element.
        return getunicode(node) if node is not None else default

    movie = raw_input('Movie Name: ')
    # Collapse whitespace runs, then percent-encode so characters such as
    # '&' or '#' in a title cannot corrupt the query string.
    movie_search = quote_plus(' '.join(movie.split()))
    url = 'http://www.imdb.com/find?q=' + movie_search + '&s=all'

    br = Browser()
    # Configure a proxy only when one is really supplied. The former
    # placeholder ('http://username:password@proxy:port') made every request
    # fail with "InvalidURL: nonnumeric port: 'port'".
    if proxies:
        br.set_proxies(proxies)
    br.open(url)
    link = br.find_link(url_regex=re.compile(r'/title/tt.*'))
    res = br.follow_link(link)
    soup = BeautifulSoup(res.read())

    movie_title = text_of(soup.find('title'))
    rating = text_of(soup.find('span', itemprop='ratingValue'))
    rating_text = rating + '/10.0' if rating else 'N/A'
    actors = [getunicode(tag) for tag in soup.findAll('a', itemprop='actors')]

    meta = soup.find('meta', {'name': 'description'})
    des = meta['content'] if meta is not None else ''

    rated = 'N/A'
    release_date = 'N/A'
    genre = []
    infobar = soup.find('div', {'class': 'infobar'})
    if infobar is not None:
        rated_tag = infobar.find('', {'title': True})
        if rated_tag is not None:
            rated = rated_tag['title']
        links = infobar.findAll('a', {'href': True})
        if links:
            # Every link but the last is treated as a genre; the last one
            # is treated as the release date.
            genre = [getunicode(tag) for tag in links[:-1]]
            release_date = getunicode(links[-1])

    print('%s %s' % (movie_title, rating_text))
    print('Release Date: %s' % release_date)
    print('Rated %s' % rated)
    print('')
    print('Genre: ' + ', '.join(genre))
    print('\nActors: ' + ', '.join(actors))
    print('\nDescription:')
    print(des)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
'''
Author : Jay Rambhia
Git : https://github.com/jayrambhia
gist : https://gist.github.com/jayrambhia
'''
import urllib2
from BeautifulSoup import BeautifulSoup
from mechanize import Browser
import re
def getunicode(soup):
    """Return the text contained in ``soup`` with a few HTML entities decoded.

    ``soup`` is either a text node (a unicode string, which includes string
    subclasses) or an object exposing a ``contents`` sequence of child
    nodes; children are walked recursively and their text concatenated in
    order.

    Args:
        soup: A unicode string, an object with a ``contents`` attribute, or
            ``None`` (the value a failed ``find`` lookup yields).

    Returns:
        The text of ``soup`` and all of its descendants, with ``&#39;``,
        ``&quot;`` and ``&nbsp;`` replaced by ``'``, ``"`` and a plain space.
        An empty string when ``soup`` is ``None`` or has no children.
    """
    # ``unicode`` only exists on Python 2; fall back to ``str`` so the helper
    # also works on interpreters where text is unicode by default.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # A lookup that matched nothing hands us None; treat it as "no text"
    # instead of failing on the ``contents`` attribute access below.
    if soup is None:
        return ''

    if isinstance(soup, text_type):
        # The parser leaves these entities undecoded in text nodes.
        for entity, char in (('&#39;', "'"), ('&quot;', '"'), ('&nbsp;', ' ')):
            soup = soup.replace(entity, char)
        return soup

    children = soup.contents
    if not children:
        return ''
    return ''.join(getunicode(child) for child in children)
def main(proxies=None):
    """Prompt for a movie name and print details scraped from IMDb.

    Builds an IMDb search URL from the entered name, follows the first link
    on the results page whose URL matches ``/title/tt``, parses that page and
    prints the title, rating, release date, certificate, genres, actors and
    description.

    Args:
        proxies: Optional mapping of URL scheme to proxy URL, for example
            ``{'http': 'http://user:password@host:3128'}``. It is passed to
            ``Browser.set_proxies``; when omitted, no proxy is configured
            explicitly.

    NOTE(review): a reader of this script reported that IMDb's robots.txt
    disallows ``/find``; if the browser object honours robots.txt the
    ``open`` call below may be refused -- confirm before relying on this.
    """
    # Function-local import: only this block needs quote_plus (Python 2 path).
    from urllib import quote_plus

    def text_of(node, default=''):
        # ``find`` yields None when the page lacks the element.
        return getunicode(node) if node is not None else default

    movie = raw_input('Movie Name: ')
    # Collapse whitespace runs, then percent-encode so characters such as
    # '&' or '#' in a title cannot corrupt the query string.
    movie_search = quote_plus(' '.join(movie.split()))
    url = 'http://www.imdb.com/find?q=' + movie_search + '&s=all'

    br = Browser()
    # Configure a proxy only when one is really supplied. The former
    # placeholder ('http://username:password@proxy:port') made every request
    # fail with "InvalidURL: nonnumeric port: 'port'".
    if proxies:
        br.set_proxies(proxies)
    br.open(url)
    link = br.find_link(url_regex=re.compile(r'/title/tt.*'))
    res = br.follow_link(link)
    soup = BeautifulSoup(res.read())

    movie_title = text_of(soup.find('title'))
    rating = text_of(soup.find('span', itemprop='ratingValue'))
    rating_text = rating + '/10.0' if rating else 'N/A'
    actors = [getunicode(tag) for tag in soup.findAll('a', itemprop='actors')]

    meta = soup.find('meta', {'name': 'description'})
    des = meta['content'] if meta is not None else ''

    rated = 'N/A'
    release_date = 'N/A'
    genre = []
    infobar = soup.find('div', {'class': 'infobar'})
    if infobar is not None:
        rated_tag = infobar.find('', {'title': True})
        if rated_tag is not None:
            rated = rated_tag['title']
        links = infobar.findAll('a', {'href': True})
        if links:
            # Every link but the last is treated as a genre; the last one
            # is treated as the release date.
            genre = [getunicode(tag) for tag in links[:-1]]
            release_date = getunicode(links[-1])

    print('%s %s' % (movie_title, rating_text))
    print('Release Date: %s' % release_date)
    print('Rated %s' % rated)
    print('')
    print('Genre: ' + ', '.join(genre))
    print('\nActors: ' + ', '.join(actors))
    print('\nDescription:')
    print(des)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
@meokey
Copy link

meokey commented Jan 28, 2014

It won't work, because /find is disallowed by IMDb's robots.txt.

@adrian-cid
Copy link

it works for me

@shubhamjanhere
Copy link

It's giving the following error:

`Movie Name: avengers
Traceback (most recent call last):
File "/home/shubham/PycharmProjects/TV_Adro/Excel/search.py", line 17, in <module>
br.open(url)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 254, in open
return self._mech_open(url_or_request, data, timeout=timeout)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 284, in _mech_open
response = UserAgentBase.open(self, request, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_opener.py", line 190, in open
req = meth(req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_http.py", line 253, in http_request
self.rfp.read()
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_http.py", line 174, in read
f = self._opener.open(req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 254, in open
return self._mech_open(url_or_request, data, timeout=timeout)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_mechanize.py", line 284, in _mech_open
response = UserAgentBase.open(self, request, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_opener.py", line 195, in open
response = urlopen(self, req, data)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 352, in _open
'_open', req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 340, in _call_chain
result = func(*args)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 1185, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/home/shubham/Downloads/sample/venv/local/lib/python2.7/site-packages/mechanize/_urllib2_fork.py", line 1127, in do_open
h = http_class(host_port, timeout=req.timeout)
File "/usr/lib/python2.7/httplib.py", line 751, in __init__
(self.host, self.port) = self._get_hostport(host, port)
File "/usr/lib/python2.7/httplib.py", line 792, in _get_hostport
raise InvalidURL("nonnumeric port: '%s'" % host[i+1:])
httplib.InvalidURL: nonnumeric port: 'port'

Process finished with exit code 1`

@ricardorqr
Copy link

I had the same problem as shubhamjanhere

@rpryzant
Copy link

rpryzant commented Aug 22, 2017

+1 ... same here!

@shubhamjanhere @ricardorqr this does the same thing but works for me:
https://gist.github.com/rpryzant/cb4fe2c4d676262d667a68fcbf4e4c91

@DineshReddyK
Copy link

@shubhamjanhere @ricardorqr you have to pass your own proxy address to the br.set_proxies function in place of the placeholder.

@Dob-The-Duilder
Copy link

Dob-The-Duilder commented Dec 10, 2022

Remade after IMDb changed the way their page works

https://gist.github.com/Dob-The-Duilder/98f0765ce6dd9c1e11c3a649619654ac

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment