Skip to content

Instantly share code, notes, and snippets.

@erogol
Last active December 22, 2015 10:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save erogol/6457197 to your computer and use it in GitHub Desktop.
Save erogol/6457197 to your computer and use it in GitHub Desktop.
Scrape IMDb to get non-duplicate movies of the given categories
import urllib2
from BeautifulSoup import BeautifulSoup
import pdb
import os
import htmlentitydefs
from BeautifulSoup import BeautifulStoneSoup
import HTMLParser
import cgi
def search_in_dict(DICT, query):
    """Check whether *query* already appears in any list in *DICT*.

    If the title is found in some genre's list, it is removed from that
    list and True is returned, so the caller skips re-adding it — this
    keeps every movie title in at most one genre list. Returns False
    when the title is not present anywhere.

    Bug fix: the original iterated ``range(len(keys) - 1)``, which
    silently skipped the last genre's list, so duplicates against that
    genre were never detected.
    """
    for key in DICT:
        if query in DICT[key]:
            # list.remove() drops the first occurrence, equivalent to
            # the original pop(index(query)).
            DICT[key].remove(query)
            return True
    return False
def HTMLEntitiesToUnicode(text):
    """Convert HTML character entities in *text* to unicode characters.

    For example '&amp;' becomes '&'. Relies on the BeautifulSoup 3 /
    Python 2 ``BeautifulStoneSoup`` entity-conversion API; the round
    trip through ``unicode()`` renders the parsed soup back to text.
    """
    text = unicode(BeautifulStoneSoup(text, convertEntities=BeautifulStoneSoup.ALL_ENTITIES))
    return text
def write_dict_to_files(DICT):
    """Write each genre's title list to ``outputs/<genre>.txt``.

    Creates the output folder if it does not already exist. One title
    per line; every title is passed through HTMLEntitiesToUnicode
    before writing.
    """
    OUTPUT_FOLDER = 'outputs'
    if not os.path.exists(OUTPUT_FOLDER):
        os.makedirs(OUTPUT_FOLDER)
    for key, titles in DICT.items():
        # 'with' guarantees the handle is closed even if a write raises;
        # the original leaked the open file on error.
        with open(OUTPUT_FOLDER + '/' + key + '.txt', 'w') as f:
            for text in titles:
                f.write(HTMLEntitiesToUnicode(text) + '\n')
# --- Top-level script: walk IMDb advanced-search pages per genre ------
# NOTE(review): indentation was lost in the paste; block structure below
# is reconstructed from the pagination/append logic — verify against a
# pristine copy of the gist.

# Number of result pages to fetch per genre (count=100 titles per page).
num_page = 15
address = ""
# Parser instance used only for unescaping entities in movie titles.
h = HTMLParser.HTMLParser()
GENRE_LIST = ['action','comedy','animation','drama','horror',]
#GENRE_LIST = ['animation', 'romance', 'war']
# Maps genre name -> list of movie titles unique across ALL genres.
GENRE_DICT = {}
for GENRE in GENRE_LIST:
    GENRE_DICT[GENRE] = [];
    # First page of the advanced title search for this genre.
    # NOTE(review): 'sproduction_status' looks like a typo for
    # 'production_status' — confirm against IMDb's URL scheme before
    # relying on the released-only filter.
    address = '/search/title?count=100&genres='+GENRE+'&sproduction_status=released&title_type=feature'
    for i in range(num_page):
        print address
        soup = BeautifulSoup(urllib2.urlopen('http://www.imdb.com'+address).read())
        # Each result row is a <td class="title">; the first <a> inside
        # it carries the movie title.
        for row in soup('td', {'class':'title'}):
            film_name = h.unescape(row('a')[0].string)
            # Keep the title only if no genre list already has it;
            # search_in_dict also removes a found title from the earlier
            # list, so each movie ends up in at most one genre.
            if not search_in_dict(GENRE_DICT,film_name):
                #print film_name
                GENRE_DICT[GENRE].append(film_name)
                #pdb.set_trace()
                #f.write(film_name+'\n')
        # Follow the last link in the pagination span ("Next") to get
        # the relative URL of the following result page.
        next_link = soup.find('span',{'class':'pagination'})
        link = next_link.findChildren('a')
        address = link[-1]['href']
    print('\n\n')
write_dict_to_files(GENRE_DICT)
print 'Finish'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment