Skip to content

Instantly share code, notes, and snippets.

@AlexEne
Last active August 29, 2015 14:03
Show Gist options
  • Save AlexEne/67f50c133d58738d106e to your computer and use it in GitHub Desktop.
Save AlexEne/67f50c133d58738d106e to your computer and use it in GitHub Desktop.
import requests
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import bs4
from lxml import etree
import codecs
import time
import pandas as pd
def _fetch_results_table(payload):
    """Fetch the IMDb title-search page and return its ``table.results`` tag.

    The request is repeated, one second apart and without an upper bound,
    until the server answers with HTTP 200 and the page contains the table.
    """
    while True:
        req = requests.get('http://www.imdb.com/search/title', params=payload)
        if req.status_code == 200:
            # Always parse the raw bytes (never req.text) so that every
            # attempt lets Beautiful Soup detect the page encoding the same
            # way.  The parser is named explicitly; lxml is already a
            # dependency of this file.
            soup = bs4.BeautifulSoup(req.content, 'lxml')
            results = soup.findChild('table', {'class': 'results'})
            if results is not None:
                return results
        time.sleep(1)  # try again


def _parse_title_cell(title):
    """Extract one movie's fields from a ``td.title`` cell.

    Returns:
        The list ``[name, year, rating, duration, num_votes, genre]`` (all
        strings), or ``None`` when the cell has no genre span.
    """
    name = title.findAll('a', href=True)[0].text
    year = title.find('span', 'year_type').text
    rating = title.select('.value')[0].text

    runtime = title.findChild('span', {'class': 'runtime'})
    duration = '0' if runtime is None else runtime.text  # '0' = no runtime

    genre_span = title.findChild('span', {'class': 'genre'})
    if genre_span is None:
        return None  # no genre for this movie, the caller skips it
    genre = '|'.join(link.text for link in genre_span.findChildren('a'))

    # The vote count lives in a sibling cell of the same table row.
    votes_cell = title.parent.findChild('td', {'class': 'sort_col'})
    num_votes = votes_cell.text.replace(',', '')
    return [name, year, rating, duration, num_votes, genre]


def scrap_page(start):
    """Scrape one IMDb search-result page and append its movies to movies.csv.

    Feature films are requested sorted by descending vote count, beginning at
    result number ``start``.  Each movie becomes one tab-separated line
    ``name, year, rating, duration, num_votes, genre`` (genres joined with
    ``|``).  Movies without a genre are not written.  The page's rows are
    written in a single append once the whole page has been parsed.

    Args:
        start: 1-based rank of the first search result to request.

    Returns:
        The number of title cells processed (written or skipped).  This is 0
        when the page holds no usable cells, so a caller that accumulates the
        return value should stop in that case instead of looping forever.
    """
    payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}
    results = _fetch_results_table(payload)

    # The last td.title cell is deliberately left out, exactly as before.
    cells = results.findChildren('td', 'title')[:-1]

    lines = []
    for cell in cells:
        fields = _parse_title_cell(cell)
        if fields is None:
            continue
        lines.append('\t'.join(fields) + '\n')

    if lines:
        # One open/append per page instead of one per movie.
        with codecs.open('movies.csv', 'a', 'utf-8') as f:
            f.write(''.join(lines))

    # len() instead of the loop index: well-defined even for an empty page.
    return len(cells)


# NOTE: The string literal below is a disabled, unfinished lxml-based draft of
# scrap_page.  As a bare string expression it is never executed.  Its last
# line is a CSS selector rather than Python, so it cannot be re-enabled as-is.
"""def scrap_page(start):
payload = {'sort': 'num_votes,desc', 'start': start, 'title_type': 'feature'}
req = requests.get('http://www.imdb.com/search/title', params=payload)
while req.status_code != 200:
time.sleep(1)
req = requests.get('http://www.imdb.com/search/title', params=payload)
parser = etree.Htmlparser()
tree = etree.parse(req.text, parser)
tree.#main > table > tbody > tr:nth-child(2) > td.title > a"""
def process_data(csv_file):
    """Load the scraped movie file and turn it into an analysis-ready frame.

    The file is read as header-less, tab-separated UTF-8 text with the
    columns ``name, year, score, duration, votes, genre``.  Rows with any
    missing field are dropped.

    Args:
        csv_file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A ``pandas.DataFrame`` in which

        * ``duration`` is the leading number of the runtime text as a float
          (``'142 mins.'`` -> ``142.0``),
        * ``year`` is the first four-digit number of the year text as a float
          (``'(1994)'`` and ``'(2005/I)'`` both parse); rows without one are
          dropped,
        * one extra boolean column exists per genre found in the data, in
          alphabetical order, telling whether the movie has that genre.

    Raises:
        ValueError: If a runtime cell does not start with a number.
    """
    columns = ['name', 'year', 'score', 'duration', 'votes', 'genre']
    data = pd.read_csv(csv_file, names=columns, delimiter='\t',
                       encoding='utf-8', engine='python').dropna()

    # astype(str) keeps the string handling valid even when pandas inferred a
    # numeric dtype for the column (e.g. every runtime is the placeholder 0).
    minutes = data['duration'].astype(str).str.split(' ').str[0]
    data['duration'] = pd.to_numeric(minutes).astype(float)

    # Take the first 4-digit run instead of slicing off the parentheses, so
    # disambiguated years such as '(2005/I)' no longer raise.
    year_digits = data['year'].astype(str).str.extract(r'(\d{4})', expand=False)
    data['year'] = pd.to_numeric(year_digits).astype(float)
    data = data.dropna(subset=['year'])

    # Split every genre string once and reuse the sets for all genre columns.
    genre_sets = [set(tags.split('|')) for tags in data['genre']]
    for genre in sorted(set().union(*genre_sets)):
        data[genre] = [genre in tags for tags in genre_sets]
    return data
def plot_axis(ax, data_x, data_y, title='', show_x=False, show_y=False):
    """Draw a translucent scatter plot on ``ax`` with a pared-down frame.

    Args:
        ax: Axes object to draw on.
        data_x: Values for the horizontal axis.
        data_y: Values for the vertical axis.
        title: Title to set on the axes; an empty string sets none.
        show_x: Whether the x axis is visible.
        show_y: Whether the y axis is visible.
    """
    point_colour = "#3F5D7D"
    ax.scatter(data_x, data_y, alpha=0.3, color=point_colour, edgecolor=point_colour)

    x_axis = ax.axes.get_xaxis()
    y_axis = ax.axes.get_yaxis()
    x_axis.set_visible(show_x)
    y_axis.set_visible(show_y)

    # Hide every frame line except the bottom one.
    for side in ('right', 'top', 'left'):
        ax.spines[side].set_visible(False)

    if title:
        ax.set_title(title)

    # Ticks only on the bottom and left edges.
    x_axis.tick_bottom()
    y_axis.tick_left()
def main():
    """Plot score against release year, overall and once per genre.

    Reads ``movies.csv`` through ``process_data`` and shows a 5x5 grid of
    scatter plots: the first panel holds every movie, the following panels
    hold one genre each and share both axes with the first panel.
    """
    # To (re)build movies.csv, scrape the 10000 most-voted movies first:
    #     total_scraped = 0
    #     while total_scraped < 10000:
    #         total_scraped += scrap_page(total_scraped + 1)
    data = process_data('movies.csv')

    fig = plt.figure(facecolor='white', figsize=(9, 9))
    genres = ['Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy', 'Crime', 'Drama',
              'Family', 'Fantasy', 'Film-Noir', 'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance',
              'Sci-Fi', 'Sport', 'Thriller', 'War', 'Western']

    ax1 = fig.add_subplot(5, 5, 1)
    plot_axis(ax1, data.year, data.score, title='Overall score/year distribution', show_y=True)

    for i, genre in enumerate(genres):
        if genre in data.columns:
            subset = data[data[genre] == True]
        else:
            # Genre absent from the scraped data: keep its panel, but empty,
            # so the grid layout does not depend on the contents of the CSV.
            subset = data.iloc[0:0]

        ax = fig.add_subplot(5, 5, i + 2, sharex=ax1, sharey=ax1, title=genre)
        plot_axis(ax, subset.year, subset.score)

        if i > 16:
            # Panels 19-23 have no panel below them: show their x axis.
            ax.axes.get_xaxis().set_visible(True)
            for label in ax.xaxis.get_ticklabels():
                label.set_rotation(45)
        if (i + 1) % 5 == 0:
            # Panels 6, 11, 16 and 21 form the left column: show their y axis.
            ax.axes.get_yaxis().set_visible(True)

    plt.xlim(1908, 2015)
    plt.show()
# Run the plotting entry point only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment