Skip to content

Instantly share code, notes, and snippets.

@adilkhash
Created October 7, 2017 13:49
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save adilkhash/2689a5ebdc76b9a71a307c0076d1a267 to your computer and use it in GitHub Desktop.
Save adilkhash/2689a5ebdc76b9a71a307c0076d1a267 to your computer and use it in GitHub Desktop.
import csv
import luigi
from luigi.format import UTF8
import requests
import pandas as pd
from bs4 import BeautifulSoup
class AggregateMovieRatingTask(luigi.Task):
    """Merge the per-year raw movie CSVs into one ranking file.

    Requires one ``GetMovieMetaDataTask`` per entry of ``years``, concatenates
    their CSV outputs, sorts the rows by rating and then votes (both
    descending) and writes the ``title``, ``rating`` and ``votes`` columns to
    ``results.csv``.
    """

    # List of release years to aggregate; one upstream task is created per item.
    years = luigi.ListParameter()

    def requires(self):
        """Return one ``GetMovieMetaDataTask`` for every requested year."""
        return [GetMovieMetaDataTask(year) for year in self.years]

    def output(self):
        """Return the local UTF-8 target ``results.csv``."""
        # NOTE(review): the file name does not depend on ``years``, so an
        # existing results.csv presumably marks this task complete for any
        # year list -- confirm that this is intended.
        return luigi.LocalTarget('results.csv', format=UTF8)

    def run(self):
        """Read every upstream CSV, rank the movies and write the result.

        Raises:
            ValueError: if there is no upstream output to aggregate (for
                example when ``years`` is empty).
        """
        data_frames = []
        for _input in self.input():
            with _input.open('r') as raw_file:
                data_frames.append(pd.read_csv(raw_file))

        # pd.concat() rejects an empty list with an opaque message; fail with
        # an explicit one before the output target is opened.
        if not data_frames:
            raise ValueError(
                'AggregateMovieRatingTask needs at least one year to aggregate'
            )

        df = pd.concat(data_frames)
        df = df.sort_values(['rating', 'votes'], ascending=[False, False])

        with self.output().open('w') as f:
            # to_csv() keeps its default of also writing the frame index as
            # the first, unnamed column.
            df[['title', 'rating', 'votes']].to_csv(f)
class GetMovieMetaDataTask(luigi.Task):
    """Scrape the IMDb title-search page for one release year into a CSV.

    The result is written to ``raw-<year>.csv`` with the columns ``title``,
    ``rating`` and ``votes``.
    """

    # Release year (or release-date value) inserted into the search URL and
    # into the output file name.
    year = luigi.Parameter()

    def get_movie_meta_data(self, film_div):
        """Extract title, rating and vote count from one search-result entry.

        Args:
            film_div: parsed HTML element of a single film, as returned by
                ``BeautifulSoup.find_all`` in ``run``.

        Returns:
            dict with the keys ``title``, ``rating`` and ``votes``. ``rating``
            and ``votes`` hold the element's ``data-value`` attribute, or ``0``
            when the corresponding element is not present.
        """
        title = film_div.h3.a.text

        rating = film_div.find('div', class_='ratings-imdb-rating')
        rating = rating.attrs['data-value'] if rating else 0

        votes = film_div.find('span', attrs={'name': 'nv'})
        votes = votes.attrs['data-value'] if votes else 0

        return {'title': title, 'rating': rating, 'votes': votes}

    def output(self):
        """Return the local UTF-8 target ``raw-<year>.csv``."""
        return luigi.LocalTarget('raw-{}.csv'.format(self.year), format=UTF8)

    def run(self):
        """Download the search page, parse every film entry and write the CSV.

        Raises:
            requests.HTTPError: if the server answers with an error status.
            requests.Timeout: if the server does not respond in time.
        """
        url = 'http://www.imdb.com/search/title?release_date={}'.format(self.year)
        response = requests.get(
            url,
            headers={
                'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4'
            },
            # Without a timeout requests waits indefinitely on a stalled
            # connection, which would hang the whole pipeline.
            timeout=30,
        )
        response.raise_for_status()

        html = BeautifulSoup(response.text, 'html.parser')
        film_container = html.find_all('div', class_='lister-item mode-advanced')
        payload = [self.get_movie_meta_data(film) for film in film_container]

        with self.output().open('w') as csv_file:
            # Pin the columns so the header row is written even when no film
            # entry was found; otherwise the raw file lacks the columns that
            # downstream code sorts and selects by.
            df = pd.DataFrame(payload, columns=['title', 'rating', 'votes'])
            df.to_csv(csv_file)
if __name__ == '__main__':
    # Executed as a script: hand control to luigi's command-line runner.
    luigi.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment