Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import csv
import luigi
from luigi.format import UTF8
import requests
import pandas as pd
from bs4 import BeautifulSoup
class AggregateMovieRatingTask(luigi.Task):
    """Merge the per-year raw CSV files into a single ranked ``results.csv``.

    One ``GetMovieMetaDataTask`` is required for every entry in ``years``.
    Their outputs are concatenated, sorted by rating and then by votes (both
    descending), and the ``title``/``rating``/``votes`` columns are written
    out.
    """

    # Years to aggregate; each one becomes a GetMovieMetaDataTask dependency.
    years = luigi.ListParameter()

    # Columns carried over from the raw per-year files into the result.
    _COLUMNS = ['title', 'rating', 'votes']

    def requires(self):
        """Return one ``GetMovieMetaDataTask`` per requested year."""
        return [GetMovieMetaDataTask(year) for year in self.years]

    def output(self):
        """Return the local target for the aggregated CSV."""
        # NOTE(review): the path does not vary with ``years``. If luigi's
        # default "output exists" completeness check applies, a later run
        # with different years would be treated as already done -- confirm
        # that this is intended.
        return luigi.LocalTarget('results.csv', format=UTF8)

    def run(self):
        """Read every input CSV, rank the rows and write ``results.csv``."""
        data_frames = []
        for _input in self.input():
            with _input.open('r') as raw_file:
                data_frames.append(pd.read_csv(raw_file))

        if data_frames:
            df = pd.concat(data_frames)
        else:
            # pd.concat raises ValueError on an empty list; with no inputs
            # fall back to an empty frame so a header-only file is written.
            df = pd.DataFrame(columns=self._COLUMNS)

        df = df.sort_values(['rating', 'votes'], ascending=[False, False])
        with self.output().open('w') as f:
            # The frame index is written as the first column (to_csv default),
            # matching the layout this task has always produced.
            df[self._COLUMNS].to_csv(f)
class GetMovieMetaDataTask(luigi.Task):
    """Scrape the IMDb title-search page for one year into ``raw-<year>.csv``.

    The page for ``year`` is downloaded, every
    ``div.lister-item.mode-advanced`` element is reduced to a
    title/rating/votes record, and the records are stored as CSV.
    """

    # Value substituted into the ``release_date`` query parameter.
    year = luigi.Parameter()

    # Fixed column order of the raw CSV. Passing it explicitly keeps the
    # header present even when the page yields no listings.
    _COLUMNS = ['title', 'rating', 'votes']

    # Seconds to wait on the HTTP request. requests applies no timeout unless
    # one is given, so without this a stalled connection would block forever.
    _REQUEST_TIMEOUT = 30

    def get_movie_meta_data(self, film_div):
        """Extract title, rating and vote count from one listing element.

        :param film_div: one element returned by ``find_all`` in ``run``.
        :returns: dict with the keys ``title``, ``rating`` and ``votes``.
            ``rating`` and ``votes`` are the elements' ``data-value``
            attribute, or ``0`` when the corresponding element is absent.
        :raises AttributeError: if the listing has no ``<h3>``/``<a>`` title.
        :raises KeyError: if a found element lacks ``data-value``.
        """
        title = film_div.h3.a.text

        rating_tag = film_div.find('div', class_='ratings-imdb-rating')
        rating = rating_tag.attrs['data-value'] if rating_tag is not None else 0

        votes_tag = film_div.find('span', attrs={'name': 'nv'})
        votes = votes_tag.attrs['data-value'] if votes_tag is not None else 0

        return {'title': title, 'rating': rating, 'votes': votes}

    def output(self):
        """Return the local target ``raw-<year>.csv`` for this task's year."""
        return luigi.LocalTarget('raw-{}.csv'.format(self.year), format=UTF8)

    def run(self):
        """Download the search page, parse the listings and write the CSV.

        :raises requests.HTTPError: on a non-success HTTP status.
        :raises requests.Timeout: if the server does not answer in time.
        """
        url = 'http://www.imdb.com/search/title?release_date={}'.format(self.year)
        response = requests.get(
            url,
            headers={
                'Accept-Language': 'ru-RU,ru;q=0.8,en-US;q=0.6,en;q=0.4'
            },
            timeout=self._REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        html = BeautifulSoup(response.text, 'html.parser')
        film_container = html.find_all('div', class_='lister-item mode-advanced')
        payload = [self.get_movie_meta_data(film) for film in film_container]

        # Build the frame before opening the target so a failure here does
        # not leave a started output behind. The explicit columns guarantee
        # the title/rating/votes header even for an empty payload.
        df = pd.DataFrame(payload, columns=self._COLUMNS)
        with self.output().open('w') as csv_file:
            df.to_csv(csv_file)
# Script entry point: only start luigi when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    luigi.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.