Skip to content

Instantly share code, notes, and snippets.

@samgrover
Created December 29, 2022 17:40
Show Gist options
  • Save samgrover/1e7841ac4041bdf921a6da8b9e5b67fa to your computer and use it in GitHub Desktop.
Save samgrover/1e7841ac4041bdf921a6da8b9e5b67fa to your computer and use it in GitHub Desktop.
Playing with the IMDb dataset to find top movies that have multiple directors
#!/usr/bin/env python
# Script to go along with the blog post at: https://samgrover.com/2022/12/29/playing-with-the.html
import csv

import pandas as pd
# Directory holding the dataset files (placeholder: edit before running).
FOLDER = '<folder where the dataset lives>'

# Paths of the individual TSV files, all located directly under FOLDER.
name_basics_file = FOLDER + '/name.basics.tsv'
title_ratings_file = FOLDER + '/title.ratings.tsv'
title_crew_file = FOLDER + '/title.crew.tsv'
title_basics_file = FOLDER + '/title.basics.tsv'


def _split_list_field(raw):
    r"""Split a comma-separated field into a list of strings.

    The literal marker ``\N`` (the same marker listed in ``na_values``)
    means "no value" and yields an empty list instead of ``['\\N']``.
    """
    return raw.split(',') if raw != '\\N' else []


# Column name -> converter handed to ``pd.read_csv``. Every multi-valued
# column goes through the same helper so that a missing value is always
# represented by ``[]``; previously 'primaryProfession' alone skipped the
# missing-value check and produced ``['\\N']``.
converters = {
    'knownForTitles': _split_list_field,
    'primaryProfession': _split_list_field,
    'directors': _split_list_field,
    'writers': _split_list_field,
    'genres': _split_list_field,
}

# Marker treated as a missing value in columns that have no converter.
na_values = ['\\N']

# The files are tab-separated.
sep = '\t'
# Options shared by every dataset read.
# QUOTE_NONE makes the parser treat double quotes as ordinary characters.
# The files are plain tab-separated text, and a stray '"' inside a title or
# name would otherwise start a quoted field and could fuse rows together.
_read_options = {
    'sep': sep,
    'na_values': na_values,
    'low_memory': False,
    'quoting': csv.QUOTE_NONE,
}

# People table (nconst -> primaryName, plus list-valued columns).
# NOTE(review): parse_dates=[3] parses the fourth column as a date;
# presumably that is a year column -- confirm this is intended.
nb = pd.read_csv(
    name_basics_file,
    parse_dates=[3],
    dtype={'nconst': 'str', 'primaryName': 'str'},
    converters=converters,
    **_read_options
)

# Ratings table, keyed by tconst.
tr = pd.read_csv(
    title_ratings_file,
    dtype={'tconst': 'str'},
    **_read_options
)

# Crew table, keyed by tconst; 'directors' is turned into a list by the
# converters.
tc = pd.read_csv(
    title_crew_file,
    dtype={'tconst': 'str'},
    converters=converters,
    **_read_options
)

# Keep only the crew rows that list more than one director.
multiple_directors = tc.loc[tc['directors'].map(len) > 1]

# Title metadata, keyed by tconst.
tb = pd.read_csv(
    title_basics_file,
    dtype={'tconst': 'str', 'primaryTitle': 'str', 'originalTitle': 'str'},
    converters=converters,
    **_read_options
)

# Inner joins on the title id. The key is spelled out rather than left to
# pandas' "all common columns" default, so an unexpected shared column can
# never silently change the join.
titles = pd.merge(tb, tr, on='tconst')
e = titles.merge(multiple_directors, on='tconst')
def isNotAnimation(genres):
    """Return True when 'Animation' is not one of *genres*.

    genres: a container of genre names (a list, as produced by the
        'genres' converter).
    Returns: bool -- False if 'Animation' is present, True otherwise
        (including for an empty container).
    """
    return 'Animation' not in genres
# Row-wise conditions a title must satisfy to make the shortlist.
is_movie = e['titleType'] == 'movie'
not_animated = e['genres'].map(isNotAnimation)
rating = e['averageRating']
highly_rated = (rating > 7.5) & (rating <= 10)
widely_voted = e['numVotes'] > 250000

# Non-animated movies rated in (7.5, 10] with more than 250000 votes.
top = e.loc[is_movie & not_animated & highly_rated & widely_voted]

# Best rated first; ties are ordered by most recent start year.
final = top.sort_values(
    by=['averageRating', 'startYear'],
    ascending=[False, False],
)
def get_names(nconsts):
    """Translate person ids into primary names using the ``nb`` table.

    nconsts: iterable of person ids, matched against ``nb['nconst']``.
    Returns: a list aligned with *nconsts* holding the ``primaryName`` of
        the first matching row for each id, or ``None`` for an id that has
        no row in ``nb`` (previously such an id raised IndexError).
    """
    wanted = list(nconsts)
    if not wanted:
        return []

    # One pass over the name table for the whole batch instead of one full
    # scan per id.
    matches = nb.loc[nb['nconst'].isin(wanted), ['nconst', 'primaryName']]

    # setdefault keeps the first occurrence of an id, matching the earlier
    # "take row 0 of the matches" behaviour.
    name_by_id = {}
    for person_id, name in zip(matches['nconst'], matches['primaryName']):
        name_by_id.setdefault(person_id, name)

    return [name_by_id.get(person_id) for person_id in wanted]
# Attach a readable list of director names next to the list of director ids.
director_ids = final['directors']
final['director_names'] = director_ids.map(get_names)

# Show the first 25 rows in full, without pandas truncating the output.
preview = final.head(25)
print(preview.to_string())

# Persist the complete result to the current working directory.
final.to_pickle("final.pkl")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment