Skip to content

Instantly share code, notes, and snippets.

@dataewan dataewan/
Created Dec 30, 2017

What would you like to do?
flatten tmdb data
import pandas as pd
import json
import itertools
# Load the credits and movies CSV files; both paths are relative to the
# current working directory. Credits is read first, then movies.
credits, movies = (
    pd.read_csv(csv_path)
    for csv_path in ('tmdb_5000_credits.csv', 'tmdb_5000_movies.csv')
)
def extract_people(row, fieldname):
    """Flatten the JSON-encoded list stored in one field of a row.

    Args:
        row: Mapping-like record (for example a DataFrame row) that has a
            ``movie_id`` entry and a JSON string under ``fieldname``.
        fieldname: Name of the field whose value is a JSON array of objects.

    Returns:
        A list with one dict per decoded array element. Each dict holds the
        element's own keys plus ``movie_id`` taken from the row; if an
        element already carries a ``movie_id`` key, the element's value wins.

    Raises:
        KeyError: If ``movie_id`` or ``fieldname`` is missing from ``row``.
        json.JSONDecodeError: If the field does not contain valid JSON.
    """
    movie_id = {"movie_id": row['movie_id']}
    people = json.loads(row[fieldname])
    # The row's movie_id is spread first so every output record is tagged
    # with the movie it came from.
    return [{**movie_id, **person} for person in people]
# Cast: run extract_people over every credits row, flatten the per-movie
# lists into a single table, and write it to cast.csv.
cast = credits.apply(extract_people, axis=1, fieldname='cast')
cast_df = pd.DataFrame([member for members in cast for member in members])
cast_df.to_csv('cast.csv', encoding='utf-8', index=False)
# Crew: identical processing applied to the 'crew' field, written to crew.csv.
crew = credits.apply(extract_people, axis=1, fieldname='crew')
crew_df = pd.DataFrame([member for members in crew for member in members])
crew_df.to_csv('crew.csv', encoding='utf-8', index=False)
# extract only the interesting fields from the movies dataset
# (the nested columns are left out) and write them to movies.csv
movies[
    [
        'budget',
        'id',
        'original_title',
        'overview',
        'popularity',
        'release_date',
        'runtime',
        'status',
        'tagline',
        'title',
        'vote_average',
        'vote_count',
    ]
].to_csv('movies.csv', index=False, encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.