Skip to content

Instantly share code, notes, and snippets.

View harshitcodes's full-sized avatar

Harshit Tyagi harshitcodes

View GitHub Profile
# Importing the required packages here
import numpy as np
import pandas as pd
import seaborn as sns
import ast, json
from datetime import datetime
import matplotlib.pyplot as plt
% matplotlib inline
# Read the two TMDB CSV exports (credits and movies) into DataFrames,
# in the same order as the file paths listed below.
credits_df, movies_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './tmdb-5000-movie-dataset/tmdb_5000_credits.csv',
        './tmdb-5000-movie-dataset/tmdb_5000_movies.csv',
    )
)
# Preview the first rows of the movies table.
movies_df.head()
# Clean-up step: drop the columns that are not needed for this analysis.
# NOTE(review): 'title_y' looks like a merge-suffixed column name, yet no merge
# with credits_df happens before this point -- confirm whether one is missing.
# errors='ignore' keeps this cell from raising KeyError when a listed column
# is absent from the frame; columns that are present are still dropped.
del_col_list = ['keywords', 'homepage', 'status', 'tagline', 'original_language',
                'overview', 'production_companies', 'original_title', 'title_y']
movies_df = movies_df.drop(columns=del_col_list, errors='ignore')
movies_df.head()
# Remove fully duplicated rows, keeping the first occurrence of each,
# and print the frame's shape before and after to show how many were removed.
print(movies_df.shape)
movies_df = movies_df[~movies_df.duplicated(keep='first')]
print(movies_df.shape)
# Treat a budget or revenue of 0 as a missing value: turn the zeros into NaN
# and then drop every row that has NaN in either of the two columns.
# (One pass is enough -- afterwards neither column contains 0 or NaN.)
cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace(0, np.nan)
# Reassign rather than mutate in place so movies_df is always a fresh frame.
movies_df = movies_df.dropna(subset=cols)
movies_df.shape
# Parse the release_date column into pandas datetime values.
movies_df['release_date'] = pd.to_datetime(movies_df.release_date)
# Derive a release_year column from the parsed dates for later per-year analysis.
movies_df['release_year'] = movies_df.release_date.dt.year
movies_df.head()
# Cast the budget and revenue columns to 64-bit integers.
change_cols = ['budget', 'revenue']
# astype converts whole columns at once; the element-wise applymap it replaces
# is deprecated in recent pandas releases.
movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
# Show the resulting column dtypes to confirm the conversion.
movies_df.dtypes
# we see that there are columns which are in json format,
# let's flatten this JSON data into easily interpretable lists
def parse_col_json(column, key):
"""
Args:
column: string
name of the column to be processed.
key: string
name of the dictionary key which needs to be extracted
# Question #1: which movies were the most expensive to make?
# Budget is used as the measure of expense: rank the movies by budget,
# highest first, and keep the top five rows.
expensive_movies_df = (
    movies_df
    .sort_values('budget', ascending=False)
    .head(n=5)
)
expensive_movies_df