Harshit Tyagi harshitcodes

## import.py
# Importing the required packages here

import numpy as np
import pandas as pd
import seaborn as sns
import ast, json

from datetime import datetime
import matplotlib.pyplot as plt
% matplotlib inline

## load.py
# Read the tmdb_5000 movies and credits CSV files into separate dataframes,
# then preview the first rows of the movies table.

movies_df = pd.read_csv('./tmdb-5000-movie-dataset/tmdb_5000_movies.csv')
credits_df = pd.read_csv('./tmdb-5000-movie-dataset/tmdb_5000_credits.csv')
movies_df.head()

## cleaning_1.py
# Drop the columns that are not needed for the analysis.
# Each column name is listed exactly once.
# NOTE(review): 'title_y' looks like a merge-suffixed column name; no merge is
# visible before this point, so it may be absent from movies_df - confirm.
# errors='ignore' lets the drop succeed whether or not every listed column
# exists, instead of raising KeyError on the first missing label.

del_col_list = ['keywords', 'homepage', 'status', 'tagline', 'original_language',
                'overview', 'production_companies', 'original_title', 'title_y']

movies_df = movies_df.drop(columns=del_col_list, errors='ignore')
movies_df.head()

## cleaning_2.py
# Remove fully duplicated rows (the first occurrence of each is kept, which is
# the drop_duplicates default) and print the shape before and after.
print(movies_df.shape)
movies_df = movies_df.drop_duplicates()
print(movies_df.shape)

## gist:dd9be68a9ea00f1d1b791e5bae05a48f
# Treat a budget or revenue of 0 as a missing value.

cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)

# Discard every row in which either of those columns is missing.
movies_df.dropna(subset=cols, how='any', inplace=True)
movies_df.shape

## cleaning_3.py
# Treat a budget or revenue of 0 as a missing value.

cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)

# Discard every row in which either of those columns is missing.
movies_df.dropna(subset=cols, how='any', inplace=True)
movies_df.shape

## cleaning_4.py
# Parse the release_date column into datetime values so that date components
# can be extracted from it.

movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
# Store the release year in its own column; it is needed to answer the last question.
movies_df['release_year'] = movies_df['release_date'].dt.year
movies_df.head()

## cleaning_5.py
# Cast the budget and revenue columns to 64-bit integers.
change_cols = ['budget', 'revenue']
# astype converts each column in a single vectorised step; the element-wise
# DataFrame.applymap is deprecated in recent pandas releases.
movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
movies_df.dtypes

## cleaning_6.py
# We see that there are columns which are in JSON format;
# let's flatten this JSON data into easily interpretable lists.

def parse_col_json(column, key):
    """
    Args:
        column: string
            name of the column to be processed.
        key: string
            name of the dictionary key which needs to be extracted

## answer_1.py
# Answer to question #1.
# Budget is used as the indicator of how expensive a movie was, so rank the
# movies by budget from highest to lowest and keep the top five rows.

expensive_movies_df = (
    movies_df
    .sort_values('budget', ascending=False)
    .head(5)
)
expensive_movies_df
	# Importing the required packages here

	import numpy as np
	import pandas as pd
	import seaborn as sns
	import ast, json

	from datetime import datetime
	import matplotlib.pyplot as plt
	% matplotlib inline
	# Read the tmdb_5000 movies and credits CSV files into separate dataframes,
	# then preview the first rows of the movies table.

	movies_df = pd.read_csv('./tmdb-5000-movie-dataset/tmdb_5000_movies.csv')
	credits_df = pd.read_csv('./tmdb-5000-movie-dataset/tmdb_5000_credits.csv')
	movies_df.head()
	# Drop the columns that are not needed for the analysis.
	# Each column name is listed exactly once.
	# NOTE(review): 'title_y' looks like a merge-suffixed column name; no merge is
	# visible before this point, so it may be absent from movies_df - confirm.
	# errors='ignore' lets the drop succeed whether or not every listed column
	# exists, instead of raising KeyError on the first missing label.

	del_col_list = ['keywords', 'homepage', 'status', 'tagline', 'original_language',
	                'overview', 'production_companies', 'original_title', 'title_y']

	movies_df = movies_df.drop(columns=del_col_list, errors='ignore')
	movies_df.head()
	# Remove fully duplicated rows (the first occurrence of each is kept, which is
	# the drop_duplicates default) and print the shape before and after.
	print(movies_df.shape)
	movies_df = movies_df.drop_duplicates()
	print(movies_df.shape)
	# Treat a budget or revenue of 0 as a missing value.

	cols = ['budget', 'revenue']
	movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)

	# Discard every row in which either of those columns is missing.
	movies_df.dropna(subset=cols, how='any', inplace=True)
	movies_df.shape
	# Parse the release_date column into datetime values so that date components
	# can be extracted from it.

	movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
	# Store the release year in its own column; it is needed to answer the last question.
	movies_df['release_year'] = movies_df['release_date'].dt.year
	movies_df.head()
	# Cast the budget and revenue columns to 64-bit integers.
	change_cols = ['budget', 'revenue']
	# astype converts each column in a single vectorised step; the element-wise
	# DataFrame.applymap is deprecated in recent pandas releases.
	movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
	movies_df.dtypes
	# We see that there are columns which are in JSON format;
	# let's flatten this JSON data into easily interpretable lists.

	def parse_col_json(column, key):
	"""
	Args:
	column: string
	name of the column to be processed.
	key: string
	name of the dictionary key which needs to be extracted
	# Answer to question #1.
	# Budget is used as the indicator of how expensive a movie was, so rank the
	# movies by budget from highest to lowest and keep the top five rows.

	expensive_movies_df = (
	    movies_df
	    .sort_values('budget', ascending=False)
	    .head(5)
	)
	expensive_movies_df