# Harshit Tyagi (harshitcodes)

## cleaning_3.py
# Turn every 0 in the budget and revenue columns into NaN so that the
# dropna call below can remove those rows.
cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)

# Remove, in place, each row that holds NaN in any of the columns in `cols`.
movies_df.dropna(subset=cols, how='any', inplace=True)
movies_df.shape

## gist:dd9be68a9ea00f1d1b791e5bae05a48f
# Turn every 0 in the budget and revenue columns into NaN so that the
# dropna call below can remove those rows.
cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)

# Remove, in place, each row that holds NaN in any of the columns in `cols`.
movies_df.dropna(subset=cols, how='any', inplace=True)
movies_df.shape

## cleaning_2.py
# Print the shape, keep only the rows that are not repeats of an earlier
# row, then print the shape again so the two outputs can be compared.
print(movies_df.shape)
movies_df = movies_df[~movies_df.duplicated(keep='first')]
print(movies_df.shape)

## cleaning_1.py
# First cleaning step: remove the columns listed below from movies_df.
# Each label appears exactly once ('homepage' used to be listed twice).
# NOTE(review): 'title_y' looks like a merge-suffixed column, but no merge is
# visible in this file -- confirm the column exists, since drop() raises
# KeyError for a label that is missing.
del_col_list = ['keywords', 'homepage', 'status', 'tagline', 'original_language',
                'overview', 'production_companies', 'original_title', 'title_y']

movies_df = movies_df.drop(columns=del_col_list)
movies_df.head()

## load.py
# Read the two TMDB CSV files into dataframes; both paths are relative to
# the current working directory.
credits_path = './tmdb-5000-movie-dataset/tmdb_5000_credits.csv'
movies_path = './tmdb-5000-movie-dataset/tmdb_5000_movies.csv'

credits_df = pd.read_csv(credits_path)
movies_df = pd.read_csv(movies_path)
movies_df.head()

## import.py
# Importing the required packages here

import numpy as np
import pandas as pd
import seaborn as sns
import ast, json

from datetime import datetime
import matplotlib.pyplot as plt
% matplotlib inline
	# Turn every 0 in the budget and revenue columns into NaN so that the
	# dropna call below can remove those rows.
	cols = ['budget', 'revenue']
	movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)

	# Remove, in place, each row that holds NaN in any of the columns in `cols`.
	movies_df.dropna(subset=cols, how='any', inplace=True)
	movies_df.shape
	# Print the shape, keep only the rows that are not repeats of an earlier
	# row, then print the shape again so the two outputs can be compared.
	print(movies_df.shape)
	movies_df = movies_df[~movies_df.duplicated(keep='first')]
	print(movies_df.shape)
	# First cleaning step: remove the columns listed below from movies_df.
	# Each label appears exactly once ('homepage' used to be listed twice).
	# NOTE(review): 'title_y' looks like a merge-suffixed column, but no merge is
	# visible in this file -- confirm the column exists, since drop() raises
	# KeyError for a label that is missing.
	del_col_list = ['keywords', 'homepage', 'status', 'tagline', 'original_language',
		'overview', 'production_companies', 'original_title', 'title_y']

	movies_df = movies_df.drop(columns=del_col_list)
	movies_df.head()
	# Read the two TMDB CSV files into dataframes; both paths are relative to
	# the current working directory.
	credits_path = './tmdb-5000-movie-dataset/tmdb_5000_credits.csv'
	movies_path = './tmdb-5000-movie-dataset/tmdb_5000_movies.csv'

	credits_df = pd.read_csv(credits_path)
	movies_df = pd.read_csv(movies_path)
	movies_df.head()
	# Importing the required packages here

	import numpy as np
	import pandas as pd
	import seaborn as sns
	import ast, json

	from datetime import datetime
	import matplotlib.pyplot as plt
	% matplotlib inline