Skip to content

Instantly share code, notes, and snippets.

View harshitcodes's full-sized avatar

Harshit Tyagi harshitcodes

View GitHub Profile
# Turn every 0 in the budget and revenue columns into NaN so that those
# entries can be handled as missing values.
cols = ['budget', 'revenue']
movies_df[cols] = movies_df[cols].replace(to_replace=0, value=np.nan)
# Remove (in place) each row that has NaN in any of the columns listed in `cols`.
movies_df.dropna(axis=0, how='any', subset=cols, inplace=True)
# Notebook cell output: (rows, columns) remaining after the cleanup.
movies_df.shape
# A budget or revenue of 0 is converted to NaN, which lets dropna() below
# discard those rows.
cols = ['budget', 'revenue']
movies_df[cols] = movies_df.loc[:, cols].replace(to_replace=0, value=np.nan)
# Drop, in place, any row where at least one of the `cols` columns is NaN.
movies_df.dropna(subset=cols, how='any', inplace=True)
# Displayed by the notebook: the frame's (rows, columns) after dropping.
movies_df.shape
# Print the shape, discard rows that are exact duplicates (the first
# occurrence of each is retained), then print the shape again so the number
# of removed rows is visible.
print(movies_df.shape)
movies_df = movies_df.drop_duplicates(subset=None, keep='first', inplace=False)
print(movies_df.shape)
# First cleaning step: remove the columns that are redundant or unnecessary.
# Each column is listed exactly once.
# NOTE(review): 'title_y' looks like a merge-suffixed column, presumably
# produced by an earlier merge that is not shown in this cell; confirm it
# exists before running, otherwise drop() raises KeyError.
del_col_list = [
    'keywords',
    'homepage',
    'status',
    'tagline',
    'original_language',
    'overview',
    'production_companies',
    'original_title',
    'title_y',
]
movies_df = movies_df.drop(columns=del_col_list)
# Notebook cell output: preview the first rows of the trimmed frame.
movies_df.head()
# Read the two TMDB 5000 CSV files (movies and credits) into their own
# DataFrames; the paths are relative to the current working directory.
movies_df = pd.read_csv(filepath_or_buffer='./tmdb-5000-movie-dataset/tmdb_5000_movies.csv')
credits_df = pd.read_csv(filepath_or_buffer='./tmdb-5000-movie-dataset/tmdb_5000_credits.csv')
# Notebook cell output: preview the first rows of the movies frame.
movies_df.head()
# Importing the required packages here
import numpy as np
import pandas as pd
import seaborn as sns
import ast, json
from datetime import datetime
import matplotlib.pyplot as plt
% matplotlib inline