Skip to content

Instantly share code, notes, and snippets.

View harshitcodes's full-sized avatar

Harshit Tyagi harshitcodes

View GitHub Profile
# Sum the 'profit' column over all movies sharing a release year.
profits_year = (
    movies_df
    .groupby('release_year')['profit']
    .sum()
)

# Create the figure; figsize is given as (width, height).
plt.figure(figsize=(12, 6), dpi=130)

# Label the x-axis and the y-axis.
plt.xlabel('Release Year of Movies in the data set', fontsize=12)
plt.ylabel('Profits earned by Movies', fontsize=12)
#title of the line plot
# Movies whose average vote is 7.0 or higher (the cut-off itself is included).
movies_df.loc[movies_df['vote_average'] >= 7.0]
# Runtime analysis.
# Mean of the 'runtime' column over every row of movies_df.
movies_df['runtime'].mean()
# For comparison, inspect the extremes of the same column with the
# find_min_max_in helper (per its docstring: top 5 and bottom 5 movies).
find_min_max_in('runtime')
# To find the most talked-about movies, sort the dataframe on the
# 'popularity' column (highest first) and keep the top five rows.
popular_movies_df = movies_df.sort_values(by='popularity', ascending=False).head()
# Display the result; it already holds only the top five rows.
popular_movies_df
# To find the most profitable movies, compute how much each movie made
# after deducting its budget from the revenue it generated.
# The result is stored in a new 'profit' column (revenue minus budget, row by row).
movies_df['profit'] = movies_df['revenue'] - movies_df['budget']
cols = ['budget',
'profit',
'revenue',
'genres',
'id',
'popularity',
def find_min_max_in(col):
"""
The function takes in a column and returns the top 5
and bottom 5 movies dataframe in that column.
args:
col: string - column name
return:
info_df: dataframe - final 5 movies dataframe
"""
# Question #1: which movies were the most expensive?
# The budget column is used as the indicator of expense, so order the rows by
# budget from highest to lowest and keep the first rows returned by head().
expensive_movies_df = (
    movies_df
    .sort_values(by='budget', ascending=False)
    .head()
)
expensive_movies_df
# We see that there are columns which are in JSON format;
# let's flatten this JSON data into easily interpretable lists.
def parse_col_json(column, key):
"""
Args:
column: string
name of the column to be processed.
key: string
name of the dictionary key which needs to be extracted
# Change the data type of the columns listed below to 64-bit integers.
change_cols = ['budget', 'revenue']
# astype converts both columns in one vectorized call
# (the element-wise DataFrame.applymap is deprecated since pandas 2.1).
movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
# Show the column dtypes to confirm the conversion.
movies_df.dtypes
# Convert the release_date column to pandas datetime values.
# Bracket assignment is used instead of attribute-style assignment so the
# column is accessed the same way as every other column in this analysis.
movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
# To answer the last question, extract the release year from every release date.
movies_df['release_year'] = movies_df['release_date'].dt.year
# Preview the first rows, now including the new release_year column.
movies_df.head()