Skip to content

Instantly share code, notes, and snippets.

View harshitcodes's full-sized avatar

Harshit Tyagi harshitcodes

View GitHub Profile
def find_min_max_in(col):
"""
The function takes in a column and returns the top 5
and bottom 5 movies dataframe in that column.
args:
col: string - column name
return:
info_df: dataframe - final 5 movies dataframe
"""
# Profit is what a movie earned once its budget is deducted from the
# revenue it generated; store it as a new 'profit' column so the most
# profitable movies can be ranked later.
movies_df['profit'] = movies_df['revenue'].sub(movies_df['budget'])
cols = ['budget',
'profit',
'revenue',
'genres',
'id',
'popularity',
# To find the most talked-about movies, rank the dataframe on the
# 'popularity' column, highest first, and keep the top five rows.
# Sorting on 'popularity' (rather than 'budget') is what makes the result
# match its name: a budget sort would list the most expensive movies instead.
popular_movies_df = movies_df.sort_values(by='popularity', ascending=False).head()
# head() above already limited the frame to five rows, so show it as is.
popular_movies_df
# Runtime analysis.
# Average of the 'runtime' column over every row of movies_df.
movies_df['runtime'].mean()
# For comparison, look at the extremes of the same column: per its docstring,
# find_min_max_in returns the top 5 and bottom 5 movies for the given column.
# NOTE(review): the helper's body is not visible here — confirm what it returns.
find_min_max_in('runtime')
# Movies whose average vote is 7.0 or higher (the comparison is inclusive,
# so a movie rated exactly 7.0 is kept).
movies_df.loc[movies_df['vote_average'].ge(7.0)]
# Total profit per release year: one summed value for each distinct year.
profits_year = movies_df.groupby(by='release_year')['profit'].sum()

# Canvas for the yearly-profit chart: 12 x 6 inches at 130 dpi.
plt.figure(dpi=130, figsize=(12, 6))
# Axis captions (y-axis first, then x-axis; the two calls are independent).
plt.ylabel('Profits earned by Movies', fontsize=12)
plt.xlabel('Release Year of Movies in the data set', fontsize=12)
# title of the line plot
# NOTE(review): the title and plotting calls are not part of this excerpt.
# Most profitable year in the dataset: idxmax() returns the index label at
# which profits_year is largest. profits_year is built by grouping on
# 'release_year', so that label is a release year.
profits_year.idxmax()
# Keep only the movies that made a profit of $50M or more.
profit_data = movies_df.loc[movies_df['profit'].ge(50_000_000)]
# Renumber the surviving rows 1..N in a single step, so the labels start
# from 1 instead of 0 and no longer carry the gaps left by the filter.
profit_data.index = range(1, len(profit_data) + 1)
# Normalise the 'genres' column in one pass: drop the surrounding square
# brackets, remove every space and single quote, then split the remaining
# comma-separated text into a list of genre names per movie.
# NOTE(review): removing *every* space also removes spaces inside a genre
# name (a two-word genre would be joined into one word) — confirm intended.
movies_df['genres'] = (
    movies_df['genres']
    .str.strip('[]')
    .str.replace(' ', '')
    .str.replace("'", '')
    .str.split(',')
)
movies_df.head()
plt.subplots(figsize=(12, 10))
# Flatten the per-movie genre lists of all the profitable movies into one
# list, so each genre appears once for every profitable movie that has it.
list1 = [genre for movie_genres in profit_data['genres'] for genre in movie_genres]
# Count the occurrences, keep the ten most frequent genres, and order those
# ten from least to most frequent.
# NOTE(review): the ascending order is presumably for the chart drawn
# afterwards — the plotting code is not visible here.
genre_count_series = pd.Series(list1).value_counts().iloc[:10].sort_values(ascending=True)
# output looks like