# Harshit Tyagi harshitcodes

## answer_2.py
def find_min_max_in(col):
    """
    Return the top 5 and bottom 5 movies of a column as a dataframe.

    NOTE(review): the function body is not visible in this excerpt, so this
    description restates the original docstring rather than the code - confirm
    against the implementation.

    args:
        col: string - column name
    return:
        info_df: dataframe - final 5 movies dataframe
    """

## answer_3.py
# Profit is what a movie earned beyond its cost: revenue minus budget.
# It is stored as a new 'profit' column so the movies can be ranked by it.

movies_df['profit'] = movies_df['revenue'].sub(movies_df['budget'])
cols = ['budget',
         'profit',
         'revenue',
         'genres',
         'id',
         'popularity',

## answer_4.py
# To find the most talked about movies, sort the dataframe on the 'popularity'
# column (highest first) and keep the top five rows.
# Sorting on 'budget' here would rank the most expensive movies, not the most
# popular ones.
popular_movies_df = movies_df.sort_values(by='popularity', ascending=False).head()
# head() above already limits the frame to five rows, so display it as-is.
popular_movies_df

## answer_5.py
# in terms of runtime
# Average runtime of movies (mean of the 'runtime' column)
# NOTE(review): if this runs in the same notebook cell as the call below, only
# the last expression is displayed and the mean is computed but never shown -
# confirm how the cells are split.
movies_df['runtime'].mean()
# comparison: run the find_min_max_in helper on the 'runtime' column
# (per its docstring it returns the top 5 and bottom 5 movies for that column)
find_min_max_in('runtime')

## answer_6.py
# movies rated 7.0 or above (the comparison is inclusive: vote_average >= 7.0)

movies_df[movies_df['vote_average'] >= 7.0]

## answer_7.py
# Sum the 'profit' column within each release year.
profits_year = movies_df.groupby('release_year')['profit'].agg('sum')

# Start a new figure: figsize is (width, height), rendered at 130 dpi.
plt.figure(dpi=130, figsize=(12, 6))

# Axis labels: release year on the x-axis, summed profit on the y-axis.
plt.xlabel('Release Year of Movies in the data set', fontsize=12)
plt.ylabel('Profits earned by Movies', fontsize=12)
# title of the line plot (the title and plotting calls are not part of this excerpt)

## answer_8.py
# Most profitable year from the given dataset.
# profits_year holds the profit summed per release_year, so idxmax() returns
# the index label (the release year) that has the largest total.

profits_year.idxmax()

## profit_data.py
# Keep only the movies whose profit is $50M or more.
profit_data = movies_df[movies_df['profit'] >= 50000000]

# Renumber the kept rows 1..N in one step, so the index starts at 1 instead
# of 0.
profit_data.index = range(1, len(profit_data) + 1)

## genre.py
# Format the 'genres' column into a Python list of genre names per movie.
# Assumes each cell holds a stringified list such as
# "['Action', 'Science Fiction']" - TODO confirm against the dataset.
#
# Steps: drop the surrounding brackets, drop the quotes, then split on commas.
# The separator pattern swallows the whitespace around each comma instead of
# deleting every space in the cell, so multi-word genres keep their inner
# space ("Science Fiction" rather than "ScienceFiction").
# pandas treats a split pattern longer than one character as a regular
# expression, so no explicit regex flag is passed.
#
# NOTE: run this once on the raw string column; the .str accessor expects
# strings, so re-running it on the already-split lists will not work.
movies_df['genres'] = (
    movies_df['genres']
    .str.strip('[] ')
    .str.replace("'", '')
    .str.split(r'\s*,\s*')
)
movies_df.head()

## seaborn_visualisation.py
plt.subplots(figsize=(12, 10))

# Flatten the per-movie genre lists of all the profitable movies into one list.
list1 = [genre for genres in profit_data['genres'] for genre in genres]

# Count every genre, keep the 10 most frequent, then order those from the
# least to the most frequent.
genre_count_series = pd.Series(list1).value_counts().head(10).sort_values(ascending=True)
# output looks like
	def find_min_max_in(col):
		"""
		The function takes in a column and returns the top 5
		and bottom 5 movies dataframe in that column.

		args:
			col: string - column name
		return:
			info_df: dataframe - final 5 movies dataframe
		"""
	# to find the most profitable movies, we need to find who made the most
	# amount after deducting the budget from the revenue generated.

	movies_df['profit'] = movies_df['revenue'] - movies_df['budget']
	cols = ['budget',
	'profit',
	'revenue',
	'genres',
	'id',
	'popularity',
	# To find the most talked about movies, sort the dataframe on the 'popularity'
	# column (highest first) and keep the top five rows.
	# Sorting on 'budget' here would rank the most expensive movies, not the most
	# popular ones.
	popular_movies_df = movies_df.sort_values(by='popularity', ascending=False).head()
	# head() above already limits the frame to five rows, so display it as-is.
	popular_movies_df
	# in terms of runtime
	# Average runtime of movies
	movies_df['runtime'].mean()
	# comparison
	find_min_max_in('runtime')
	# movies rated 7.0 or above (the comparison is inclusive: vote_average >= 7.0)

	movies_df[movies_df['vote_average'] >= 7.0]
	profits_year = movies_df.groupby('release_year')['profit'].sum()

	#figure size(width, height)
	plt.figure(figsize=(12,6), dpi = 130)

	#on x-axis
	plt.xlabel('Release Year of Movies in the data set', fontsize = 12)
	#on y-axis
	plt.ylabel('Profits earned by Movies', fontsize = 12)
	#title of the line plot
	# Most profitable year from the given dataset.

	profits_year.idxmax()
	#selecting the movies having profit $50M or more
	profit_data = movies_df[movies_df['profit'] >= 50000000]

	#reindexing new data
	profit_data.index = range(len(profit_data))

	#we will start from 1 instead of 0
	profit_data.index = profit_data.index + 1
	# Format the 'genres' column into a Python list of genre names per movie.
	# Assumes each cell holds a stringified list such as
	# "['Action', 'Science Fiction']" - TODO confirm against the dataset.
	#
	# Steps: drop the surrounding brackets, drop the quotes, then split on commas.
	# The separator pattern swallows the whitespace around each comma instead of
	# deleting every space in the cell, so multi-word genres keep their inner
	# space ("Science Fiction" rather than "ScienceFiction").
	# pandas treats a split pattern longer than one character as a regular
	# expression, so no explicit regex flag is passed.
	#
	# NOTE: run this once on the raw string column; the .str accessor expects
	# strings, so re-running it on the already-split lists will not work.
	movies_df['genres'] = (
		movies_df['genres']
		.str.strip('[] ')
		.str.replace("'", '')
		.str.split(r'\s*,\s*')
	)
	movies_df.head()
	plt.subplots(figsize=(12,10))
	list1=[]

	# Extend the list with every movie's genres to collect all the genres of
	# all the profitable movies. The extend() call must be indented under the
	# for statement to form the loop body.
	for i in profit_data['genres']:
		list1.extend(i)


	# Count every genre, keep the 10 most frequent, then order those from the
	# least to the most frequent.
	genre_count_series = pd.Series(list1).value_counts()[:10].sort_values(ascending=True)
	# output looks like