Harshit Tyagi harshitcodes

## answer_7.py
# Total profit per release year: sum the 'profit' column over all movies
# that share the same 'release_year'.
profits_year = (
    movies_df
    .groupby('release_year')['profit']
    .sum()
)

# New figure; figsize is given as (width, height).
plt.figure(dpi=130, figsize=(12, 6))

# Caption for the x-axis.
plt.xlabel('Release Year of Movies in the data set', fontsize=12)
# Caption for the y-axis.
plt.ylabel('Profits earned by Movies', fontsize=12)
# Title of the line plot.
# NOTE(review): the title/plot calls are not present in this excerpt.

## answer_6.py
# Movies whose average vote is 7 or higher.

movies_df.loc[movies_df['vote_average'].ge(7.0)]

## answer_5.py
# Runtime analysis.
# Mean of the 'runtime' column over all movies.
movies_df.loc[:, 'runtime'].mean()
# Compare the movies at the extremes of the 'runtime' column.
find_min_max_in('runtime')

## answer_4.py
# To find the most talked-about movies, sort the dataframe on the
# 'popularity' column (highest first) and keep the top five rows.
# The sort key was previously 'budget', which merely reproduced the
# "most expensive movies" answer instead of ranking by popularity.
popular_movies_df = movies_df.sort_values(by='popularity', ascending=False).head()
popular_movies_df

## answer_3.py
# Profit is what remains of the generated revenue once the budget has been
# deducted; the most profitable movies are the ones maximising this value.

movies_df['profit'] = movies_df['revenue'].sub(movies_df['budget'])
cols = ['budget',
         'profit',
         'revenue',
         'genres',
         'id',
         'popularity',

## answer_2.py
def find_min_max_in(col):
    """
    The function takes in a column and returns the top 5
    and bottom 5 movies dataframe in that column.

    args:
        col: string - column name
    return:
        info_df: dataframe - final 5 movies dataframe
    """

## answer_1.py
# Answer to question #1.
# Budget is used as the indicator of expense, so the most expensive movies
# are the rows with the largest 'budget' values.

expensive_movies_df = (
    movies_df
    .sort_values('budget', ascending=False)
    .head(5)
)
expensive_movies_df

## cleaning_6.py
# we see that there are columns which are in JSON format,
# let's flatten this JSON data into easily interpretable lists

def parse_col_json(column, key):
    """
    Args:
        column: string
            name of the column to be processed.
        key: string
            name of the dictionary key which needs to be extracted

## cleaning_5.py
# Change the data type of the columns listed below to 64-bit integers.
change_cols = ['budget', 'revenue']
# astype converts each whole column in one vectorized step; the element-wise
# DataFrame.applymap used here before is deprecated since pandas 2.1.
movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
movies_df.dtypes

## cleaning_4.py
# Convert the release_date column into datetime values.

movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
# The last question needs the release year, so extract it from every release date.
movies_df['release_year'] = movies_df['release_date'].dt.year
movies_df.head(5)
	# Total profit per release year: sum the 'profit' column over all movies
	# that share the same 'release_year'.
	profits_year = (
		movies_df
		.groupby('release_year')['profit']
		.sum()
	)

	# New figure; figsize is given as (width, height).
	plt.figure(dpi=130, figsize=(12, 6))

	# Caption for the x-axis.
	plt.xlabel('Release Year of Movies in the data set', fontsize=12)
	# Caption for the y-axis.
	plt.ylabel('Profits earned by Movies', fontsize=12)
	# Title of the line plot.
	# NOTE(review): the title/plot calls are not present in this excerpt.
	# Movies whose average vote is 7 or higher.

	movies_df.loc[movies_df['vote_average'].ge(7.0)]
	# Runtime analysis.
	# Mean of the 'runtime' column over all movies.
	movies_df.loc[:, 'runtime'].mean()
	# Compare the movies at the extremes of the 'runtime' column.
	find_min_max_in('runtime')
	# To find the most talked-about movies, sort the dataframe on the
	# 'popularity' column (highest first) and keep the top five rows.
	# The sort key was previously 'budget', which merely reproduced the
	# "most expensive movies" answer instead of ranking by popularity.
	popular_movies_df = movies_df.sort_values(by='popularity', ascending=False).head()
	popular_movies_df
	# Profit is what remains of the generated revenue once the budget has been
	# deducted; the most profitable movies are the ones maximising this value.

	movies_df['profit'] = movies_df['revenue'].sub(movies_df['budget'])
	cols = ['budget',
	'profit',
	'revenue',
	'genres',
	'id',
	'popularity',
	def find_min_max_in(col):
	"""
	The function takes in a column and returns the top 5
	and bottom 5 movies dataframe in that column.

	args:
	col: string - column name
	return:
	info_df: dataframe - final 5 movies dataframe
	"""
	# Answer to question #1.
	# Budget is used as the indicator of expense, so the most expensive movies
	# are the rows with the largest 'budget' values.

	expensive_movies_df = (
		movies_df
		.sort_values('budget', ascending=False)
		.head(5)
	)
	expensive_movies_df
	# we see that there are columns which are in JSON format,
	# let's flatten this JSON data into easily interpretable lists

	def parse_col_json(column, key):
	"""
	Args:
	column: string
	name of the column to be processed.
	key: string
	name of the dictionary key which needs to be extracted
	# Change the data type of the columns listed below to 64-bit integers.
	change_cols = ['budget', 'revenue']
	# astype converts each whole column in one vectorized step; the element-wise
	# DataFrame.applymap used here before is deprecated since pandas 2.1.
	movies_df[change_cols] = movies_df[change_cols].astype(np.int64)
	movies_df.dtypes
	# Convert the release_date column into datetime values.

	movies_df['release_date'] = pd.to_datetime(movies_df['release_date'])
	# The last question needs the release year, so extract it from every release date.
	movies_df['release_year'] = movies_df['release_date'].dt.year
	movies_df.head(5)