# ShambhaviPataskar/Analysis.py

## Analysis.py
#!/usr/bin/env python
# coding: utf-8

# # Sales Analysis

# ### import necessary libraries

# In[1]:


import pandas as pd
import os


# #### Task #1: merging 12 months of sales data into a single file

# In[2]:


# Merge every monthly CSV in ./Sales_Data into a single DataFrame.
# Only *.csv entries are read so that stray directory entries (for example
# notebook checkpoint folders) do not break the merge. os.listdir returns
# names in arbitrary order, so they are sorted to keep the output reproducible.
files = sorted(
    file for file in os.listdir('./Sales_Data') if file.lower().endswith('.csv')
)

# Read each file once and concatenate in a single call: calling pd.concat
# inside the loop re-copies the accumulated frame on every iteration.
monthly_frames = [pd.read_csv(os.path.join('./Sales_Data', file)) for file in files]

# pd.concat raises on an empty list, so fall back to an empty frame.
all_months_data = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

# Show the merged result once instead of dumping it after every file.
print(all_months_data)

all_months_data.to_csv("all_data.csv", index=False)


# # Read in the updated dataframe

# all_data.head(100)

# In[3]:


# Load the merged sales file from disk.
all_data = pd.read_csv("all_data.csv")
all_data.head()


# ### Clean up the data

# #### Drop rows of NaN

# In[4]:


# Keep the rows holding at least one missing value around for inspection.
has_missing = all_data.isnull().any(axis="columns")
nan_df = all_data.loc[has_missing]
nan_df.head()

# Discard only the rows in which every column is missing.
all_data = all_data.dropna(axis="index", how="all")
all_data.head()


# #### Find 'Or' and delete it

# In[5]:


# Keep every row whose 'Order Date' does not begin with 'Or'.
# NOTE(review): presumably these are CSV header lines repeated by the
# merge - confirm against the data.
is_data_row = all_data['Order Date'].str.slice(0, 2).ne('Or')
all_data = all_data.loc[is_data_row]


# ##### Convert columns to the correct type

# In[6]:


# pd.to_numeric picks the numeric dtype for each column from its values.
for numeric_column in ('Quantity Ordered', 'Price Each'):
    all_data[numeric_column] = pd.to_numeric(all_data[numeric_column])

all_data.head()


# # Augment data with additional columns

# #### Task #2: Add month column

# In[7]:


# The first two characters of 'Order Date' are taken as the month number.
all_data['Month'] = all_data['Order Date'].str.slice(0, 2).astype('int32')
all_data.head()


# #### Task #3: Add a Sales column

# In[8]:


# Sales of a row = quantity ordered times unit price.
all_data['Sales'] = all_data['Quantity Ordered'].mul(all_data['Price Each'])
all_data.head()


# #### Add a city column

# In[9]:


# Let's use .apply()
def get_city(address):
    """Return the city field of a comma-separated purchase address.

    Args:
        address: Address string whose second comma-separated field is the
            city, e.g. ``"917 1st St, Dallas, TX 75001"``.

    Returns:
        The second comma-separated field with surrounding whitespace removed.

    Raises:
        IndexError: If ``address`` contains no comma.
    """
    # The raw field keeps the space that follows the comma (" Dallas");
    # strip it so labels built from the city do not start with a blank.
    return address.split(',')[1].strip()

def get_state(address):
    """Return the state token of a comma-separated purchase address.

    Args:
        address: Address string whose third comma-separated field starts
            with the state, e.g. ``"917 1st St, Dallas, TX 75001"``.

    Returns:
        The first whitespace-delimited token of the third field.

    Raises:
        IndexError: If ``address`` has fewer than three comma-separated
            fields or the third field is blank.
    """
    # Whitespace-splitting tolerates a missing or repeated space after the
    # comma, where split(' ')[1] would return the ZIP code or an empty string.
    return address.split(',')[2].split()[0]

# Build a "<city> (<state>)" label for every purchase address.
all_data['City'] = [
    f"{get_city(purchase_address)} ({get_state(purchase_address)})"
    for purchase_address in all_data['Purchase Address']
]

all_data.head()


# ##### Q1) What was the best month for sales? How much was earned that month?

# In[38]:


# Total every numeric column per month. numeric_only=True skips the text
# columns, which pandas >= 2.0 no longer drops by default.
results = all_data.groupby('Month').sum(numeric_only=True)


# In[39]:


import matplotlib.pyplot as plt

# Take the x positions from the grouped index rather than a fixed
# range(1, 13): plt.bar raises on a length mismatch whenever the data
# does not cover all twelve months.
months = list(results.index)

plt.bar(months, results['Sales'], color='crimson')

plt.xticks(months)
plt.ylabel('Sales in USD ($)')
plt.xlabel('Month number')

plt.show()

#December was the best month for sales


# ##### Q2) What city had the highest number of sales?

# In[41]:


# Total every numeric column per city. numeric_only=True skips the text
# columns, which pandas >= 2.0 no longer drops by default.
results = all_data.groupby('City').sum(numeric_only=True)
results


# In[42]:


import matplotlib.pyplot as plt

# Reuse the grouped index for the labels: they are guaranteed to line up
# with results['Sales'] and the frame is not grouped a second time.
cities = list(results.index)

plt.bar(cities, results['Sales'], color='dodgerblue')

plt.xticks(cities, rotation="vertical", size=8)
plt.ylabel('Sales in USD ($)')
plt.xlabel('City Name')

plt.show()

#Answer: San Francisco(CA)


# ##### Q3) What time should we display advertisements to maximize likelihood of customers buying product?

# In[14]:


# Parse 'Order Date' once, then derive the hour and minute columns from it.
order_timestamps = pd.to_datetime(all_data['Order Date'])
all_data['Order Date'] = order_timestamps
all_data['Hour'] = order_timestamps.dt.hour
all_data['Minute'] = order_timestamps.dt.minute


# In[15]:


all_data.head()


# In[25]:


# Count the rows recorded in each hour as a single series. Passing the
# whole groupby(...).count() frame to plt.plot drew one line per column.
orders_per_hour = all_data.groupby('Hour').size()
hours = list(orders_per_hour.index)

plt.plot(hours, orders_per_hour, color='darkorange')
plt.xticks(hours)
plt.grid()
plt.xlabel("Time (in hours)")
plt.ylabel('Number of Orders')

plt.show()

#Answer: Around 12pm (12) and/or 7pm (19)


# ##### Q4) What products are most often sold together?

# In[43]:


# Keep only the rows whose Order ID appears more than once. The explicit
# copy() makes df an independent frame, so adding the 'Grouped' column
# below does not trigger SettingWithCopyWarning.
df = all_data[all_data['Order ID'].duplicated(keep=False)].copy()

# Join the products of each order into one comma-separated string.
df['Grouped'] = df.groupby('Order ID')['Product'].transform(','.join)

# One row per order is enough once the products are joined.
df = df[['Order ID', 'Grouped']].drop_duplicates()

df.head()


# In[18]:


from itertools import combinations
from collections import Counter

# Tally how often each group of three products appears in the same order.
count = Counter()

for row in df['Grouped']:
    # Sort first so the same set of products always forms the same tuple,
    # whatever order the items were listed in within the order.
    row_list = sorted(row.split(','))
    # Counter.update counts the items of any iterable directly; no
    # throwaway Counter per row is needed.
    count.update(combinations(row_list, 3))

for key, value in count.most_common(10):
    print(key, value)


# ##### Q5) What product sold the most? Why do you think it sold the most?

# In[19]:


all_data.head()


# In[30]:


product_group = all_data.groupby('Product')

# Select the column before summing: summing the whole frame would also try
# to total 'Order Date', which is a datetime column by now and cannot be summed.
quantity_ordered = product_group['Quantity Ordered'].sum()

# Labels come from the summed series so they always line up with the bars.
products = list(quantity_ordered.index)
plt.bar(products, quantity_ordered, color='darkmagenta')

plt.xticks(products, rotation="vertical", size=8)
plt.xlabel('Products Sold')
plt.ylabel('Ordered Quantity')

plt.show()


# In[50]:


# Average only 'Price Each': mean() over the whole frame raises TypeError
# on the text columns in pandas >= 2.0, where numeric_only defaults to False.
prices = all_data.groupby('Product')['Price Each'].mean()


# Bars (left axis) show quantity ordered; the line (right axis) shows mean price.
fig, ax1 = plt.subplots()

ax2 = ax1.twinx()
ax1.bar(products, quantity_ordered, color='green')
# 'b-' already sets the line colour; repeating it through color= makes
# matplotlib warn about a redundant definition.
ax2.plot(products, prices, 'b-')

ax1.set_xlabel('Product Sold')
ax1.set_ylabel('Quantity Ordered', color='g')
ax2.set_ylabel('Price in USD ($)', color='b')
# Rotate and resize the existing tick labels in place; set_xticklabels()
# without fixed tick positions emits a warning.
ax1.tick_params(axis='x', labelrotation=90, labelsize=8)


plt.show()


# ###### Answer: i) According to our hypothesis, if the quantity ordered is high, the price should be low (which means the AAA battery
# ###### pack sold the most because it is cheap). As observed from the graph above, the LG dryer and LG washing machine
# ###### sold the least because of their high price.
# ######              ii) AAA battery packs have a wide range of applications and can be used in many different products, which makes them sell more.

# In[ ]:


#FIN
	#!/usr/bin/env python
	# coding: utf-8

	# # Sales Analysis

	# ### import necessary libraries

	# In[1]:


	import pandas as pd
	import os


	# #### Task #1: merging 12 months of sales data into a single file

	# In[2]:


	# Merge every monthly CSV in ./Sales_Data into a single DataFrame.
	# Only *.csv entries are read so that stray directory entries (for example
	# notebook checkpoint folders) do not break the merge. os.listdir returns
	# names in arbitrary order, so they are sorted to keep the output reproducible.
	files = sorted(
	    file for file in os.listdir('./Sales_Data') if file.lower().endswith('.csv')
	)

	# Read each file once and concatenate in a single call: calling pd.concat
	# inside the loop re-copies the accumulated frame on every iteration.
	monthly_frames = [pd.read_csv(os.path.join('./Sales_Data', file)) for file in files]

	# pd.concat raises on an empty list, so fall back to an empty frame.
	all_months_data = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

	# Show the merged result once instead of dumping it after every file.
	print(all_months_data)

	all_months_data.to_csv("all_data.csv", index=False)


	# # Read in the updated dataframe

	# all_data.head(100)

	# In[3]:


	# Load the merged sales file from disk.
	all_data = pd.read_csv("all_data.csv")
	all_data.head()


	# ### Clean up the data

	# #### Drop rows of NaN

	# In[4]:


	# Keep the rows holding at least one missing value around for inspection.
	has_missing = all_data.isnull().any(axis="columns")
	nan_df = all_data.loc[has_missing]
	nan_df.head()

	# Discard only the rows in which every column is missing.
	all_data = all_data.dropna(axis="index", how="all")
	all_data.head()


	# #### Find 'Or' and delete it

	# In[5]:


	# Keep every row whose 'Order Date' does not begin with 'Or'.
	# NOTE(review): presumably these are CSV header lines repeated by the
	# merge - confirm against the data.
	is_data_row = all_data['Order Date'].str.slice(0, 2).ne('Or')
	all_data = all_data.loc[is_data_row]


	# ##### Convert columns to the correct type

	# In[6]:


	# pd.to_numeric picks the numeric dtype for each column from its values.
	for numeric_column in ('Quantity Ordered', 'Price Each'):
	    all_data[numeric_column] = pd.to_numeric(all_data[numeric_column])

	all_data.head()


	# # Augment data with additional columns

	# #### Task #2: Add month column

	# In[7]:


	# The first two characters of 'Order Date' are taken as the month number.
	all_data['Month'] = all_data['Order Date'].str.slice(0, 2).astype('int32')
	all_data.head()


	# #### Task #3: Add a Sales column

	# In[8]:


	# Sales of a row = quantity ordered times unit price.
	all_data['Sales'] = all_data['Quantity Ordered'].mul(all_data['Price Each'])
	all_data.head()


	# #### Add a city column

	# In[9]:


	# Let's use .apply()
	def get_city(address):
	    """Return the city field of a comma-separated purchase address.

	    Args:
	        address: Address string whose second comma-separated field is the
	            city, e.g. ``"917 1st St, Dallas, TX 75001"``.

	    Returns:
	        The second comma-separated field with surrounding whitespace removed.

	    Raises:
	        IndexError: If ``address`` contains no comma.
	    """
	    # The raw field keeps the space that follows the comma (" Dallas");
	    # strip it so labels built from the city do not start with a blank.
	    return address.split(',')[1].strip()

	def get_state(address):
	    """Return the state token of a comma-separated purchase address.

	    Args:
	        address: Address string whose third comma-separated field starts
	            with the state, e.g. ``"917 1st St, Dallas, TX 75001"``.

	    Returns:
	        The first whitespace-delimited token of the third field.

	    Raises:
	        IndexError: If ``address`` has fewer than three comma-separated
	            fields or the third field is blank.
	    """
	    # Whitespace-splitting tolerates a missing or repeated space after the
	    # comma, where split(' ')[1] would return the ZIP code or an empty string.
	    return address.split(',')[2].split()[0]

	# Build a "<city> (<state>)" label for every purchase address.
	all_data['City'] = [
	    f"{get_city(purchase_address)} ({get_state(purchase_address)})"
	    for purchase_address in all_data['Purchase Address']
	]

	all_data.head()


	# ##### Q1) What was the best month for sales? How much was earned that month?

	# In[38]:


	# Total every numeric column per month. numeric_only=True skips the text
	# columns, which pandas >= 2.0 no longer drops by default.
	results = all_data.groupby('Month').sum(numeric_only=True)


	# In[39]:


	import matplotlib.pyplot as plt

	# Take the x positions from the grouped index rather than a fixed
	# range(1, 13): plt.bar raises on a length mismatch whenever the data
	# does not cover all twelve months.
	months = list(results.index)

	plt.bar(months, results['Sales'], color='crimson')

	plt.xticks(months)
	plt.ylabel('Sales in USD ($)')
	plt.xlabel('Month number')

	plt.show()

	#December was the best month for sales


	# ##### Q2) What city had the highest number of sales?

	# In[41]:


	# Total every numeric column per city. numeric_only=True skips the text
	# columns, which pandas >= 2.0 no longer drops by default.
	results = all_data.groupby('City').sum(numeric_only=True)
	results


	# In[42]:


	import matplotlib.pyplot as plt

	# Reuse the grouped index for the labels: they are guaranteed to line up
	# with results['Sales'] and the frame is not grouped a second time.
	cities = list(results.index)

	plt.bar(cities, results['Sales'], color='dodgerblue')

	plt.xticks(cities, rotation="vertical", size=8)
	plt.ylabel('Sales in USD ($)')
	plt.xlabel('City Name')

	plt.show()

	#Answer: San Francisco(CA)


	# ##### Q3) What time should we display advertisements to maximize likelihood of customers buying product?

	# In[14]:


	# Parse 'Order Date' once, then derive the hour and minute columns from it.
	order_timestamps = pd.to_datetime(all_data['Order Date'])
	all_data['Order Date'] = order_timestamps
	all_data['Hour'] = order_timestamps.dt.hour
	all_data['Minute'] = order_timestamps.dt.minute


	# In[15]:


	all_data.head()


	# In[25]:


	# Count the rows recorded in each hour as a single series. Passing the
	# whole groupby(...).count() frame to plt.plot drew one line per column.
	orders_per_hour = all_data.groupby('Hour').size()
	hours = list(orders_per_hour.index)

	plt.plot(hours, orders_per_hour, color='darkorange')
	plt.xticks(hours)
	plt.grid()
	plt.xlabel("Time (in hours)")
	plt.ylabel('Number of Orders')

	plt.show()

	#Answer: Around 12pm (12) and/or 7pm (19)


	# ##### Q4) What products are most often sold together?

	# In[43]:


	# Keep only the rows whose Order ID appears more than once. The explicit
	# copy() makes df an independent frame, so adding the 'Grouped' column
	# below does not trigger SettingWithCopyWarning.
	df = all_data[all_data['Order ID'].duplicated(keep=False)].copy()

	# Join the products of each order into one comma-separated string.
	df['Grouped'] = df.groupby('Order ID')['Product'].transform(','.join)

	# One row per order is enough once the products are joined.
	df = df[['Order ID', 'Grouped']].drop_duplicates()

	df.head()


	# In[18]:


	from itertools import combinations
	from collections import Counter

	# Tally how often each group of three products appears in the same order.
	count = Counter()

	for row in df['Grouped']:
	    # Sort first so the same set of products always forms the same tuple,
	    # whatever order the items were listed in within the order.
	    row_list = sorted(row.split(','))
	    # Counter.update counts the items of any iterable directly; no
	    # throwaway Counter per row is needed.
	    count.update(combinations(row_list, 3))

	for key, value in count.most_common(10):
	    print(key, value)


	# ##### Q5) What product sold the most? Why do you think it sold the most?

	# In[19]:


	all_data.head()


	# In[30]:


	product_group = all_data.groupby('Product')

	# Select the column before summing: summing the whole frame would also try
	# to total 'Order Date', which is a datetime column by now and cannot be summed.
	quantity_ordered = product_group['Quantity Ordered'].sum()

	# Labels come from the summed series so they always line up with the bars.
	products = list(quantity_ordered.index)
	plt.bar(products, quantity_ordered, color='darkmagenta')

	plt.xticks(products, rotation="vertical", size=8)
	plt.xlabel('Products Sold')
	plt.ylabel('Ordered Quantity')

	plt.show()


	# In[50]:


	# Average only 'Price Each': mean() over the whole frame raises TypeError
	# on the text columns in pandas >= 2.0, where numeric_only defaults to False.
	prices = all_data.groupby('Product')['Price Each'].mean()


	# Bars (left axis) show quantity ordered; the line (right axis) shows mean price.
	fig, ax1 = plt.subplots()

	ax2 = ax1.twinx()
	ax1.bar(products, quantity_ordered, color='green')
	# 'b-' already sets the line colour; repeating it through color= makes
	# matplotlib warn about a redundant definition.
	ax2.plot(products, prices, 'b-')

	ax1.set_xlabel('Product Sold')
	ax1.set_ylabel('Quantity Ordered', color='g')
	ax2.set_ylabel('Price in USD ($)', color='b')
	# Rotate and resize the existing tick labels in place; set_xticklabels()
	# without fixed tick positions emits a warning.
	ax1.tick_params(axis='x', labelrotation=90, labelsize=8)


	plt.show()


	# ###### Answer: i) According to our hypothesis, if the quantity ordered is high, the price should be low (which means the AAA battery
	# ###### pack sold the most because it is cheap). As observed from the graph above, the LG dryer and LG washing machine
	# ###### sold the least because of their high price.
	# ###### ii) AAA battery packs have a wide range of applications and can be used in many different products, which makes them sell more.

	# In[ ]:


	#FIN