Skip to content

Instantly share code, notes, and snippets.

@benspargo89
Created April 21, 2018 21:46
Show Gist options
  • Save benspargo89/7305772aee24861fff08608c70b1a19c to your computer and use it in GitHub Desktop.
Save benspargo89/7305772aee24861fff08608c70b1a19c to your computer and use it in GitHub Desktop.
Yelp Data Manipulation
import os
import pandas as pd
import datetime
def waiting(text):
if ' wait ' in text.lower() or ' waiting' in text.lower():
return 1
else:
return 0
NoWaitDates = {'Clinton Street Baking Company' : datetime.datetime(2015, 5, 1),'Penelope': datetime.datetime(2016, 3, 1), 'Russian River Brewing': datetime.datetime(2017, 11, 1), 'Soul Gastrolounge': datetime.date(2016, 11, 15),
'The Farmhouse Tap & Grill': datetime.datetime(2016, 9, 1),'The Magnolia Pancake Haus': datetime.datetime(2017, 6, 1), 'The Pharmacy': datetime.datetime(2015, 3, 1), 'Waterman’s Surfside Grille': datetime.datetime(2014, 4, 1)}
excel_files = []
for root, dirs, files in os.walk("."):
for filename in files:
if filename[-4:] == '.xls':
excel_files.append(filename)
companies = []
for company in excel_files:
companies.append(company[:-4])
all_data = pd.DataFrame(columns=['Rating', 'Date', 'Review', 'Company'])
for N in range(len(excel_files)):
df = pd.read_excel(excel_files[N])
df['Company'] = excel_files[N][:-4]
all_data = pd.concat([all_data, df])
all_data['Date'] = pd.to_datetime(all_data['Date'], errors='coerce')
all_data['NoWait'] = datetime.datetime(1999, 8, 1)
for company in companies:
## all_data[all_data['Company'] == company]['NoWait'] = NoWaitDates[company]
all_data.loc[all_data['Company'] == company, "NoWait"] = NoWaitDates[company]
all_data['Wait_Mentioned'] = all_data.apply(lambda row: waiting(str(row['Review'])), axis=1)
all_data = all_data[all_data.Date.notnull()]
PercentWaitMentionedBefore = {}
PercentWaitMentionedAfter = {}
NumberWaitMentionedBefore = {}
NumberWaitMentionedAfter = {}
TotalReviewsAfter ={}
TotalReviewsBefore={}
for company in companies:
for i in range(1, 2):
key = company + " " + str(i) + ' Star'
NumberWaitMentionedBefore[key] = all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'].sum()
NumberWaitMentionedAfter[key] = all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'].sum()
TotalReviewsAfter[key] = len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'])
TotalReviewsBefore[key] = len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'])
PercentWaitMentionedBefore[key] = "{0:.1f}%".format(all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'].sum()
/ len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'])*100)
PercentWaitMentionedAfter[key] = "{0:.1f}%".format(all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'].sum()
/ len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'])*100)
print(PercentWaitMentionedBefore)
print('\n')
print(PercentWaitMentionedAfter)
BM = pd.DataFrame.from_records([NumberWaitMentionedBefore]).unstack()
BT = pd.DataFrame.from_records([TotalReviewsBefore]).unstack()
AM = pd.DataFrame.from_records([NumberWaitMentionedAfter]).unstack()
AT = pd.DataFrame.from_records([TotalReviewsAfter]).unstack()
sum_results = pd.DataFrame(data=[BM, BT, AM, AT]).unstack().reset_index(level=1, drop=True)
sum_results.index.set_levels(['Before NoWait Mentioned', 'Before NoWait Total','After NoWait Mentioned', 'After NoWait Total'],level=1,inplace=True)
before = pd.DataFrame.from_records([PercentWaitMentionedBefore]).unstack()
after = pd.DataFrame.from_records([PercentWaitMentionedAfter]).unstack()
results = pd.DataFrame(data=[before,after]).unstack().reset_index(level=1, drop=True)
results.index.set_levels(['Before NoWait','After NoWait'],level=1,inplace=True)
results.to_excel('results.xlsx')
all_data.to_excel('allrevies.xlsx')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment