Created
April 21, 2018 21:46
-
-
Save benspargo89/7305772aee24861fff08608c70b1a19c to your computer and use it in GitHub Desktop.
Yelp Data Manipulation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import pandas as pd | |
import datetime | |
def waiting(text): | |
if ' wait ' in text.lower() or ' waiting' in text.lower(): | |
return 1 | |
else: | |
return 0 | |
NoWaitDates = {'Clinton Street Baking Company' : datetime.datetime(2015, 5, 1),'Penelope': datetime.datetime(2016, 3, 1), 'Russian River Brewing': datetime.datetime(2017, 11, 1), 'Soul Gastrolounge': datetime.date(2016, 11, 15), | |
'The Farmhouse Tap & Grill': datetime.datetime(2016, 9, 1),'The Magnolia Pancake Haus': datetime.datetime(2017, 6, 1), 'The Pharmacy': datetime.datetime(2015, 3, 1), 'Waterman’s Surfside Grille': datetime.datetime(2014, 4, 1)} | |
excel_files = [] | |
for root, dirs, files in os.walk("."): | |
for filename in files: | |
if filename[-4:] == '.xls': | |
excel_files.append(filename) | |
companies = [] | |
for company in excel_files: | |
companies.append(company[:-4]) | |
all_data = pd.DataFrame(columns=['Rating', 'Date', 'Review', 'Company']) | |
for N in range(len(excel_files)): | |
df = pd.read_excel(excel_files[N]) | |
df['Company'] = excel_files[N][:-4] | |
all_data = pd.concat([all_data, df]) | |
all_data['Date'] = pd.to_datetime(all_data['Date'], errors='coerce') | |
all_data['NoWait'] = datetime.datetime(1999, 8, 1) | |
for company in companies: | |
## all_data[all_data['Company'] == company]['NoWait'] = NoWaitDates[company] | |
all_data.loc[all_data['Company'] == company, "NoWait"] = NoWaitDates[company] | |
all_data['Wait_Mentioned'] = all_data.apply(lambda row: waiting(str(row['Review'])), axis=1) | |
all_data = all_data[all_data.Date.notnull()] | |
PercentWaitMentionedBefore = {} | |
PercentWaitMentionedAfter = {} | |
NumberWaitMentionedBefore = {} | |
NumberWaitMentionedAfter = {} | |
TotalReviewsAfter ={} | |
TotalReviewsBefore={} | |
for company in companies: | |
for i in range(1, 2): | |
key = company + " " + str(i) + ' Star' | |
NumberWaitMentionedBefore[key] = all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'].sum() | |
NumberWaitMentionedAfter[key] = all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'].sum() | |
TotalReviewsAfter[key] = len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned']) | |
TotalReviewsBefore[key] = len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned']) | |
PercentWaitMentionedBefore[key] = "{0:.1f}%".format(all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'].sum() | |
/ len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] < all_data['NoWait'])]['Wait_Mentioned'])*100) | |
PercentWaitMentionedAfter[key] = "{0:.1f}%".format(all_data[(all_data['Rating']==i) & (all_data['Company']==company) & (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'].sum() | |
/ len(all_data[(all_data['Rating']==i) & (all_data['Company']==company)& (all_data['Date'] > all_data['NoWait'])]['Wait_Mentioned'])*100) | |
print(PercentWaitMentionedBefore) | |
print('\n') | |
print(PercentWaitMentionedAfter) | |
BM = pd.DataFrame.from_records([NumberWaitMentionedBefore]).unstack() | |
BT = pd.DataFrame.from_records([TotalReviewsBefore]).unstack() | |
AM = pd.DataFrame.from_records([NumberWaitMentionedAfter]).unstack() | |
AT = pd.DataFrame.from_records([TotalReviewsAfter]).unstack() | |
sum_results = pd.DataFrame(data=[BM, BT, AM, AT]).unstack().reset_index(level=1, drop=True) | |
sum_results.index.set_levels(['Before NoWait Mentioned', 'Before NoWait Total','After NoWait Mentioned', 'After NoWait Total'],level=1,inplace=True) | |
before = pd.DataFrame.from_records([PercentWaitMentionedBefore]).unstack() | |
after = pd.DataFrame.from_records([PercentWaitMentionedAfter]).unstack() | |
results = pd.DataFrame(data=[before,after]).unstack().reset_index(level=1, drop=True) | |
results.index.set_levels(['Before NoWait','After NoWait'],level=1,inplace=True) | |
results.to_excel('results.xlsx') | |
all_data.to_excel('allrevies.xlsx') | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment