Last active
December 13, 2017 12:30
-
-
Save prakhar21/1fe87229f99c74d666e07d066b7933a7 to your computer and use it in GitHub Desktop.
Swiggy Account Analysis
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
""" | |
@uthor: Prakhar Mishra | |
date: Dec, 13 2017 | |
desc: Read URL for more information on the same. | |
""" | |
import os | |
import csv | |
import time | |
from bs4 import BeautifulSoup | |
import matplotlib.pyplot as plt | |
from selenium import webdriver | |
import re | |
import math | |
from collections import Counter | |
from nltk.util import ngrams | |
# Login credentials. Export SWIGGY_PHONENUMBER / SWIGGY_PASSWORD instead of
# editing this file so real credentials never end up in version control; the
# placeholder defaults below are the original hard-coded values.
PHONENUMBER = os.environ.get("SWIGGY_PHONENUMBER", "number")
PASSWORD = os.environ.get("SWIGGY_PASSWORD", "password")

# Landing page the crawler opens first.
URL = "https://www.swiggy.com/"

# CSV file written by crawl() and read back by load().
FILE = 'data.csv'
def crawl():
    '''
    Crawl order data from the website and store it in FILE as CSV.

    Opens Firefox through Selenium, logs in with PHONENUMBER / PASSWORD and
    navigates to the Account page. The steps that actually collect the
    orders are still TODO, so at the moment no rows are gathered and FILE is
    left untouched.

    Returns None. The browser is always closed, even when a step fails.
    '''
    import sys

    # find_element(By...) works on both old and current Selenium releases,
    # whereas the find_element_by_* helpers no longer exist in Selenium 4.3+.
    from selenium.webdriver.common.by import By

    # Rows to be written to FILE; stays empty until the TODO steps below
    # are implemented.
    data = []

    driver = webdriver.Firefox()
    try:
        driver.get(URL)

        # Login popup
        driver.find_element(By.LINK_TEXT, 'Login').click()

        # Login
        # Keys are sent one at a time with short pauses so that the site is
        # less likely to recognize the session as a bot.
        pnumber = driver.find_element(By.ID, 'phoneNumberField')
        for num in PHONENUMBER:
            time.sleep(0.2)
            pnumber.send_keys(num)
        time.sleep(1.5)

        passwd = driver.find_element(By.ID, 'passwordField')
        for pss in PASSWORD:
            time.sleep(0.2)
            passwd.send_keys(pss)
        time.sleep(1.5)

        driver.find_element(
            By.XPATH,
            '/html/body/div/div[1]/div/div/div[2]/login/div/div[4]/div/div/div[2]/form/div[3]/button'
        ).click()
        time.sleep(5)

        # Go to my account
        header = driver.find_element(By.XPATH, '//*[@id="header-menu"]')
        header.click()
        account = driver.find_element(By.LINK_TEXT, 'Account')
        account.click()

        # Go to my orders
        # TODO

        # Open full list first till [Load More...] is no more visible
        # TODO

        # Get RestaurantName, DateTime, PaymentMode, GrandTotal, ItemName, Taxes, Address
        # TODO (append one list per order to `data`)
    finally:
        # Always release the browser, even when a step above fails.
        driver.quit()

    # Write to file. Skipped when nothing was collected so that an existing
    # FILE is not truncated to an empty file.
    # NOTE(review): load() skips the first row as a header -- make sure the
    # first record written here is a header row once scraping is implemented.
    if data:
        # Python 2's csv module wants binary mode; Python 3's wants text
        # mode with newline=''.
        if sys.version_info[0] < 3:
            out = open(FILE, 'wb')
        else:
            out = open(FILE, 'w', newline='')
        with out:
            csv.writer(out).writerows(data)
    return
def load():
    '''
    Load the crawled rows from FILE into memory.

    The first row of the file is treated as a header and skipped.

    Returns a list of rows, each row being a list of strings. An empty or
    header-only file yields an empty list.
    '''
    import sys

    # Python 2's csv module wants binary mode; Python 3's wants text mode
    # with newline=''.
    if sys.version_info[0] < 3:
        f = open(FILE, 'rb')
    else:
        f = open(FILE, 'r', newline='')

    # `with` closes the file even if parsing fails part-way through.
    with f:
        reader = csv.reader(f)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        return list(reader)
def similarity(v1, v2):
    '''
    Cosine similarity between two sparse count vectors.

    v1 and v2 are mappings (e.g. Counter objects) from a feature to its
    numeric weight. Returns a float; 0.0 is returned when either vector has
    zero magnitude.
    '''
    # Only features present in both vectors contribute to the dot product.
    shared = set(v1.keys()) & set(v2.keys())
    dot = sum(v1[key] * v2[key] for key in shared)

    norm1 = math.sqrt(sum(weight ** 2 for weight in v1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in v2.values()))
    magnitude = norm1 * norm2

    if magnitude:
        return float(dot) / magnitude
    return 0.0
def shingles(s, n=3):
    '''
    Vectorize the string as a bag of overlapping character n-grams.

    s -- the string to vectorize (any iterable of strings also works).
    n -- shingle length; defaults to 3, the size originally hard-coded.

    Returns a Counter mapping each n-character shingle to the number of
    times it occurs. Inputs shorter than n produce an empty Counter.

    Raises ValueError if n is smaller than 1.
    '''
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Materialize once so that arbitrary iterables can be windowed by index.
    items = list(s)
    return Counter(
        ''.join(items[start:start + n])
        for start in range(len(items) - n + 1)
    )
def smartsimilarity(data=None, threshold=0.65):
    '''
    Group near-duplicate strings by the cosine similarity of their shingles.

    data      -- iterable of strings to cluster. When omitted, the built-in
                 sample list of dish names is used (the original behavior).
    threshold -- minimum similarity score (inclusive) for two strings to be
                 considered neighbours; defaults to 0.65.

    Returns a list of clusters, each cluster being a list of the original
    strings. Duplicate clusters are collapsed; the order of the clusters is
    not specified. A string whose shingle vector is empty scores 0.0 even
    against itself and therefore appears in no cluster.
    '''
    if data is None:
        data = ["Poori Chole", "Poori Chola", "Chola Puri", "Rajma Chawal", "Rajma Rice", "Chawal Rajma"]
    else:
        # Materialize so the input can be traversed more than once.
        data = list(data)

    # index -> original string, used to map clusters back at the end
    lookup = dict(enumerate(data))
    data_shingles = [shingles(d) for d in data]

    # Merging - [Level 1]
    # For every item, collect the indices of all items (itself included)
    # whose similarity reaches the threshold.
    final = {}
    for s1, first in enumerate(data_shingles):
        for s2, second in enumerate(data_shingles):
            if similarity(first, second) >= threshold:
                final.setdefault(s1, []).append(s2)

    # Merging - [Level 2]
    # An item similar only to itself forms its own cluster; otherwise the
    # neighbour lists of each of its neighbours are taken as clusters.
    clusters = []
    for k, v in final.items():
        if len(v) == 1:
            clusters.append(v)
        else:
            for neig in v:
                if neig != k:
                    clusters.append(final[neig])

    # Drop duplicate clusters (tuples are hashable, lists are not).
    unique_data = set(tuple(cluster) for cluster in clusters)

    # Remap from index to string
    return [[lookup[i] for i in cluster] for cluster in unique_data]
def top5restaurants(data):
    '''
    Count how often each restaurant (column 0 of every row) occurs.

    Despite the name, a Counter over all restaurants is returned; call
    .most_common(5) on the result to get the five most frequent ones.
    '''
    return Counter(row[0] for row in data)
def modeOfpayment(data):
    '''
    Count how often each payment mode (column 1 of every row) occurs.

    Returns a Counter mapping payment mode to number of rows.
    '''
    return Counter(row[1] for row in data)
def timeline(data):
    '''
    Count how many rows share each value of column 2.

    Returns a Counter mapping the column-2 value (presumably the order
    date -- confirm against the CSV layout) to number of rows.
    '''
    return Counter(row[2] for row in data)
def delivery(data):
    '''
    Count how often each delivery location (column 3 of every row) occurs.

    Returns a Counter mapping location to number of rows.
    '''
    return Counter(row[3] for row in data)
def extracharges(data):
    '''
    Sum the extra costs (column 4 of every row, parsed as float).

    Returns the total; 0 when there are no rows.
    '''
    return sum(float(row[4]) for row in data)
def totalamount(data):
    '''
    Sum the order costs (column 5 of every row, parsed as float).

    Returns the total; 0 when there are no rows.
    '''
    return sum(float(row[5]) for row in data)
def analyze(data):
    '''
    Run the individual analyses over the loaded rows.

    Returns a 5-tuple:
    (clusters, restaurant counts, payment-mode counts, date counts,
    delivery-address counts).

    extracharges() and totalamount() are not part of the result yet.
    '''
    # smartsimilarity() is called without arguments; it does not receive
    # `data`.
    return (
        smartsimilarity(),
        top5restaurants(data),
        modeOfpayment(data),
        timeline(data),
        delivery(data),
    )
def visualize():
    '''
    Visualize the analysis results (placeholder; nothing is drawn yet).
    '''
    # TODO: plot the analysis results
    return None
def main():
    '''
    Load the stored data and print the analysis results.

    The crawl() and visualize() steps are deliberately not invoked here,
    matching the original flow where both calls were disabled.

    Raises RuntimeError if the data file cannot be read or parsed. Errors
    raised by analyze() propagate unchanged.
    '''
    # crawl() is currently disabled.

    try:
        data = load()
    except (IOError, OSError, csv.Error) as err:
        # Raising a plain string is itself a TypeError and hides the real
        # cause, so raise a proper exception that carries the reason.
        raise RuntimeError("Failed to load data from file to memory: %s" % err)

    # With a single argument this form prints the same text on Python 2
    # and Python 3.
    print(analyze(data))

    # visualize() is currently disabled.
# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment