Good deal crawler. Source: http://pastebin.com/QNFyu4sa
# -*- coding: utf-8 -*-
# Competitor Intelligence Agent
# License: Creative Commons CC BY-NC-SA
#
# By Michael Gradek, a proud Udacian
# Twitter: @MichaelGradek
#
# Competitor Intelligence Agent aims to collect and display a report with relevant information about Groupon.es, one of the largest Collective Buying sites in Spain (... and in the world!)
# For more information about Collective Buying please refer to: http://en.wikipedia.org/wiki/Group_buying
#
# To keep things simple and easy to understand, my script will only consider Groupon in Spain (groupon.es)
#
# Why is this an interesting thing to do?
# Imagine you are the Business Intelligence director of one of Spain's Collective Buying companies such as Groupalia.com, Letsbonus.com, etc.
# Wouldn't you love to receive a daily report stating what your competition is up to? i.e.: What deals do they have? What's their average discount? What were their sales yesterday? And even how much they are going to sell this year?!?
#
# The goal is to crawl the Groupon.es site, collect all the currently posted deals, and then visit each deal page to fetch all the information that would be relevant to a competitor's Business Intelligence director.
# Once all the deals have been found and saved, the script can print a report showing the performance of each business area, and even forecast yearly sales!
#
# Definitions:
# Deal: Any sort of coupon sold on the site
# Local: Coupons from local merchants such as restaurants, beauty & wellness, services, etc.
# Travel: Coupons for travel such as trips, hotels, etc.
# Shopping: Coupons for products such as phones, devices, watches, etc.
import urllib2
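# NOTE: This is Python 2 code -- urllib2 and the bare print statements below will not run unchanged on Python 3.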
# ----- Begin initialize -----
seed_all = 'http://www.groupon.es/all-deals/oferta-nacional' # This page is only used to get the cities Groupon operates in
seed_barcelona = 'barcelona' # Used only for testing, to reduce the number of pages requested (only Barcelona deals, not the whole country)
cities = [] # List containing the cities in which Groupon.es operates and the relative url to the city deal mini-site
deals_to_crawl = {'Local': [],
'Travel': [],
'Shopping': []} # Dictionary with the urls to crawl for each category
deals = {'Local': [],
'Travel': [],
'Shopping': []} # Dictionary with the scraped deal data for each category
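# Each entry appended to deals[type] by crawl_deals() below is a list of the form:
#   [price (EUR), discount (%), saving (EUR), units sold, relative deal URL]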
number_cities_crawled = 0. # Number of cities crawled so far. Printed as a progress percentage to show there is no timeout and the crawler is progressing. (Floats, so the Python 2 divisions below are true divisions.)
total_cities_to_crawl = 0.
number_deals_crawled = 0. # Number of deals crawled so far, reported the same way
total_deals_to_crawl = 0.
# ----- End initialize -----
# ----- Begin procedures -----
def fetch_cities(seed): # Crawl seed page to find all cities Groupon.es operates in
global total_cities_to_crawl
def extract_cities(html):
global total_cities_to_crawl
code = html
start_pos = code.find('<div id="citySelectBox" ')
code = code[start_pos:]
for i in range(2): # Skip first two links as they are not cities
start_pos = code.find('<li class')
skip = len('<li class')
code = code[start_pos+skip:]
start_pos = code.find('</li>')
skip = len('</li>')
code = code[start_pos+skip:]
        while code.find('</ul>') > 20: # Keep extracting while the closing </ul> of the city list is still far ahead; once it is within ~20 characters, no city entries remain
total_cities_to_crawl += 1
# Start fetching cities - Relative URL
start_pos = code.find("window.location.href = 'http://www.groupon.es/deals/")
skip = len("window.location.href = 'http://www.groupon.es/deals/")
code = code[start_pos+skip:]
end_pos = code.find("';")
url = code[:end_pos]
code = code[end_pos:]
# Fetch city name
start_pos = code.find('<span>')
skip = len('<span>')
code = code[start_pos+skip:]
end_pos = code.find('</span>')
skip = len('</span>')
name = code[:end_pos]
code = code[end_pos+skip:]
# Add to dictionary
cities.append([name, url])
if seed == seed_barcelona: # use seed_barcelona to reduce page requests and thus take less time to execute!
cities.append(['Barcelona', seed_barcelona])
total_cities_to_crawl += 1
else:
seed_response = urllib2.urlopen(seed)
seed_html = seed_response.read()
extract_cities(seed_html)
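# Illustrative only -- extract_cities() above assumes city selector markup roughly like:
#   <div id="citySelectBox" ...><ul>
#     <li class="..." onclick="window.location.href = 'http://www.groupon.es/deals/madrid';"><span>Madrid</span></li>
#     ...
#   </ul>
# Only the marker strings searched for in the code are taken from the page; the other attributes are assumptions.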
def crawl_city(url): # Crawl cities to find all URLs to all deals
global number_cities_crawled
city_response = urllib2.urlopen(url)
city_html = city_response.read()
number_cities_crawled += 1
    def crawl_local(html): # Search for all local deal URLs in the city page (run for every city, or only Barcelona if seed_barcelona was used)
code = html
start_pos = code.find('var itemsLocalDeals = [')
skip = len('var itemsLocalDeals = [')
code = code[start_pos+skip:]
        while code.find('dealPermaLink":"') != -1 and code.find('dealPermaLink":"') < code.find('var itemsShoppingDeals = ['): # A deal URL closer than the next category's marker means we are still inside the 'local' category; the extra != -1 check avoids looping forever when no deal URL remains
start_pos = code.find('dealPermaLink":"')
skip = len('dealPermaLink":"')
code = code[start_pos+skip:]
end_pos = code.find('"')
url = code[:end_pos]
deals_to_crawl['Local'].append(url)
code = code[end_pos:]
    def crawl_shopping(html): # Search for shopping deal URLs only once, as they do not depend on the city
code = html
start_pos = code.find('var itemsShoppingDeals = [')
skip = len('var itemsShoppingDeals = [')
code = code[start_pos+skip:]
        while code.find('dealPermaLink":"') != -1 and code.find('dealPermaLink":"') < code.find('var itemsTravelDeals = ['): # A deal URL closer than the next category's marker means we are still inside the 'shopping' category; the extra != -1 check avoids looping forever when no deal URL remains
start_pos = code.find('dealPermaLink":"')
skip = len('dealPermaLink":"')
code = code[start_pos+skip:]
end_pos = code.find('"')
url = code[:end_pos]
deals_to_crawl['Shopping'].append(url)
code = code[end_pos:]
    def crawl_travel(html): # Search for travel deal URLs only once, as they do not depend on the city
code = html
start_pos = code.find('var itemsTravelDeals = [')
skip = len('var itemsTravelDeals = [')
code = code[start_pos+skip:]
while code.find('dealPermaLink":"') != -1: # If crawler can't find next 'dealPermaLink', that means there are no deals left
start_pos = code.find('dealPermaLink":"')
skip = len('dealPermaLink":"')
code = code[start_pos+skip:]
end_pos = code.find('"')
url = code[:end_pos]
deals_to_crawl['Travel'].append(url)
code = code[end_pos:]
crawl_local(city_html) # Always crawl to fetch all deals from all cities
if len(deals_to_crawl['Shopping']) == 0: # Crawl only once, as deals from this category are independent from cities
crawl_shopping(city_html)
if len(deals_to_crawl['Travel']) == 0: # Crawl only once, as deals from this category are independent from cities
crawl_travel(city_html)
print str( round( ( number_cities_crawled / total_cities_to_crawl ) * 100, 2)) + '% complete ...'
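# Illustrative only -- crawl_local/crawl_shopping/crawl_travel assume each city page embeds its deals as JavaScript arrays, roughly:
#   var itemsLocalDeals = [{"dealPermaLink":"/deals/barcelona/some-deal/123456", ...}, ...];
#   var itemsShoppingDeals = [ ... ]; var itemsTravelDeals = [ ... ];
# Only the marker strings searched for in the code are taken from the page; the surrounding JSON fields are assumptions.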
def fetch_deals():
url_prefix = 'http://www.groupon.es/all-deals/'
for city in cities:
deal_url = url_prefix + city[1]
crawl_city(deal_url)
def crawl_deals(type):
global number_deals_crawled
for i in range(len(deals_to_crawl[type])):
full_url = 'http://www.groupon.es' + deals_to_crawl[type][i]
deal_response = urllib2.urlopen(full_url)
deal_html = deal_response.read()
number_deals_crawled += 1
deal = []
def clean_up_string(string):
output = ''
for i in range(len(string)):
                if 48 <= ord(string[i]) <= 57 or 65 <= ord(string[i]) <= 90 or 97 <= ord(string[i]) <= 122 or ord(string[i]) == 44: # Keep only digits (48-57), letters and commas; whitespace, thousands dots ('.') and anything else are dropped
                    if ord(string[i]) == 44: # The Spanish decimal comma becomes a decimal point so float() can parse the result
                        output += "."
else:
output += string[i]
return output
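        # e.g. clean_up_string('1.234,56') -> '1234.56' (thousands dot dropped, decimal comma converted for float())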
# Find price
start_pos = deal_html.find('Precio:')
deal_html = deal_html[start_pos:]
start_pos = deal_html.find('<span class="noWrap">')
skip = len('<span class="noWrap">')
deal_html = deal_html[start_pos+skip:]
        end_pos = deal_html.find(' \xe2') # \xe2 is the first byte of the UTF-8 encoded euro sign (€)
deal_price = deal_html[:end_pos]
deal_html = deal_html[end_pos:]
deal.append(float(clean_up_string(deal_price)))
# Find discount
start_pos = deal_html.find('Descuento</td>')
if start_pos != -1: # Some Shopping deals don't have a discount displayed ... Assumption: Those products simply don't have a discount, therefore the crawler will append a 0
deal_html = deal_html[start_pos:]
start_pos = deal_html.find('<td class="col1">')
skip = len('<td class="col1">')
deal_html = deal_html[start_pos+skip:]
end_pos = deal_html.find('%')
deal_discount = deal_html[:end_pos]
deal_html = deal_html[end_pos:]
deal.append(float(clean_up_string(deal_discount)))
# Find saving
start_pos = deal_html.find('<td>')
skip = len('<td>')
deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find(' \xe2') # \xe2 is the first byte of the UTF-8 encoded euro sign (€)
deal_saving = deal_html[:end_pos]
deal_html = deal_html[end_pos:]
deal.append(float(clean_up_string(deal_saving)))
else:
deal_discount = 0.
deal_saving = 0.
deal.append(deal_discount)
deal.append(deal_saving)
# Find number of people who bought coupon
start_pos = deal_html.find('<span id="jDealSoldAmount">')
if start_pos != -1: # If this string can't be found, that means that 0 people have bought the deal so far
skip = len('<span id="jDealSoldAmount">')
deal_html = deal_html[start_pos+skip:]
end_pos = deal_html.find('</span>')
deal_bought = deal_html[:end_pos]
deal.append(float(clean_up_string(deal_bought)))
else:
            deal.append(0.) # No buyers found; store a float for consistency with the other fields
deal.append(deals_to_crawl[type][i]) # Append deal relative URL as last element of list
# Add to dictionary
deals[type].append(deal)
print str( round( ( number_deals_crawled / total_deals_to_crawl ) * 100, 2)) + '% complete ...'
def reporting(type, print_forecast = False):
def forecast(daily_sales):
        # NOTE: This is by no means a rigorous method of forecasting yearly sales, but it is probably the best we can do with 'one-time' data
# Just imagine what you could do with a slightly more sophisticated robot!!!
return daily_sales * 365
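        # e.g. forecast(10000.) -> 3650000. -- one day's sales extrapolated over 365 days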
def format_number(number):
string = str(number)
formatted = ''
formatted_reverse = ''
for i in range(len(string)):
if (i+1) % 3 == 0 and i < len(string)-1:
formatted += string[len(string)-i-1]+','
else:
formatted += string[len(string)-i-1]
for i in range(len(formatted)):
formatted_reverse += formatted[len(formatted)-i-1]
return formatted_reverse
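        # e.g. format_number(1234567) -> '1,234,567'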
    # The 'Global' report aggregates all three categories; otherwise report on the single requested category
    if type == 'Global':
        deal_lists = [deals['Shopping'], deals['Travel'], deals['Local']]
    else:
        deal_lists = [deals[type]]
    number_deals = sum(len(l) for l in deal_lists)
    min_price = 9999999.
    max_price = -1.
    accumulated_price = 0.
    min_discount = 100.
    max_discount = 0.
    accumulated_discount = 0.
    min_saving = 9999999.
    max_saving = -1.
    accumulated_saving = 0.
    total_sales_units = 0.
    total_sales_euros = 0.
    total_saving_euros = 0.
    best_deal = [0., '']
    for deal_list in deal_lists:
        for i in deal_list: # i = [price, discount, saving, units sold, relative URL]
            if i[0] * i[3] > best_deal[0]: # What is the best performing deal in terms of revenue?
                best_deal[0] = i[0] * i[3]
                best_deal[1] = i[4]
            if i[0] < min_price:
                min_price = i[0]
            if i[0] > max_price:
                max_price = i[0]
            if i[1] < min_discount:
                min_discount = i[1]
            if i[1] > max_discount:
                max_discount = i[1]
            if i[2] < min_saving:
                min_saving = i[2]
            if i[2] > max_saving:
                max_saving = i[2]
            total_sales_units += i[3]
            total_sales_euros += i[0] * i[3]
            total_saving_euros += i[2] * i[3]
            accumulated_price += i[0]
            accumulated_discount += i[1]
            accumulated_saving += i[2]
    avg_deal_price = accumulated_price / number_deals
    avg_deal_discount = accumulated_discount / number_deals
    avg_deal_saving = accumulated_saving / number_deals
    avg_sale_value = total_sales_euros / total_sales_units
    avg_sale_saving = total_saving_euros / total_sales_units
    string = ('############# ' + type + ' #############\n'
              + '\t\t\tMin:\tAvg:\tMax:\n'
              + 'Price €:\t' + str(int(round(min_price))) + '\t\t' + str(int(round(avg_deal_price))) + '\t\t' + str(int(round(max_price))) + '\n'
              + 'Discount %:\t' + str(int(round(min_discount))) + '\t\t' + str(int(round(avg_deal_discount))) + '\t\t' + str(int(round(max_discount))) + '\n'
              + 'Saving €:\t' + str(int(round(min_saving))) + '\t\t' + str(int(round(avg_deal_saving))) + '\t\t' + str(int(round(max_saving))) + '\n'
              + '---------------------------------\n'
              + 'Deals offered #: ' + str(number_deals) + '\n'
              + 'Total deals sold #: ' + format_number(int(round(total_sales_units))) + '\n'
              + 'Total savings €: ' + format_number(int(round(total_saving_euros))) + '\n'
              + 'Average saving €: ' + format_number(int(round(avg_sale_saving))) + '\n'
              + 'Total sales €: ' + format_number(int(round(total_sales_euros))) + '\n'
              + 'Average ticket €: ' + format_number(int(round(avg_sale_value))) + '\n'
              + 'Best deal: ' + format_number(int(round(best_deal[0]))) + ' € => ' + best_deal[1])
    if print_forecast:
        string += '\nSales forecast for this year €: ' + format_number(int(round(forecast(total_sales_euros))))
    return string
# ----- End procedures -----
# ----- Begin control panel -----
# Set these to True or False depending on which categories you want to crawl. No changes further below are necessary, unless you want to fiddle around with the crawler ;)
switch_seed_all = False # True: crawls ALL deals on the site; False: crawls only deals in Barcelona. Leave as False for faster execution.
switch_crawl_shopping = True
switch_crawl_travel = True
switch_crawl_local = True
switch_print_global_report = True # Print a global report of all 3 types of deals (only turn True if previous 3 switches are True)
switch_print_forecast = True # Print a (very non-scientific) yearly sales forecast
# ----- End control panel -----
seed = seed_all if switch_seed_all else seed_barcelona # seed_barcelona keeps the run short; seed_all crawls the whole country
if not switch_crawl_local and not switch_crawl_shopping and not switch_crawl_travel:
    print 'Switches are set to crawl nothing ...'
else:
    print 'Crawling seed site ...'
    fetch_cities(seed)
    print 'Found ' + str(len(cities)) + ' cities to crawl ...'
    fetch_deals()
    total_deals_to_crawl = 0
    if switch_crawl_shopping:
        total_deals_to_crawl += len(deals_to_crawl['Shopping'])
    if switch_crawl_travel:
        total_deals_to_crawl += len(deals_to_crawl['Travel'])
    if switch_crawl_local:
        total_deals_to_crawl += len(deals_to_crawl['Local'])
    print 'Found ' + str(total_deals_to_crawl) + ' deals to crawl ...'
    if switch_crawl_shopping:
        crawl_deals('Shopping')
    if switch_crawl_travel:
        crawl_deals('Travel')
    if switch_crawl_local:
        crawl_deals('Local')
    if switch_crawl_shopping:
        print reporting('Shopping', switch_print_forecast)
    if switch_crawl_travel:
        print reporting('Travel', switch_print_forecast)
    if switch_crawl_local:
        print reporting('Local', switch_print_forecast)
    if switch_print_global_report:
        print reporting('Global', switch_print_forecast)