# December 18, 2012 - Good deal crawler (the source link that followed "Get from" is missing from this copy)
# -*- coding: utf-8 -*-
# Competitor Intelligence Agent
# License: Creative Commons CC BY-NC-SA
# By Michael Gradek, a proud Udacian
# Twitter: @MichaelGradek
# Competitor Intelligence Agent aims to collect and display a report with relevant information about Groupon, one of the largest Collective Buying sites in Spain (... and in the world!)
# For more information about Collective Buying, see the reference the author linked here (the link is missing from this copy).
# To keep things simple and easy to understand, my script will only consider Groupon in Spain.
# Why is this an interesting thing to do?
# Imagine you are the Business Intelligence director of one of Spain's Collective Buying companies (the example company names are missing from this copy).
# Wouldn't you love to receive a daily report stating what your competition is up to? i.e.: What deals do they have? What's their average discount? What were their sales yesterday? And even how much are they going to sell this year?!?
# The goal for this crawler is to crawl the site, collect all the currently posted deals, and then crawl the deals and fetch all information that would be relevant for a competitor's Business Intelligence director
# Once all the deals have been found and saved, the script can print a report showing the performance of each business area, and even forecast yearly sales!
# Definitions: Deals: Any sort of coupon sold on the site
# Local: Coupons from local merchants such as a restaurant, beauty & wellness, services, etc.
# Travel: Coupons for travelling such as trips, hotels, etc.
# Shopping: Coupons for products such as phones, devices, watches, etc.
import urllib2
# ----- Begin initialize -----
# Seed pages.
#   seed_all:       only used to discover the cities Groupon operates in.
#   seed_barcelona: test seed that limits the run to Barcelona deals, so far
#                   fewer pages are requested than for the whole country.
seed_all = ''
seed_barcelona = 'barcelona'

# [name, relative url] pairs, one per city deal mini-site.
cities = []

# Work queue: for every category, the deal URLs that still have to be crawled.
deals_to_crawl = dict(Local=[], Travel=[], Shopping=[])

# Results: for every category, the deals that have been crawled.
deals = dict(Local=[], Travel=[], Shopping=[])

# Progress counters. They are printed while crawling to show that the crawler
# is still advancing (no timeout). Kept as floats because they are divided by
# each other to compute a completion percentage.
number_cities_crawled = 0.
total_cities_to_crawl = 0.
number_deals_crawled = 0.
total_deals_to_crawl = 0.
# ----- End initialize -----
# ----- Begin procedures -----
def fetch_cities(seed): # Crawl seed page to find all cities Groupon operates in
    """Fill the global ``cities`` list with ``[name, relative_url]`` pairs.

    When ``seed`` equals ``seed_barcelona`` no page is requested: Barcelona is
    registered as the only city, which keeps test runs short. Otherwise the
    seed page is downloaded and its city selector is parsed.

    ``total_cities_to_crawl`` is incremented once per registered city.

    NOTE(review): in the copy this was restored from, the ``else`` branch, the
    ``.read()`` call and the ``extract_cities(...)`` call were missing
    (``seed_html =`` was an incomplete statement). They are reconstructed here
    from the surrounding code - confirm against the original script.
    """
    global total_cities_to_crawl

    def after(text, marker):
        # Return the part of text that follows marker, or '' when it is absent.
        position = text.find(marker)
        if position == -1:
            return ''
        return text[position + len(marker):]

    def extract_cities(html):
        global total_cities_to_crawl
        code = after(html, '<div id="citySelectBox" ')
        for _ in range(2): # Skip first two links as they are not cities
            code = after(after(code, '<li class'), '</li>')
        while code.find('</ul>') > 20: # Detect end of list of cities
            # Relative URL of the city mini-site
            code = after(code, "window.location.href = '")
            end_pos = code.find("';")
            if end_pos == -1: # Malformed entry: stop instead of storing garbage
                break
            url = code[:end_pos]
            # City name
            code = after(code[end_pos:], '<span>')
            end_pos = code.find('</span>')
            if end_pos == -1:
                break
            name = code[:end_pos]
            code = code[end_pos + len('</span>'):]
            cities.append([name, url])
            total_cities_to_crawl += 1

    if seed == seed_barcelona: # use seed_barcelona to reduce page requests and thus take less time to execute!
        cities.append(['Barcelona', seed_barcelona])
        total_cities_to_crawl += 1
    else:
        seed_response = urllib2.urlopen(seed)
        try:
            seed_html = seed_response.read()
        finally:
            seed_response.close() # Do not leak the connection
        extract_cities(seed_html)
def crawl_city(url): # Crawl cities to find all URLs to all deals
    """Download one city page and queue the deal URLs found on it.

    Local deals are collected for every city. Shopping and Travel deals are
    collected only while their queue in ``deals_to_crawl`` is still empty,
    because those categories do not depend on the city.

    Increments the global ``number_cities_crawled`` and prints the progress.

    NOTE(review): in the copy this was restored from, ``city_html =`` was an
    incomplete statement, the extracted URLs were never stored and the two
    "crawl only once" conditions had no body. The ``.read()`` call, the
    appends to ``deals_to_crawl`` and the calls to ``crawl_shopping`` /
    ``crawl_travel`` are reconstructed - confirm against the original script.
    NOTE(review): ``total_deals_to_crawl`` is not updated here; presumably it
    is set elsewhere in the file - verify.
    """
    global number_cities_crawled

    link_marker = 'dealPermaLink":"'
    local_marker = 'var itemsLocalDeals = ['
    shopping_marker = 'var itemsShoppingDeals = ['
    travel_marker = 'var itemsTravelDeals = ['

    def collect_links(html, start_marker, end_markers):
        # Return every deal permalink between start_marker and the nearest of
        # end_markers (or the end of the page when none of them follows).
        start = html.find(start_marker)
        if start == -1: # Category not present on this page
            return []
        start += len(start_marker)
        ends = [pos for pos in (html.find(marker, start) for marker in end_markers) if pos != -1]
        if ends:
            section = html[start:min(ends)]
        else:
            section = html[start:]
        links = []
        pos = section.find(link_marker)
        while pos != -1:
            pos += len(link_marker)
            close = section.find('"', pos)
            if close == -1: # Unterminated link: ignore the remainder
                break
            links.append(section[pos:close])
            pos = section.find(link_marker, close)
        return links

    def crawl_local(html): # Search for all local deals URLs in all cities (or only Barcelona if seed_barcelona was used)
        deals_to_crawl['Local'].extend(collect_links(html, local_marker, (shopping_marker, travel_marker)))

    def crawl_shopping(html): # Search for shopping deals URLs only once as they do not depend on city
        deals_to_crawl['Shopping'].extend(collect_links(html, shopping_marker, (travel_marker,)))

    def crawl_travel(html): # Search for travel deals URLs only once as they do not depend on city
        deals_to_crawl['Travel'].extend(collect_links(html, travel_marker, ()))

    city_response = urllib2.urlopen(url)
    try:
        city_html = city_response.read()
    finally:
        city_response.close() # Do not leak the connection
    number_cities_crawled += 1

    crawl_local(city_html) # Always crawl to fetch all deals from all cities
    if not deals_to_crawl['Shopping']: # Crawl only once, as deals from this category are independent from cities
        crawl_shopping(city_html)
    if not deals_to_crawl['Travel']: # Crawl only once, as deals from this category are independent from cities
        crawl_travel(city_html)

    if total_cities_to_crawl: # Avoid dividing by zero when no city was registered
        print(str(round((number_cities_crawled / total_cities_to_crawl) * 100, 2)) + '% complete ...')
def fetch_deals():
    """Crawl the deal page of every city listed in the global ``cities``.

    Each entry of ``cities`` is a ``[name, relative_url]`` pair; the relative
    URL is appended to ``url_prefix`` and handed to ``crawl_city``.

    NOTE(review): in the copy this was restored from, the loop only computed
    ``deal_url`` and discarded it; the ``crawl_city(deal_url)`` call is
    reconstructed. ``url_prefix`` is empty here - the site prefix appears to
    have been stripped from this copy, confirm the intended value.
    """
    url_prefix = ''
    for city in cities:
        deal_url = url_prefix + city[1]
        crawl_city(deal_url)
def crawl_deals(type):
    """Download every queued deal page of one category and store its figures.

    For each URL in ``deals_to_crawl[type]`` the page is fetched and a list
    ``[price, discount, saving, units_bought, relative_url]`` is appended to
    ``deals[type]`` (the layout ``reporting`` reads). All figures are floats.

    Increments the global ``number_deals_crawled`` and prints the progress.

    Raises:
        ValueError: when a scraped figure cannot be converted to a number.

    NOTE(review): in the copy this was restored from, ``deal_html =`` was an
    incomplete statement, the ``else`` branches were missing (so discount and
    saving were always reset to 0) and the scraped values were never converted
    or stored. The ``.read()`` call, the conversions, the ``else`` branches and
    the appends are reconstructed from the comments and from how ``reporting``
    indexes a deal - confirm against the original script.
    NOTE(review): the URL prefix in ``full_url`` is empty; the site prefix
    appears to have been stripped from this copy.
    """
    global number_deals_crawled

    def clean_up_string(string):
        # Keep only digits and ASCII letters; a comma (decimal separator on the
        # site) becomes a dot. Everything else, including the '.' thousands
        # separator and white space, is dropped.
        kept = []
        for char in string:
            if char == ',':
                kept.append('.')
            elif '0' <= char <= '9' or 'A' <= char <= 'Z' or 'a' <= char <= 'z':
                kept.append(char)
        return ''.join(kept)

    def to_number(text):
        return float(clean_up_string(text))

    for relative_url in deals_to_crawl[type]:
        full_url = '' + relative_url
        deal_response = urllib2.urlopen(full_url)
        try:
            deal_html = deal_response.read()
        finally:
            deal_response.close() # Do not leak the connection
        number_deals_crawled += 1

        # Find price
        start_pos = deal_html.find('Precio:')
        deal_html = deal_html[start_pos:]
        start_pos = deal_html.find('<span class="noWrap">')
        skip = len('<span class="noWrap">')
        deal_html = deal_html[start_pos+skip:]
        end_pos = deal_html.find(' \xe2') # \xe2 = euro sign (€)
        deal_price = to_number(deal_html[:end_pos])
        deal_html = deal_html[end_pos:]

        # Find discount
        start_pos = deal_html.find('Descuento</td>')
        if start_pos != -1:
            deal_html = deal_html[start_pos:]
            start_pos = deal_html.find('<td class="col1">')
            skip = len('<td class="col1">')
            deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find('%')
            deal_discount = to_number(deal_html[:end_pos])
            deal_html = deal_html[end_pos:]
            # Find saving
            start_pos = deal_html.find('<td>')
            skip = len('<td>')
            deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find(' \xe2') # \xe2 = euro sign (€)
            deal_saving = to_number(deal_html[:end_pos])
            deal_html = deal_html[end_pos:]
        else:
            # Some Shopping deals don't have a discount displayed ... Assumption: Those products simply don't have a discount, therefore the crawler stores a 0
            deal_discount = 0.
            deal_saving = 0.

        # Find number of people who bought coupon
        start_pos = deal_html.find('<span id="jDealSoldAmount">')
        if start_pos != -1:
            skip = len('<span id="jDealSoldAmount">')
            deal_html = deal_html[start_pos+skip:]
            end_pos = deal_html.find('</span>')
            deal_bought = to_number(deal_html[:end_pos])
        else:
            # If this string can't be found, that means that 0 people have bought the deal so far
            deal_bought = 0.

        # Relative URL goes last, after the four figures
        deal = [deal_price, deal_discount, deal_saving, deal_bought, relative_url]
        # Add to dictionary
        deals[type].append(deal)

        if total_deals_to_crawl: # Avoid dividing by zero when the total was never set
            print(str(round((number_deals_crawled / total_deals_to_crawl) * 100, 2)) + '% complete ...')
def reporting(type, print_forecast = False):
    """Build the text report for one deal category or for all of them.

    Reads the global ``deals`` dictionary. Each deal is indexed as
    ``[price, discount, saving, units_sold, url]``.

    Args:
        type: a key of ``deals`` ('Local', 'Travel', 'Shopping') for a
            per-category report, or 'Global' to aggregate every category.
        print_forecast: when true, append a yearly sales forecast line.

    Returns:
        The report as a single multi-line string.

    Raises:
        KeyError: when ``type`` is neither 'Global' nor a key of ``deals``.

    An empty selection, or one without any unit sold, reports 0 for the
    affected figures instead of dividing by zero.
    """
    def forecast(daily_sales):
        # NOTE: This by no means is a very rigorous method of forecasting yearly sales, but is probably the best we can do with 'one-time' data
        return daily_sales * 365

    def format_number(number):
        # Insert a comma between groups of three digits; accepts an int or a
        # string of digits, with an optional leading minus sign.
        digits = str(number)
        sign = ''
        if digits.startswith('-'):
            sign = '-'
            digits = digits[1:]
        groups = []
        while len(digits) > 3:
            groups.insert(0, digits[-3:])
            digits = digits[:-3]
        groups.insert(0, digits)
        return sign + ','.join(groups)

    def whole(value):
        # Round to the nearest integer and render it as text
        return str(int(round(value)))

    def stat_row(minimum, average, maximum):
        return '\t\t'.join([whole(minimum), whole(average), whole(maximum)])

    def spread(values):
        # (min, average, max) of a list of numbers; zeros for an empty list
        if not values:
            return 0., 0., 0.
        return min(values), sum(values, 0.) / len(values), max(values)

    if type != 'Global':
        rows = list(deals[type])
    else:
        rows = [row for category in deals for row in deals[category]]

    number_deals = len(rows)
    min_price, avg_deal_price, max_price = spread([row[0] for row in rows])
    min_discount, avg_deal_discount, max_discount = spread([row[1] for row in rows])
    min_saving, avg_deal_saving, max_saving = spread([row[2] for row in rows])

    # Float start values keep every later division a true division
    total_sales_units = sum([row[3] for row in rows], 0.)
    total_sales_euros = sum([row[0] * row[3] for row in rows], 0.)
    total_saving_euros = sum([row[2] * row[3] for row in rows], 0.)

    if total_sales_units:
        avg_sale_value = total_sales_euros / total_sales_units
        avg_sale_saving = total_saving_euros / total_sales_units
    else:
        avg_sale_value = 0.
        avg_sale_saving = 0.

    # What is the best performing deal in terms of revenue? First one wins a tie.
    best_revenue = 0.
    best_url = ''
    for row in rows:
        revenue = row[0] * row[3]
        if revenue > best_revenue:
            best_revenue = revenue
            best_url = row[4]

    lines = [
        '############# ' + type + ' #############',
        '\t\t\tMin:\tAvg:\tMax:',
        'Price €:\t' + stat_row(min_price, avg_deal_price, max_price),
        'Discount %:\t' + stat_row(min_discount, avg_deal_discount, max_discount),
        'Saving €:\t' + stat_row(min_saving, avg_deal_saving, max_saving),
        '---------------------------------',
        'Deals offered #: ' + str(number_deals),
        'Total deals sold #: ' + format_number(int(round(total_sales_units))),
        'Total savings €: ' + format_number(int(round(total_saving_euros))),
        'Average saving €: ' + format_number(int(round(avg_sale_saving))),
        'Total sales €: ' + format_number(int(round(total_sales_euros))),
        'Average ticket €: ' + format_number(int(round(avg_sale_value))),
        'Best deal: ' + format_number(int(round(best_revenue))) + ' € => ' + best_url,
    ]
    if print_forecast:
        lines.append('Sales forecast for this year €: ' + format_number(str(int(round(forecast(total_sales_euros))))))
    return '\n'.join(lines)
# ----- End procedures -----
# ----- Begin control panel -----
# Set each switch to True or False to choose what gets crawled and printed.
# Nothing further below needs to change, unless you want to fiddle around
# with the crawler ;)

# True: crawl ALL deals on the site. False: crawl only the deals in Barcelona.
# Leave as False for faster execution.
switch_seed_all = False

# Deal categories to crawl.
switch_crawl_shopping = True
switch_crawl_travel = True
switch_crawl_local = True

# Print a global report covering all 3 types of deals (only set to True when
# the three category switches above are True).
switch_print_global_report = True

# Print a (very non-scientific) yearly sales forecast.
switch_print_forecast = True
# ----- End control panel -----
if switch_seed_all:
