Ruby script to collect data from Quirky product pages
# Last Updated: 2014.06.05

# Gems to include
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'json'
require 'csv'

def get_default_data()
  # Use separate arrays for each row type (a chained assignment would
  # alias all three names to the same array object)
  default_stats, default_pricing, date_range = [], [], []
  # Open the CSV with all the dates and default values
  CSV.foreach("quirky_data_all.csv") do |row|
    if row[0] == "Name"
      default_stats = row.compact
    elsif row[0] == "Low"
      default_pricing = row.compact
    else
      date_range = row.compact
    end
  end
  # Zero-pad single-digit numbers in the dates
  for d in 0..(date_range.length-1)
    num = date_range[d][/\d+/].to_i
    if num < 10
      date_range[d] = date_range[d].gsub(/\d+/, "%02d" % num)
    end
  end
  return default_stats, default_pricing, date_range
end
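# Layout of quirky_data_all.csv implied by the branches above: a row
# beginning "Name" supplies the column labels for the summary stats, a row
# beginning "Low" supplies the labels for the pricing columns, and any
# other row is treated as the default date range.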
def get_category_hash
  category_hash = Hash.new
  # Open the CSV with all the ids of pages to visit
  CSV.foreach("quirky_products.csv", :encoding => 'UTF-8') do |row|
    category_hash[row[1].to_i] = row[0]
  end
  return category_hash
end
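# quirky_products.csv is read as: column 0 = category name, column 1 =
# numeric product id. The hash is keyed by id so the category can be
# looked up while iterating over ids in collect_data below.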
def get_page(id)
  begin
    url = "https://www.quirky.com/products/#{ id }/timeline"
    uri = URI.parse( URI.encode( url.strip ) )
    page = Nokogiri::HTML( open( uri ) )
  rescue
    pp "starting sleep"
    sleep 60
    pp "ending sleep"
    page = Nokogiri::HTML( open( uri ) )
  end
  return page
end
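# get_page retries exactly once after a 60-second back-off. A second failure
# propagates to the caller; since the get_page call in collect_data sits
# outside that function's begin/rescue, an unrecoverable fetch aborts the run.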
def get_product_name( page )
  # Get the name of the product
  # Catch 404 errors
  begin
    name = page.css(".ui .segment").css(".ui .header")[0].content.strip
  rescue
    name = "NA"
  end
  return name
end
def get_product_status( page )
  status_data = page.css(".ui .segment").css(".icon")
  begin
    # Walk the text nodes; the status value is the first non-blank node
    # after the "Status:" label
    for b in 0..(status_data.children.length-1)
      text = status_data.children[b].content.strip
      if (text == "Status:")
        b += 1
        text = status_data.children[b].content.strip
        until (text =~ /^\S/) do
          b += 1
          text = status_data.children[b].content.strip
        end
        status = text
      end
    end
    # Save the status
    if status.nil?
      status = "NA"
    end
  rescue
    status = "NA"
  end
  return status
end
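# Note: reassigning b inside the for loop does not advance the loop itself;
# Ruby's for..in rebinds the variable on each iteration, so nodes consumed
# by the inner until may be scanned again. The same pattern appears in
# get_summary_stats below.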
def get_summary_stats( page, stats )
  # Get the data of the page
  data = page.css(".ui .timeline-stats").css(".column")
  # Set to nil
  retail_price = units_sold = community_earned = total_influencers = nil
  # Each label is followed by its value, so scan forward from the label
  # until the value turns up
  for a in 0..(data.children.length-1)
    text = data.children[a].content.strip
    if (text == "Retail Price")
      until (text =~ /^\$\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      retail_price = text
    elsif (text == "Units Sold")
      until (text =~ /^\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      units_sold = text
    elsif (text == "Community Earned")
      until (text =~ /^\$\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      community_earned = text
    elsif (text == "Total Influencers")
      until (text =~ /^\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      total_influencers = text
    end
  end
  # Save the retail price
  stats.push( retail_price.nil? ? "NA" : retail_price )
  # Save the total units sold
  stats.push( units_sold.nil? ? "NA" : units_sold )
  # Save what the community earned
  stats.push( community_earned.nil? ? "NA" : community_earned )
  # Save the number of total influencers
  stats.push( total_influencers.nil? ? "NA" : total_influencers )
  # Placeholder for the "On Sale" column
  stats.push("NA")
end
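# After get_summary_stats, the stats array holds (in order): product name,
# category, product id, status, retail price, units sold, community earned,
# total influencers, and the "On Sale" placeholder. This presumably matches
# the "Name" label row read from quirky_data_all.csv.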
def extract_pricing_data( pricing )
  # Use the pricing data scraped, or enter NA
  if ( !pricing.nil? )
    pricing = pricing[ 'sparkline' ][ 'pricing' ][ 'data' ]
    # 'meian' is the deliberately misspelled key for the median
    # (see the gsub in get_monthly_data)
    keys = [ 'low_whisker', 'firstq', 'meian', 'thirdq', 'high_whisker' ]
    pricing_save = keys.map { |key| pricing[key].nil? ? 'NA' : pricing[key] }
  else
    # Use default data
    pricing_save = Array.new(5, 'NA')
  end
  return pricing_save
end
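# The five pricing values appear to be box-plot fields from the page's
# sparkline config: low whisker, first quartile, median, third quartile,
# high whisker.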
def extract_payouts_data( payouts, date_range )
  payouts_save = []
  if ( !payouts.nil? )
    payouts_dates = payouts['sparkline']['payout']['dates']
    payouts_data = payouts['sparkline']['payout']['data']
    for a in 0..(payouts_dates.length-1)
      payouts_dates[a] = payouts_dates[a].gsub( /,/, "-" )
    end
    # Turn the dates array into a date => index hash for fast lookup
    hash_payouts_dates = Hash[payouts_dates.map.with_index.to_a]
    # Go through the date range and see which payouts to add
    for c in 0..(date_range.length-1)
      date_to_check = date_range[c]
      # If the date to check is in payouts_dates, save the data
      index = hash_payouts_dates[date_to_check]
      if !index.nil?
        # Check that the index isn't beyond the length of payouts_data
        if index < payouts_data.length
          payouts_save.push(payouts_data[index])
        else
          payouts_save.push("NA")
        end
      else
        payouts_save.push("NA")
      end
    end
  else
    # Use default data
    payouts_save = Array.new(date_range.length, 'NA')
  end
  return payouts_save
end
def extract_units_data( units, date_range )
  units_save = []
  # Use the units sold data or enter as blank
  if ( !units.nil? )
    units_dates = units['sparkline']['units']['dates']
    units_data = units['sparkline']['units']['data']
    for a in 0..(units_dates.length-1)
      units_dates[a] = units_dates[a].gsub( /,/, "-" )
    end
    # Turn the dates array into a date => index hash for fast lookup
    hash_units_dates = Hash[units_dates.map.with_index.to_a]
    # Go through the date range and see which unit counts to add
    for c in 0..(date_range.length-1)
      date_to_check = date_range[c]
      # If the date to check is in units_dates, save the data
      index = hash_units_dates[date_to_check]
      if !index.nil?
        # Check that the index isn't beyond the length of units_data
        if index < units_data.length
          units_save.push(units_data[index])
        else
          units_save.push("NA")
        end
      else
        units_save.push("NA")
      end
    end
  else
    # Use default data
    units_save = Array.new(date_range.length, 'NA')
  end
  return units_save
end
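# extract_payouts_data and extract_units_data are structurally identical.
# A minimal shared helper could replace both. This is a sketch only (the
# name extract_series_data is hypothetical and nothing below calls it):
def extract_series_data( blob, key, date_range )
  # Fall back to NA for every date if no data was scraped
  return Array.new(date_range.length, 'NA') if blob.nil?
  dates = blob['sparkline'][key]['dates'].map { |d| d.gsub(/,/, '-') }
  data = blob['sparkline'][key]['data']
  # date => index hash for fast lookup
  index_by_date = Hash[dates.map.with_index.to_a]
  date_range.map do |date|
    i = index_by_date[date]
    ( !i.nil? && i < data.length ) ? data[i] : 'NA'
  end
end
# Usage: extract_series_data(payouts, 'payout', date_range)
#        extract_series_data(units, 'units', date_range)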
def get_monthly_data( page, date_range )
  # All the data is in a script element
  x = page.search( "script" )
  # Set a flag for whether we have found the node with the data
  found = false
  # An iterative counter
  y = 0
  # Create variables for each of the 3 data sets we are looking for
  pricing = payouts = units = nil
  begin
    # Go through all the nodes to find the one with the data
    until found do
      # Get the node and its content
      node = x.children[ y ]
      if (node.nil?)
        break
      else
        node = node.content
      end
      # If the node contents contain any of the data fields we are
      # looking for, extract the data
      if ( node =~ /pricing/ || node =~ /payout/ || node =~ /units/ )
        found = true
        # Remove newlines
        node = node.gsub( /[\r\n]/, '' )
        # Remove spaces
        node = node.delete(' ')
        # Replace single with double quotes
        node = node.gsub(/'/, '"')
        # Rename 'median' to 'meian' so the 'med' replacement below
        # can't match inside it
        node = node.gsub( "median", "meian" )
        # Split the node into the discrete data segments
        datas = node.split(");")
        # Loop through each data segment
        for z in 0..(datas.length-1)
          data = datas[z]
          # Get rid of the jQuery function call at the beginning
          data = data.gsub( "$.extend(true,quirky.config,", '' )
          # Wrap each of the bare keys in quotes so the segment can be
          # parsed as JSON
          words = [ 'sparkline', 'payout', 'dates', 'data', 'units', 'pricing', 'fields', 'lw', 'lq', 'meian', 'med', 'uq', 'rw', 'low_whisker', 'q1', 'q3', 'high_whisker' ]
          for i in 0..( words.length-1 )
            data = data.gsub( words[i], '"' + words[i] + '"' )
          end
          # Fix up the issues with the pricing data format
          if ( data =~ /pricing/ )
            # Strip thousands separators out of numbers
            data = data.gsub( /(\d+),(\d+)/, '\1\2' )
            # Quote the numeric values
            data = data.gsub( /( \d+\.?\d* )/, '"' + '\1' + '"' )
            # Rename q1 and q3 to names without digits so the numeric
            # replacement above can't clash with them
            data = data.sub( 'q1', 'firstq' )
            data = data.sub( 'q3', 'thirdq' )
            # Eliminate the trailing comma
            data = data.gsub( ',}', '}' )
          end
          # Save information depending on which data set it is
          if ( data =~ /pricing/ )
            pricing = JSON.parse( data )
          elsif ( data =~ /payout/ )
            payouts = JSON.parse( data )
          elsif ( data =~ /units/ )
            units = JSON.parse( data )
          end
        end
      else
        # Move on to the next node
        y += 1
      end
    end
    # Construct the arrays to save to CSV, falling back to NA defaults
    pricing_save = extract_pricing_data( pricing )
    payouts_save = extract_payouts_data( payouts, date_range )
    units_save = extract_units_data( units, date_range )
    return pricing_save, payouts_save, units_save
  rescue
    pp "ERROR"
    # Return nil so the caller's rescue handles this product
    return nil
  end
end
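# Illustrative (hypothetical) example of the cleanup above. Raw script text
# on the page looks roughly like:
#   $.extend(true,quirky.config,{sparkline:{units:{dates:[...],data:[...]}}});
# and after stripping the jQuery wrapper and quoting the bare keys it
# parses as JSON:
#   {"sparkline":{"units":{"dates":[...],"data":[...]}}}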
## COLLECT DATA
def collect_data( a, z )
  default_stats, default_pricing, date_range = get_default_data()
  category_hash = get_category_hash
  # Open the CSVs to save data to
  sales_csv = CSV.open("quirky_everything_units_sold_data.csv", 'a')
  payouts_csv = CSV.open("quirky_everything_payouts_data.csv", 'a')
  # Write the header of the CSVs with column labels
  csv_header = default_stats.concat(default_pricing).concat(date_range)
  sales_csv << csv_header
  payouts_csv << csv_header
  for id in a..z
    pp "New product w/ id: " + id.to_s
    page = get_page( id )
    # Get the product name
    stats = []
    stats.push( get_product_name( page ) )
    # Add the category
    category = category_hash[id]
    if category
      stats.push(category)
    else
      stats.push("NA")
    end
    # Add the product id
    stats.push(id)
    # Catch errors where we get a redirect to the store
    begin
      # Get the product status
      stats.push( get_product_status(page) )
      # Get the summary stats
      get_summary_stats( page, stats )
      # Get the monthly data
      pricing_save, payouts_save, units_save = get_monthly_data( page, date_range )
      # Write data to the units CSV
      sales_csv << stats + pricing_save + units_save
      # Write data to the payouts CSV
      payouts_csv << stats + pricing_save + payouts_save
    rescue
      # On failure, save whatever stats we have so far
      sales_csv << stats
      payouts_csv << stats
    end
  end
  # Close the CSVs
  sales_csv.close
  payouts_csv.close
end
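# Each output row is: the summary stats (name, category, id, status, retail
# price, units sold, community earned, total influencers, on-sale
# placeholder), then the five pricing columns, then one units-sold or
# payout value per date in the default date range.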
# Go up to 885
collect_data( 1, 885 )
great script and even better blog post!
https://www.cbinsights.com/blog/quirky-product-analysis/?utm_source=CB+Insights+Newsletter&utm_campaign=0125548e7e-NewsMediaValuation_09_29_2015&utm_medium=email&utm_term=0_9dc0513989-0125548e7e-86850777