# Ruby script to collect data from Quirky product pages
# Author: @milesgrimshaw (gist created June 11, 2014)
# Last Updated: 2014.06.05
# Gems to include
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'json'
require 'csv'
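# Input files (layout inferred from the parsing below, not documented):
#   quirky_data_all.csv  - rows of defaults: column labels starting with
#                          "Name", pricing labels starting with "Low", and
#                          a row holding the full range of dates to report on
#   quirky_products.csv  - one row per product: category name, product id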
def get_default_data()
  default_stats, default_pricing, date_range = [], [], []
  # Open the CSV with all the dates and default values
  CSV.foreach("quirky_data_all.csv") do |row|
    if row[0] == "Name"
      default_stats = row.compact
    elsif row[0] == "Low"
      default_pricing = row.compact
    else
      date_range = row.compact
    end
  end
  # Zero-pad single-digit day numbers in the dates
  for d in 0..(date_range.length-1)
    date = date_range[d]
    num = date[/\d+/].to_i
    if num < 10
      # sub (not gsub) so only the first number, the day, is padded
      date_range[d] = date.sub(/\d+/, "%02d" % num)
    end
  end
  return default_stats, default_pricing, date_range
end
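# For example, assuming the CSV holds dates like "Jun 4- 2013" (format
# inferred from the comma-to-dash rewrite in extract_payouts_data, not
# confirmed), the padding above turns "Jun 4- 2013" into "Jun 04- 2013",
# presumably so it matches the date strings scraped from the product pages.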
def get_category_hash
  category_hash = Hash.new
  # Open the CSV with the category and id of each product page to visit
  CSV.foreach("quirky_products.csv", :encoding => 'UTF-8') do |row|
    category_hash[row[1].to_i] = row[0]
  end
  return category_hash
end
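# For example, a quirky_products.csv row like ["Power", "319"] (values
# hypothetical) would yield category_hash[319] #=> "Power".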
def get_page(id)
  begin
    url = "https://www.quirky.com/products/#{ id }/timeline"
    uri = URI.parse(URI.encode(url.strip))
    page = Nokogiri::HTML(open(uri))
  rescue
    # Back off for a minute, then retry the request once
    pp "starting sleep"
    sleep 60
    pp "ending sleep"
    page = Nokogiri::HTML(open(uri))
  end
  return page
end
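# Note: if the retry also fails, the exception propagates out of get_page;
# collect_data only wraps the later parsing steps in begin/rescue, so a
# second consecutive request failure aborts the run.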
def get_product_name( page )
  # Get the name of the product; catch 404 errors
  begin
    name = page.css(".ui .segment").css(".ui .header")[0].content.strip
  rescue
    name = "NA"
  end
  return name
end
def get_product_status( page )
  status_data = page.css(".ui .segment").css(".icon")
  begin
    for b in 0..(status_data.children.length-1)
      text = status_data.children[b].content.strip
      if (text == "Status:")
        # Skip past whitespace-only nodes to the status text itself
        b += 1
        text = status_data.children[b].content.strip
        until (text =~ /^\S/) do
          b += 1
          text = status_data.children[b].content.strip
        end
        status = text
      end
    end
    # Save the status
    if status.nil?
      status = "NA"
    end
  rescue
    status = "NA"
  end
  return status
end
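# The scan above assumes markup roughly like this (hypothetical sketch):
#   <i class="icon">Status:
#     In Development
#   </i>
# i.e. a "Status:" text node followed by whitespace-only nodes, which is
# why the code advances until it reaches text matching /^\S/.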
def get_summary_stats( page, stats )
  # Get the data of the page
  data = page.css(".ui .timeline-stats").css(".column")
  # Set to nil
  retail_price = units_sold = community_earned = total_influencers = nil
  # Loop through the pieces of data: each label is followed by its value
  # a few text nodes later, so scan forward after finding each label
  for a in 0..(data.children.length-1)
    text = data.children[a].content.strip
    if (text == "Retail Price")
      until (text =~ /^\$\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      retail_price = text
    elsif (text == "Units Sold")
      until (text =~ /^\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      units_sold = text
    elsif (text == "Community Earned")
      until (text =~ /^\$\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      community_earned = text
    elsif (text == "Total Influencers")
      until (text =~ /^\d/) do
        a += 1
        text = data.children[a].content.strip
      end
      total_influencers = text
    end
  end
  # Save each stat, falling back to "NA" where it wasn't found:
  # retail price, units sold, community earned, total influencers
  stats.push(retail_price || "NA")
  stats.push(units_sold || "NA")
  stats.push(community_earned || "NA")
  stats.push(total_influencers || "NA")
  # "On Sale" placeholder column
  stats.push("NA")
end
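# This appends five columns to stats: retail price, units sold, community
# earned, total influencers, and the "On Sale" placeholder, e.g. (values
# hypothetical): "$29.99", "402,001", "$412,184", "2,210", "NA".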
def extract_pricing_data( pricing )
  pricing_save = []
  # Use the pricing data scraped or enter it as NA
  if ( !pricing.nil? )
    pricing = pricing['sparkline']['pricing']['data']
    # 'meian' is the deliberate respelling of 'median' made in get_monthly_data
    for key in ['low_whisker', 'firstq', 'meian', 'thirdq', 'high_whisker']
      pricing_save.push(pricing[key] || 'NA')
    end
  else
    # Use default data
    pricing_save = Array.new(5, 'NA')
  end
  return pricing_save
end
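# Shape of the parsed pricing JSON this expects (keys from the code above,
# values hypothetical):
#   { "sparkline" => { "pricing" => { "data" => {
#       "low_whisker" => 19.99, "firstq" => 24.99, "meian" => 29.99,
#       "thirdq" => 34.99, "high_whisker" => 39.99 } } } }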
def extract_payouts_data(payouts, date_range)
  payouts_save = []
  if ( !payouts.nil? )
    payouts_dates = payouts['sparkline']['payout']['dates']
    payouts_data = payouts['sparkline']['payout']['data']
    for a in 0..(payouts_dates.length-1)
      payouts_dates[a] = payouts_dates[a].gsub( /,/, "-" )
    end
    # Turn the dates array into a date => index hash for lookup
    hash_payouts_dates = Hash[payouts_dates.map.with_index.to_a]
    # Go through the date range and see which payouts to add
    for c in 0..(date_range.length-1)
      date_to_check = date_range[c]
      # If the date is in payouts_dates then save the data
      index = hash_payouts_dates[date_to_check]
      if !index.nil?
        # Check that index isn't outside the length of payouts_data
        if index < payouts_data.length
          payouts_save.push(payouts_data[index])
        else
          payouts_save.push("NA")
        end
      else
        payouts_save.push("NA")
      end
    end
  else
    # Use default data
    payouts_save = Array.new(date_range.length, 'NA')
  end
  return payouts_save
end
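# Hash[payouts_dates.map.with_index.to_a] builds a { date => index } lookup:
#   Hash[["Jun 04- 2013", "Jul 02- 2013"].map.with_index.to_a]
#   #=> {"Jun 04- 2013"=>0, "Jul 02- 2013"=>1}
# (dates hypothetical; the index is then used to read payouts_data).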
def extract_units_data( units, date_range )
  units_save = []
  # Use the units sold data or enter it as NA
  if ( !units.nil? )
    units_dates = units['sparkline']['units']['dates']
    units_data = units['sparkline']['units']['data']
    for a in 0..(units_dates.length-1)
      units_dates[a] = units_dates[a].gsub( /,/, "-" )
    end
    # Turn the dates array into a date => index hash for lookup
    hash_units_dates = Hash[units_dates.map.with_index.to_a]
    # Go through the date range and see which unit counts to add
    for c in 0..(date_range.length-1)
      date_to_check = date_range[c]
      # If the date is in units_dates then save the data
      index = hash_units_dates[date_to_check]
      if !index.nil?
        # Check that index isn't outside the length of units_data
        if index < units_data.length
          units_save.push(units_data[index])
        else
          units_save.push("NA")
        end
      else
        units_save.push("NA")
      end
    end
  else
    # Use default data
    units_save = Array.new(date_range.length, 'NA')
  end
  return units_save
end
def get_monthly_data( page, date_range )
  # All the data is in a script element
  x = page.search( "script" )
  # Flag for whether we have found the node with the data
  found = false
  # Index of the node currently being checked
  y = 0
  # Create variables for each of the 3 data sets we are looking for
  pricing = payouts = units = nil
  begin
    # Go through the nodes to find the one with the data
    until found do
      # Get the node and its content
      node = x.children[ y ]
      if (node.nil?)
        break
      else
        node = node.content
      end
      # If the node contains any of the data fields we are looking for,
      # extract the data from it
      if ( node =~ /pricing/ || node =~ /payout/ || node =~ /units/ )
        found = true
        # Remove newlines and spaces, and swap single quotes for double
        node = node.gsub( /[\r\n]/m, '' )
        node = node.delete(' ')
        node = node.gsub(/'/, '"')
        # Respell 'median' as 'meian' so that wrapping the shorter key
        # 'med' in quotes below doesn't corrupt it
        node = node.gsub("median", "meian")
        # Split the node into the discrete data segments
        datas = node.split(");")
        # Loop through each data segment
        for z in 0..(datas.length-1)
          data = datas[z]
          # Get rid of the jQuery call at the beginning
          data = data.gsub("$.extend(true,quirky.config,", '')
          # Words to wrap in quotations so the string can be parsed as JSON
          words = [ 'sparkline','payout','dates','data', 'units', 'pricing', 'fields', 'lw', 'lq', 'meian', 'med', 'uq', 'rw', 'low_whisker', 'q1', 'q3', 'high_whisker' ]
          # Loop through and wrap each of the words in quotations
          for i in 0..( words.length-1 )
            word = words[i]
            if word != ""
              data = data.gsub( word, '"' + word + '"' )
            end
          end
          # Fix up the issues with the pricing data format
          if ( data =~ /pricing/ )
            # Remove commas inside numbers (e.g. 1,000 -> 1000)
            data = data.gsub(/(\d+),(\d+)/, '\1\2')
            # Wrap space-delimited digits in quotes
            data = data.gsub( /( \d+\.?\d* )/, '"\1"' )
            # Rename q1 and q3 to names with no digits, because the digit
            # matching would otherwise hit those and we don't want to
            # replace them
            data = data.sub( 'q1', 'firstq' )
            data = data.sub( 'q3', 'thirdq' )
            # Eliminate the extra trailing comma
            data = data.gsub( ',}', '}' )
          end
          # Parse and save the segment depending on which data set it is
          if ( data =~ /pricing/ )
            pricing = JSON.parse( data )
          elsif ( data =~ /payout/ )
            payouts = JSON.parse( data )
          elsif ( data =~ /units/ )
            units = JSON.parse( data )
          end
        end
      else
        # Move on to the next node
        y += 1
      end
    end
    # Construct the arrays to save to CSV, using defaults where data is missing
    pricing_save = extract_pricing_data( pricing )
    payouts_save = extract_payouts_data( payouts, date_range )
    units_save = extract_units_data( units, date_range )
    return pricing_save, payouts_save, units_save
  rescue
    pp "ERROR"
  end
end
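# A hypothetical sketch of the raw script content the clean-up above
# expects (shape inferred from the code; values invented):
#   raw:     $.extend(true,quirky.config,{sparkline:{payout:{dates:[...],data:[...]}}});
#   cleaned: {"sparkline":{"payout":{"dates":[...],"data":[...]}}}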
## COLLECT DATA
def collect_data( a, z )
  default_stats, default_pricing, date_range = get_default_data()
  category_hash = get_category_hash
  # Open the CSVs to save data to
  sales_csv = CSV.open("quirky_everything_units_sold_data.csv", 'a')
  payouts_csv = CSV.open("quirky_everything_payouts_data.csv", 'a')
  # Write the header row of column labels to each CSV
  csv_header = default_stats.concat(default_pricing).concat(date_range)
  sales_csv << csv_header
  payouts_csv << csv_header
  for id in a..z
    pp "New product w/ id: " + id.to_s
    page = get_page( id )
    # Start the row with the product name
    stats = []
    stats.push( get_product_name( page ) )
    # Add the category
    category = category_hash[id]
    if category
      stats.push(category)
    else
      stats.push("NA")
    end
    # Add the product id
    stats.push(id)
    # Catch errors where we get redirected to the store
    begin
      # Get the product status
      stats.push( get_product_status(page) )
      # Get the summary stats
      get_summary_stats( page, stats )
      # Get the monthly data
      pricing_save, payouts_save, units_save = get_monthly_data( page, date_range )
      # Write data to the units CSV
      # (use + rather than concat so that stats itself isn't mutated)
      sales_csv_data = stats + pricing_save + units_save
      sales_csv << sales_csv_data
      # Write data to the payouts CSV
      payouts_csv_data = stats + pricing_save + payouts_save
      payouts_csv << payouts_csv_data
    rescue
      # The page didn't have the expected data; save what we have
      sales_csv << stats
      payouts_csv << stats
    end
  end
  # Close the CSVs
  sales_csv.close
  payouts_csv.close
end
# Go up to 885
collect_data( 1, 885 )
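# Usage (assuming this file is saved as, say, quirky_scraper.rb):
#   ruby quirky_scraper.rb
# Rows are appended to quirky_everything_units_sold_data.csv and
# quirky_everything_payouts_data.csv in the working directory.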