# Ruby script to collect data from Quirky product pages
# Last Updated: 2014.06.05
# Gems to include
require 'nokogiri'
require 'open-uri'
require 'pp'
require 'json'
require 'csv'
def get_default_data
  # Initialize each array separately; a chained assignment
  # (a = b = c = []) would alias all three names to one array
  default_stats = []
  default_pricing = []
  date_range = []
  # Open the CSV with all the dates and default values
  CSV.foreach("quirky_data_all.csv") do |row|
    if row[0] == "Name"
      default_stats = row.compact
    elsif row[0] == "Low"
      default_pricing = row.compact
    else
      date_range = row.compact
    end
  end
  # Zero-pad single-digit numbers in the dates (e.g. 5 -> 05)
  for d in 0..(date_range.length - 1)
    date = date_range[d]
    num = date[/\d+/].to_i
    if num < 10
      num = "%02d" % num
      date_range[d] = date_range[d].gsub(/\d+/, num)
    end
  end
  return default_stats, default_pricing, date_range
end
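# For reference, a sketch of the shape quirky_data_all.csv is assumed to
# take (the file itself is not part of this gist, so the labels and date
# format below are illustrative guesses based on how the rows are used):
#
#   Name,Category,ID,Status,Retail Price,Units Sold,Community Earned,Total Influencers,On Sale
#   Low,Q1,Median,Q3,High
#   Jan 01- 2013,Feb 01- 2013,Mar 01- 2013,...
#
# The row starting "Name" supplies the summary-stat column labels, the row
# starting "Low" the pricing columns, and the remaining row the date range.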
def get_category_hash
  category_hash = Hash.new
  # Open the CSV that maps product ids to their categories
  CSV.foreach("quirky_products.csv", :encoding => 'UTF-8') do |row|
    category_hash[row[1].to_i] = row[0]
  end
  return category_hash
end
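# quirky_products.csv is assumed to hold a category name in the first
# column and a product id in the second, e.g. (illustrative values):
#
#   Kitchen,42
#   Electronics,87
#
# which would yield category_hash == { 42 => "Kitchen", 87 => "Electronics" }.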
def get_page(id)
  begin
    url = "https://www.quirky.com/products/#{ id }/timeline"
    uri = URI.parse(URI.encode(url.strip))
    page = Nokogiri::HTML(open(uri))
  rescue
    # Most likely rate-limited: wait a minute and retry once
    pp "starting sleep"
    sleep 60
    pp "ending sleep"
    page = Nokogiri::HTML(open(uri))
  end
  return page
end
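# A possible hardening of get_page (a sketch only; nothing below calls it):
# retry a bounded number of times instead of exactly once, so a transient
# outage gets several chances while a persistent one eventually raises.
def get_page_with_retries(id, max_attempts = 3)
  url = "https://www.quirky.com/products/#{ id }/timeline"
  uri = URI.parse(URI.encode(url.strip))
  attempts = 0
  begin
    Nokogiri::HTML(open(uri))
  rescue
    attempts += 1
    raise if attempts >= max_attempts
    sleep 60
    retry
  end
end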
def get_product_name(page)
  # Get the name of the product; the rescue catches 404 pages
  # that lack the expected markup
  begin
    name = page.css(".ui .segment").css(".ui .header")[0].content.strip
  rescue
    name = "NA"
  end
  return name
end
def get_product_status(page)
  status_data = page.css(".ui .segment").css(".icon")
  begin
    # Walk the child nodes looking for the "Status:" label,
    # then take the next non-blank text node as the status
    for b in 0..(status_data.children.length - 1)
      text = status_data.children[b].content.strip
      if text == "Status:"
        b += 1
        text = status_data.children[b].content.strip
        until text =~ /^\S/ do
          b += 1
          text = status_data.children[b].content.strip
        end
        status = text
      end
    end
    # Default if no status was found
    if status.nil?
      status = "NA"
    end
  rescue
    status = "NA"
  end
  return status
end
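# The scan above assumes markup along the lines of (illustrative; the real
# page structure is not preserved in this gist):
#
#   <div class="icon">Status: <!-- whitespace nodes --> Selling</div>
#
# i.e. a "Status:" text node followed by zero or more blank nodes and then
# the status value itself.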
def get_summary_stats(page, stats)
  # Get the stats block of the page
  data = page.css(".ui .timeline-stats").css(".column")
  # Set to nil so missing values can be detected below
  retail_price = units_sold = community_earned = total_influencers = nil
  # Each label is followed by its value a few text nodes later,
  # so scan forward from each label until the value pattern matches
  for a in 0..(data.children.length - 1)
    text = data.children[a].content.strip
    if text == "Retail Price"
      until text =~ /^\$\d/ do
        a += 1
        text = data.children[a].content.strip
      end
      retail_price = text
    elsif text == "Units Sold"
      until text =~ /^\d/ do
        a += 1
        text = data.children[a].content.strip
      end
      units_sold = text
    elsif text == "Community Earned"
      until text =~ /^\$\d/ do
        a += 1
        text = data.children[a].content.strip
      end
      community_earned = text
    elsif text == "Total Influencers"
      until text =~ /^\d/ do
        a += 1
        text = data.children[a].content.strip
      end
      total_influencers = text
    end
  end
  # Save each stat, falling back to "NA" when it was not found:
  # retail price, units sold, community earnings, total influencers
  stats.push(retail_price || "NA")
  stats.push(units_sold || "NA")
  stats.push(community_earned || "NA")
  stats.push(total_influencers || "NA")
  # Placeholder for the "On Sale" column
  stats.push("NA")
end
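# By the time this returns, the caller's stats array might look like
# (entirely illustrative values):
#
#   ["Pivot Power", "Electronics", 42, "Selling",
#    "$29.99", "373", "$5,000", "709", "NA"]
#
# i.e. name, category, id, status, then the five summary columns above.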
def extract_pricing_data(pricing)
  pricing_save = []
  # Use the pricing data scraped, or fill with NA
  if !pricing.nil?
    pricing = pricing['sparkline']['pricing']['data']
    # 'meian' is the deliberately misspelled 'median' key set up in
    # get_monthly_data so the quoting pass there does not clobber it
    ['low_whisker', 'firstq', 'meian', 'thirdq', 'high_whisker'].each do |key|
      pricing_save.push(pricing[key].nil? ? 'NA' : pricing[key])
    end
  else
    # No pricing scraped: fill with NA
    pricing_save = Array.new(5, 'NA')
  end
  return pricing_save
end
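# The pricing argument, when present, is assumed to have been parsed into
# roughly this shape by get_monthly_data below (illustrative values; the
# quoting pass there determines whether values arrive as strings or numbers):
#
#   { "sparkline" => { "pricing" => { "data" => {
#       "low_whisker" => 19.99, "firstq" => 24.99, "meian" => 29.99,
#       "thirdq" => 34.99, "high_whisker" => 39.99 } } } }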
def extract_payouts_data(payouts, date_range)
  payouts_save = []
  if !payouts.nil?
    payouts_dates = payouts['sparkline']['payout']['dates']
    payouts_data = payouts['sparkline']['payout']['data']
    # Normalize the scraped dates to match date_range's format
    for a in 0..(payouts_dates.length - 1)
      payouts_dates[a] = payouts_dates[a].gsub(/,/, "-")
    end
    # Turn the dates array into a date => index hash for quick lookup
    hash_payouts_dates = Hash[payouts_dates.map.with_index.to_a]
    # Go through the date range and see which payouts to add
    for c in 0..(date_range.length - 1)
      index = hash_payouts_dates[date_range[c]]
      # If the date is in payouts_dates and the index is within
      # payouts_data, save the data; otherwise record NA
      if !index.nil? && index < payouts_data.length
        payouts_save.push(payouts_data[index])
      else
        payouts_save.push("NA")
      end
    end
  else
    # No payouts scraped: fill with NA
    payouts_save = Array.new(date_range.length, 'NA')
  end
  return payouts_save
end
def extract_units_data(units, date_range)
  units_save = []
  # Use the units-sold data scraped, or fill with NA
  if !units.nil?
    units_dates = units['sparkline']['units']['dates']
    units_data = units['sparkline']['units']['data']
    # Normalize the scraped dates to match date_range's format
    for a in 0..(units_dates.length - 1)
      units_dates[a] = units_dates[a].gsub(/,/, "-")
    end
    # Turn the dates array into a date => index hash for quick lookup
    hash_units_dates = Hash[units_dates.map.with_index.to_a]
    # Go through the date range and see which unit counts to add
    for c in 0..(date_range.length - 1)
      index = hash_units_dates[date_range[c]]
      # If the date is in units_dates and the index is within
      # units_data, save the data; otherwise record NA
      if !index.nil? && index < units_data.length
        units_save.push(units_data[index])
      else
        units_save.push("NA")
      end
    end
  else
    # No units scraped: fill with NA
    units_save = Array.new(date_range.length, 'NA')
  end
  return units_save
end
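# extract_payouts_data and extract_units_data share the same logic; a
# generic helper could replace both (a sketch only; nothing below uses it):
def extract_series_data(blob, series, date_range)
  return Array.new(date_range.length, 'NA') if blob.nil?
  dates = blob['sparkline'][series]['dates'].map { |d| d.gsub(/,/, '-') }
  data = blob['sparkline'][series]['data']
  index_by_date = Hash[dates.map.with_index.to_a]
  date_range.map do |date|
    i = index_by_date[date]
    (!i.nil? && i < data.length) ? data[i] : 'NA'
  end
end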
def get_monthly_data(page, date_range)
  # All the data is in a script element
  x = page.search("script")
  # Flag for whether we have found the node with the data
  found = false
  # Index of the script node being examined
  y = 0
  # Create variables for each of the 3 data sets we are looking for
  pricing = payouts = units = nil
  begin
    # Go through the script nodes to find the one with the data
    until found do
      # Get the node and its content
      node = x.children[y]
      if node.nil?
        break
      else
        node = node.content
      end
      # If the node contains any of the data fields we want, extract them
      if node =~ /pricing/ || node =~ /payout/ || node =~ /units/
        found = true
        # Remove newlines and spaces, and replace single with double quotes
        node = node.gsub(/[\r\n]/m, '')
        node = node.delete(' ')
        node = node.gsub(/'/, '"')
        # Replace 'median' with 'meian' so the word-wrapping pass below
        # does not mangle it (gsub on 'med' would otherwise hit it)
        node = node.gsub("median", "meian")
        # Split the node into the discrete data segments
        datas = node.split(");")
        # Loop through each data segment
        for z in 0..(datas.length - 1)
          data = datas[z]
          # Strip the jQuery call at the beginning
          data = data.gsub("$.extend(true,quirky.config,", '')
          # Wrap each bare key word in quotes so the string parses as JSON
          words = ['sparkline', 'payout', 'dates', 'data', 'units', 'pricing', 'fields', 'lw', 'lq', 'meian', 'med', 'uq', 'rw', 'low_whisker', 'q1', 'q3', 'high_whisker']
          for i in 0..(words.length - 1)
            word = words[i]
            if word != ""
              data = data.gsub(word, '"' + word + '"')
            end
          end
          # Fix up issues with the pricing data format
          if data =~ /pricing/
            # Strip thousands separators from numbers
            data = data.gsub(/(\d+),(\d+)/, '\1\2')
            # Quote space-padded numbers (spaces were stripped above,
            # so in practice this may be a no-op)
            data = data.gsub(/( \d+\.?\d* )/, '"' + '\1' + '"')
            # Rename q1/q3 to digit-free keys so later substitutions
            # cannot match the digits
            data = data.sub('q1', 'firstq')
            data = data.sub('q3', 'thirdq')
            # Eliminate the trailing comma
            data = data.gsub(',}', '}')
          end
          # Parse and save whichever data set this segment holds
          if data =~ /pricing/
            pricing = JSON.parse(data)
          elsif data =~ /payout/
            payouts = JSON.parse(data)
          elsif data =~ /units/
            units = JSON.parse(data)
          end
        end
      else
        # Move on to the next script node
        y += 1
      end
    end
    # Construct the arrays to save to CSV, filling gaps with NA
    pricing_save = extract_pricing_data(pricing)
    payouts_save = extract_payouts_data(payouts, date_range)
    units_save = extract_units_data(units, date_range)
    return pricing_save, payouts_save, units_save
  rescue
    pp "ERROR"
  end
end
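# The inline script being dissected above is assumed to contain jQuery
# configuration calls roughly like (reconstructed from the cleanup steps,
# with illustrative values):
#
#   $.extend(true, quirky.config, { sparkline: { pricing: { data: {
#     low_whisker: 19.99, q1: 24.99, median: 29.99,
#     q3: 34.99, high_whisker: 39.99 } } } });
#
# which the string surgery turns into parseable JSON.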
## COLLECT DATA
def collect_data(a, z)
  default_stats, default_pricing, date_range = get_default_data
  category_hash = get_category_hash
  # Open the CSVs to save data to
  sales_csv = CSV.open("quirky_everything_units_sold_data.csv", 'a')
  payouts_csv = CSV.open("quirky_everything_payouts_data.csv", 'a')
  # Write the header row with column labels
  # (use + rather than concat so default_stats is not mutated)
  csv_header = default_stats + default_pricing + date_range
  sales_csv << csv_header
  payouts_csv << csv_header
  for id in a..z
    pp "New product w/ id: " + id.to_s
    page = get_page(id)
    # Get the product name
    stats = []
    stats.push(get_product_name(page))
    # Add the category, or NA if the id is not in the category hash
    category = category_hash[id]
    if category
      stats.push(category)
    else
      stats.push("NA")
    end
    # Add the product id
    stats.push(id)
    # Catch errors where we get redirected to the store
    begin
      # Get the product status
      stats.push(get_product_status(page))
      # Get the summary stats
      get_summary_stats(page, stats)
      # Get monthly data
      pricing_save, payouts_save, units_save = get_monthly_data(page, date_range)
      # Write data to the units CSV
      sales_csv << stats + pricing_save + units_save
      # Write data to the payouts CSV
      payouts_csv << stats + pricing_save + payouts_save
    rescue
      # On failure, still record whatever summary stats we collected
      sales_csv << stats
      payouts_csv << stats
    end
  end
  # Close the CSVs
  sales_csv.close
  payouts_csv.close
end
# Product ids go up to 885
collect_data(1, 885)