Created
February 22, 2017 02:56
-
-
Save fvaletk/66dcafe5bd0bbcfeb63297f8bb217cbb to your computer and use it in GitHub Desktop.
jcpenny_scraper.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Third-party gems used by this script: HTTP client, HTML parser, debugger.
# The feature names are lowercase; the capitalised spellings only resolve on
# case-insensitive filesystems and raise LoadError elsewhere.
require 'httparty'
require 'nokogiri'
require 'pry'
# Prefix that an image file name is appended to when building image URLs.
@image_base_url = 'http://zoom.jcpenney.com/is/image/'
# Query strings appended after the image file name; they differ only in the
# requested width/height (640, 96 and 2000 pixels respectively).
@image_medium = '?wid=640&hei=640&op_usm=.4,.8,0,0&resmode=sharp2&op_usm=1.5,.8,0,0&resmode=sharp'
@image_small = '?wid=96&hei=96&op_usm=.4,.8,0,0&resmode=sharp2&op_usm=1.5,.8,0,0&resmode=sharp'
@image_large = '?wid=2000&hei=2000&op_usm=.4,.8,0,0&resmode=sharp2&op_usm=1.5,.8,0,0&resmode=sharp'
# Converts a scraped price into a Float.
#
# Dollar signs, thousands separators (",") and surrounding whitespace are
# removed before conversion, so "1,299.00" yields 1299.0 instead of stopping
# at the comma.
#
# @param price [String, Numeric, nil] the price text or number
# @return [Float] the numeric value; 0.0 when the text has no leading number
def price_to_float(price)
  price.to_s.delete('$,').strip.to_f
end
# Download the product page; page_dom is the HTTParty response object.
page_dom = HTTParty.get('http://www.jcpenney.com/dale-tiffany-rose-floor-lamp/prod.jump?ppId=pp5004610106&catId=SearchResults&searchTerm=lamp')
# Work on the raw body String explicitly rather than relying on the response
# object forwarding String methods.
html = page_dom.body.to_s
page = Nokogiri::HTML(html)
product = {}
# Container that all the option and colour lookups further down search inside.
item = page.at_css('.pdp_details')
# Without the container every later lookup would fail on nil, so stop here
# with a readable message.
abort 'Product details container (.pdp_details) not found' if item.nil?
product_id_node = page.at_css('#ppIdorLotId')
product_id = product_id_node ? product_id_node.text : nil
warn 'Product id element (#ppIdorLotId) not found' if product_id.nil?
# Debug only: look for the selected SKU id inside the loginModelParentUrl input.
ar = html.match(/<input id=\"loginModelParentUrl\".*?selectedSKUId=([0-9]*?)\&/i)
puts "1***************************************"
if ar
  puts "AR #{ar.inspect}"
else
  puts "AR Nil"
end
puts "2***************************************"
# Prices
# Sale price: text of the first span.gallery_page_price.flt_wdt, if present.
# The former `elsif` fallback to a `.comparisonPrice` variant was removed: its
# test selector looked for a descendant *element* named comparisonPrice, and
# any span carrying the extra class already matches the selector used here,
# so that branch could never supply a value.
sale_price_node = page.at_css('span.gallery_page_price.flt_wdt')
sale_price = sale_price_node ? sale_price_node.text.strip : nil
if sale_price
  # Accept thousands separators ("1,299.00") and drop them before converting.
  ar = sale_price.match(/(\d[\d,]*\.?\d*)/)
  # No number in the text means no sale price (same rule as the list price
  # below) rather than leaving the raw text in place.
  sale_price = ar ? price_to_float(ar[1].delete(',')) : nil
end
puts "SALE PRICE #{sale_price.inspect}"
puts "3***************************************"
# List price: first number found in the text of the pp_page_price span(s).
list_price = page.css('span.pp_page_price.flt_wdt').text.strip
ar = list_price.match(/(\d[\d,]*\.?\d*)/)
list_price = ar ? price_to_float(ar[1].delete(',')) : nil
puts "LIST PRICE #{list_price.inspect}"
puts "4***************************************"
# 'price' is the sale price when there is one, with the list price kept as
# 'initial_price'; otherwise the list price alone becomes 'price'.
if sale_price
  product['price'] = sale_price
  product['initial_price'] = list_price if list_price
elsif list_price
  product['price'] = list_price
end
puts "PRODUCT #{product.inspect}"
# options[i] holds the distinct values of the i-th option group found on the
# page and names[i] the key later used for that group in the sizes hash.
# Numbering starts at 1, so names[0] is never assigned.
options = {}
names = []
# Number of option groups found so far.
nSkus = 0
puts "5****************************************"
#products (?)
subproducts = item.css('li#product a').map { |link| link.text.strip }.uniq
unless subproducts.empty?
  nSkus += 1
  options[nSkus] = subproducts
  names[nSkus] = 'selectedLotValue'
end
puts "OPTIONS 1 -> #{options.inspect}"
puts "SUBPRODUCTS -> 1 #{subproducts.inspect}"
puts "nSkus 1 -> #{nSkus.inspect}"
puts "NAMES 1 -> #{names.inspect}"
puts "6*****************************************"
#size_range
# XPath is used because the id contains a space. The leading "." keeps the
# search inside `item`, like the CSS lookups elsewhere in this script; a bare
# "//" would search the whole document regardless of the receiver.
size_range = item.xpath(".//li[@id='size range']/a").map { |link| link.text.strip }.uniq
unless size_range.empty?
  nSkus += 1
  options[nSkus] = size_range
  # NOTE(review): this is the same name the sub-product option uses, so when
  # both groups exist they end up under one key in the sizes hash -- confirm
  # that this is intended.
  names[nSkus] = 'selectedLotValue'
end
puts "SIZE RANGE -> #{size_range.inspect}"
puts "OPTIONS 2 -> #{options.inspect}"
puts "SUBPRODUCTS 2 -> #{subproducts.inspect}"
puts "nSkus 2 -> #{nSkus.inspect}"
puts "NAMES 2 -> #{names.inspect}"
puts "7*****************************************"
# sizes
# Distinct link texts of the size option list inside the product container.
sizes = item.css('div#skuOptions_size li#size a').map { |link| link.text.strip }.uniq
unless sizes.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = sizes
  names[nSkus] = 'skuSelectionMap.SIZE'
end
puts "SIZE -> #{sizes.inspect}"
puts "OPTIONS 3 -> #{options.inspect}"
puts "SUBPRODUCTS 3 -> #{subproducts.inspect}"
puts "nSkus 3 -> #{nSkus.inspect}"
puts "NAMES 3 -> #{names.inspect}"
puts "8*****************************************"
# waist
# Distinct link texts of the waist option list inside the product container.
waist = item.css('div#skuOptions_waist li#waist a').map { |link| link.text.strip }.uniq
unless waist.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = waist
  names[nSkus] = 'skuSelectionMap.WAIST'
end
puts "WAIST -> #{waist.inspect}"
puts "OPTIONS 4 -> #{options.inspect}"
puts "SUBPRODUCTS 4 -> #{subproducts.inspect}"
puts "nSkus 4 -> #{nSkus.inspect}"
puts "NAMES 4 -> #{names.inspect}"
puts "9*****************************************"
# inseam
# Distinct link texts of the inseam option list inside the product container.
inseam = item.css('div#skuOptions_inseam li#inseam a').map { |link| link.text.strip }.uniq
unless inseam.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = inseam
  names[nSkus] = 'skuSelectionMap.INSEAM'
end
puts "INSEAM -> #{inseam.inspect}"
puts "OPTIONS 5 -> #{options.inspect}"
puts "SUBPRODUCTS 5 -> #{subproducts.inspect}"
puts "nSkus 5 -> #{nSkus.inspect}"
puts "NAMES 5 -> #{names.inspect}"
puts "10****************************************"
# neck size
# XPath is used because the id contains a space. The leading "." keeps the
# search inside `item`, like the CSS lookups elsewhere in this script; a bare
# "//" would search the whole document regardless of the receiver.
neck_size = item.xpath(".//li[@id='neck size']/a").map { |link| link.text.strip }.uniq
unless neck_size.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = neck_size
  names[nSkus] = 'skuSelectionMap.NECK_SIZE'
end
puts "NECK SIZE -> #{neck_size.inspect}"
puts "OPTIONS 6 -> #{options.inspect}"
puts "SUBPRODUCTS 6 -> #{subproducts.inspect}"
puts "nSkus 6 -> #{nSkus.inspect}"
puts "NAMES 6 -> #{names.inspect}"
puts "11****************************************"
# sleeves
# Distinct link texts of the sleeve option list inside the product container.
sleeves = item.css('div#skuOptions_sleeve li#sleeve a').map { |link| link.text.strip }.uniq
unless sleeves.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = sleeves
  names[nSkus] = 'skuSelectionMap.SLEEVE'
end
puts "SLEEVE -> #{sleeves.inspect}"
puts "OPTIONS 7 -> #{options.inspect}"
puts "SUBPRODUCTS 7 -> #{subproducts.inspect}"
puts "nSkus 7 -> #{nSkus.inspect}"
puts "NAMES 7 -> #{names.inspect}"
puts "12****************************************"
#chest
# Distinct link texts of the chest option list inside the product container.
chest = item.css('div#skuOptions_chest li#chest a').map { |link| link.text.strip }.uniq
unless chest.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = chest
  names[nSkus] = 'skuSelectionMap.CHEST'
end
puts "CHEST -> #{chest.inspect}"
puts "OPTIONS 8 -> #{options.inspect}"
puts "SUBPRODUCTS 8 -> #{subproducts.inspect}"
puts "nSkus 8 -> #{nSkus.inspect}"
puts "NAMES 8 -> #{names.inspect}"
puts "13****************************************"
# length
# Distinct link texts of the length option list inside the product container.
length = item.css('div#skuOptions_length li#length a').map { |link| link.text.strip }.uniq
unless length.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = length
  names[nSkus] = 'skuSelectionMap.LENGTH'
end
puts "LENGTH -> #{length.inspect}"
puts "OPTIONS 9 -> #{options.inspect}"
puts "SUBPRODUCTS 9 -> #{subproducts.inspect}"
puts "nSkus 9 -> #{nSkus.inspect}"
puts "NAMES 9 -> #{names.inspect}"
puts "14****************************************"
# width
# Distinct link texts of the width option list inside the product container.
width = item.css('div#skuOptions_width li#width a').map { |link| link.text.strip }.uniq
unless width.empty?
  # Register the group under the next option index.
  nSkus += 1
  options[nSkus] = width
  names[nSkus] = 'skuSelectionMap.WIDTH'
end
puts "WIDTH -> #{width.inspect}"
puts "OPTIONS 10 -> #{options.inspect}"
puts "SUBPRODUCTS 10 -> #{subproducts.inspect}"
puts "nSkus 10 -> #{nSkus.inspect}"
puts "NAMES 10 -> #{names.inspect}"
puts "15****************************************"
# color names
# Distinct `name` attributes of the swatch images; swatches without a name
# are skipped because the name is stripped later and nil would fail there.
colors = item.css('ul.small_swatches a.swatch img').map { |img| img['name'] }.compact.uniq
images = {} # image file base name by color name
# the default image name of the product page, we use it for colors that have no image of their own
# It stays nil unless the og:image meta tag exists and its content contains
# a ".tif" file name, so it is always either a String or nil.
default_image = nil
og_image = page.at_css('meta[property="og:image"]')
if og_image
  match = og_image['content'].to_s.match(/[^\/?]+\.tif/)
  default_image = match.to_s if match
end
# we search for the other image names
item.css('ul.small_swatches a.swatch').each do |a|
  color_name = a.xpath('img/@name').to_s
  # A swatch link may have no onclick attribute at all; to_s keeps the match
  # from failing on nil.
  match = a['onclick'].to_s.match(/'([^']+\.tif)'/i)
  # No quoted .tif name in the handler means no image of its own: use the default.
  images[color_name] = match ? match[1] : default_image
end
puts "COLORS -> #{colors.inspect}"
puts "OPTIONS 11 -> #{options.inspect}"
puts "SUBPRODUCTS 11 -> #{subproducts.inspect}"
puts "nSkus 11 -> #{nSkus.inspect}"
puts "NAMES 11 -> #{names.inspect}"
puts "16****************************************"
# colors and sizes
# Size name used when the product has no selectable options. The constant was
# referenced without being defined anywhere in this script, which raised
# NameError for every option-less product.
# NOTE(review): the label text is an assumption -- confirm the expected value.
ONE_SIZE = 'ONE SIZE' unless defined?(ONE_SIZE)

# Builds a full image URL, or nil when no usable image file name is known
# (String + nil would otherwise raise TypeError).
image_url = lambda do |file_name, size_query|
  file_name.is_a?(String) ? @image_base_url + file_name + size_query : nil
end

# Every combination of one value per option group, in option order. Each
# combination is a flat Array; with no options there is exactly one, empty,
# combination. The result does not depend on the color, so it is computed once.
size_combos = options.values.reduce([[]]) do |combos, opt_values|
  # the cartesian product nests the previous combination, so flatten each pair
  combos.product(opt_values).map(&:flatten)
end

product['colors'] = []
product['style_id'] = product_id
colors.each do |color|
  # first, add the color in product.colors
  color_size_dicts = []
  product['colors'] << {
    'name' => color.strip,
    'sizes' => color_size_dicts,
    'image' => image_url.call(images[color], @image_small),
    'medium_image' => image_url.call(images[color], @image_medium),
    'large_image' => image_url.call(images[color], @image_large),
  }
  # create the color_size_dict for every size_combo
  size_combos.each do |size_combo|
    # get the sizes hash for get_size_info_params
    sizes_dict = {}
    options.each_key do |name_index|
      # option indices start at 1 while the combination Array starts at 0
      sizes_dict[names[name_index]] = size_combo[name_index - 1]
    end
    # sizes_dict will be empty if one size only
    size_name = sizes_dict.values.map { |value| " #{value.strip.upcase}" }.join
    puts "SIZE NAME 11 -> #{size_name.inspect}"
    # size_name will be blank if one size only
    size_name = ONE_SIZE if size_name == ''
    color_size_dicts << {
      'name' => size_name.strip,
      'get_size_info_params' => {
        'style_id' => product_id,
        'color' => color.strip,
        'sizes' => sizes_dict,
      }
    }
  end
  puts "SIZE COMBOS 11 -> #{size_combos.inspect}"
end
puts "OPTIONS 11 -> #{options.inspect}"
puts "SUBPRODUCTS 11 -> #{subproducts.inspect}"
puts "nSkus 11 -> #{nSkus.inspect}"
puts "NAMES 11 -> #{names.inspect}"
puts "PRODUCTS -> #{product.inspect}"
# Drop into an interactive session with all the locals above available.
Pry.start(binding)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment