Skip to content

Instantly share code, notes, and snippets.

@albulescu
Last active August 29, 2015 14:00
Show Gist options
  • Save albulescu/11232996 to your computer and use it in GitHub Desktop.
Save albulescu/11232996 to your computer and use it in GitHub Desktop.
Products ripper
require 'open-uri'
require 'nokogiri' #You need to install this gem
require 'net/http'
require 'pp'
require 'csv'
=begin
@author Albulescu Cosmin <cosmin@albulescu.ro>
All XPaths were copied from the Chrome browser: inspect the element, right-click it and choose "Copy XPath".
=end
# Name of the CSV file that receives one row per scraped product.
products_file_name = 'products.csv'

# File used to record requests that did not return HTTP 200. It is opened in
# append mode, so entries written by earlier runs are kept.
fail_file = File.open('fail_list', 'a+')

# Values for the listing's "alpha" query parameter: A-Z followed by 1-9.
# Every entry is a String because each one is spliced into a URL string;
# Integer entries would raise TypeError when concatenated with the URL.
pages = ('A'..'Z').to_a + ('1'..'9').to_a

# Start every run with a fresh products file.
File.delete products_file_name if File.exist? products_file_name

# Products CSV. Kept in a global because read_product appends rows to it.
$products = CSV.open(products_file_name, "wb")
$products << ['URL', 'TITLE', 'BRAND/COMPANY', 'QUANTITY', 'ART.NR.',
              'PZN', 'EAN', 'PRICE 1', 'PRICE 2', 'DELIVERY TIME',
              'DELIVERY INFO', 'IMAGE']

# Counter used in the progress output to show how many products were handled.
$write_count = 1
# Returns +text+ with leading and trailing whitespace removed.
# A nil argument is handed back unchanged, so callers can pass along values
# that were not found without checking first.
def strip(text)
  return text if text.nil?

  text.strip
end
# Extracts the amount from a price label: the text is split on whitespace and
# the second token is returned (passed through the strip helper).
# nil is returned unchanged; nil is also the result when the text has fewer
# than two tokens.
def price(text)
  return text if text.nil?

  tokens = text.to_s.split(' ')
  strip(tokens[1])
end
# Removes a leading "Label:" prefix from +text+ and returns the trimmed value.
#
# Everything after the FIRST colon is treated as the value, so values that
# contain a colon themselves are kept whole.
# Returns nil when +text+ is nil or contains no colon at all.
def no_label(text)
  return text if text.nil?

  # Limit the split to two parts; an unlimited split would cut the value off
  # at its own first colon.
  strip(text.to_s.split(':', 2)[1])
end
# Maps one specific image URL to 'N/A' and returns every other value as is.
# NOTE(review): the matched URL is presumably the shop's "no image"
# placeholder - confirm against the site.
def check_image(image)
  placeholder = 'http://www.volksversand.de/images/product_images/popup_images/04.jpg'
  image.to_s == placeholder ? 'N/A' : image
end
#
# Reads the data of one product from its parsed page and appends it as a row
# to the products CSV.
#
# doc - parsed product page; must answer #xpath the way a Nokogiri document does
# url - String URL of the product page; written to the first column
#
# Side effects: prints a progress line, appends a row to $products and flushes
# it, and increments $write_count.
#
def read_product(doc, url)
  # Prefix shared by every XPath below that points into the product info box.
  info = '//*[@id="productinfowrap"]/div[1]/div[1]/div[2]/div'

  puts "\##{$write_count} - #{url}..."

  # The href lookup yields nil when the page has no image link. Record 'N/A'
  # in that case instead of failing on nil and aborting the whole crawl.
  href = doc.xpath('//*[@id="imageTarget"]/a').attribute('href')
  image = href.nil? ? 'N/A' : check_image('http://www.volksversand.de/' + href.value)

  # Column order must match the header row of the products CSV.
  row = [
    url,                                                 # URL
    doc.xpath('//*[@id="cart_quantity"]/div/*/h1').text, # TITLE (xpath adjusted by hand)
    doc.xpath('//*[@id="cart_quantity"]/div/*/p').text,  # BRAND/COMPANY (xpath adjusted by hand)
    doc.xpath("#{info}/div/p[1]").text,                  # QUANTITY
    no_label(doc.xpath("#{info}/div/p[2]").text),        # ART.NR.
    no_label(doc.xpath("#{info}/div/p[3]").text),        # PZN
    no_label(doc.xpath("#{info}/div/p[4]").text),        # EAN
    price(doc.xpath("#{info}/p[1]").text),               # PRICE 1
    price(doc.xpath("#{info}/p[2]").text),               # PRICE 2
    strip(doc.xpath("#{info}/p[5]").text),               # DELIVERY TIME
    strip(doc.xpath("#{info}/p[6]").text),               # DELIVERY INFO (tax and shipping)
    image                                                # IMAGE
  ]

  # Write the row and flush right away so an interrupted run loses nothing.
  $products << row
  $products.flush
  $write_count += 1
end
# Crawl every listing page, follow each product link found on it and hand the
# parsed product page to read_product. Requests that fail are appended to the
# fail file as "<status>@<page or url>".

# Appends one failure entry and flushes so the log survives an aborted run.
log_failure = lambda do |status, target|
  fail_file.write "#{status}@#{target}\n"
  fail_file.flush
end

begin
  pages.each do |alpha|
    # Interpolation (rather than String#+) also accepts non-String page values.
    listing_url = "http://www.volksversand.de/shop_content.php?coID=116&alpha=#{alpha}"

    begin
      # URI#open only ever performs a network request. Kernel#open would treat
      # a string starting with "|" as a command to run, and on current Ruby
      # versions it no longer fetches URLs.
      URI.parse(listing_url).open do |listing|
        if listing.status[0] == '200'
          doc = Nokogiri::HTML(listing.read)

          # Read all product links from the current listing page.
          doc.xpath('//*[@id="content"]/div/div[*]/a').each do |link|
            href = link.attribute('href')
            next if href.nil? # an anchor without href cannot be followed

            product_url = href.value

            begin
              URI.parse(product_url).open do |product|
                if product.status[0] == '200'
                  # Read the product page and save it.
                  read_product Nokogiri::HTML(product.read), product_url
                else
                  # Log the status of the product request itself.
                  log_failure.call(product.status[0], product_url)
                end
              end
            rescue OpenURI::HTTPError => e
              # open-uri reports error responses (e.g. 404, 500) by raising;
              # log them and continue with the next product.
              log_failure.call(e.io.status[0], product_url)
            end

            # Throttle requests to one product per second.
            sleep 1
          end
        else
          log_failure.call(listing.status[0], alpha)
        end
      end
    rescue OpenURI::HTTPError => e
      log_failure.call(e.io.status[0], alpha)
    end
  end
ensure
  # Close both files even when the crawl is aborted by an unexpected error.
  $products.close
  fail_file.close
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment