# albulescu/rip.rb

## rip.rb
require 'open-uri'
require 'nokogiri' #You need to install this gem
require 'net/http'
require 'pp'
require 'csv'

=begin

  @author Albulescu Cosmin <cosmin@albulescu.ro>

  All XPaths were copied from the Chrome browser: inspect the element, right-click it and choose "Copy XPath".

=end

# Name of the CSV file that receives one row per scraped product.
products_file_name = 'products.csv'

# File used to write non 200 http responses (opened in append mode, so
# entries from earlier runs are kept).
fail_file = File.open('fail_list', 'a+')

# Keys of the index pages: 'A'..'Z' followed by '1'..'9'.
# They are all Strings because each key is concatenated into the page URL;
# Integer keys would raise a TypeError on String#+.
pages = ('A'..'Z').to_a + ('1'..'9').to_a

# Delete the products csv file left over from a previous run
File.delete products_file_name if File.exist? products_file_name

# products csv (global because read_product writes to it)
$products = CSV.open(products_file_name, "wb")

# Header row; read_product pushes its values in exactly this order.
$products << ['URL', 'TITLE', 'BRAND/COMPANY', 'QUANTITY', 'ART.NR.',
              'PZN', 'EAN', 'PRICE 1', 'PRICE 2', 'DELIVERY TIME',
              'DELIVERY INFO', 'IMAGE']

# counter to know where we are (1-based, printed in the progress line)
$write_count = 1


# Removes leading and trailing whitespace from +text+.
# A nil argument is handed back untouched.
def strip(text)
  return text if text.nil?

  text.strip
end

# Pulls the price out of a "label<two spaces>price" string.
# The text is cut at every run of two spaces and the second piece is
# returned with surrounding whitespace removed. Returns nil when there is
# no second piece; a nil argument is handed back untouched.
def price(text)
  return text if text.nil?

  pieces = text.to_s.split('  ')
  strip pieces[1]
end


# Drops the "Label:" prefix from a "Label: value" string.
# The text is cut at every colon and the second piece is returned with
# surrounding whitespace removed. Returns nil when there is no second
# piece; a nil argument is handed back untouched.
def no_label(text)
  return text if text.nil?

  _label, value = text.to_s.split(':')
  strip value
end

# Maps one specific image URL (04.jpg in the shop's popup_images folder)
# to 'N/A'. Any other value is returned unchanged.
def check_image(image)
  placeholder = 'http://www.volksversand.de/images/product_images/popup_images/04.jpg'
  image.to_s == placeholder ? 'N/A' : image
end

#
# Reads the product data from a parsed product page and appends it as one
# row to the products CSV.
#
# doc - parsed product page; only #xpath is called on it (a Nokogiri
#       document in this script).
# url - URL of the product page; written as the first column.
#
# Side effects: prints a progress line, appends one row to $products,
# flushes it and increments $write_count.
# Returns the incremented $write_count.
#
def read_product(doc, url)
  # Common prefix of all detail fields (XPath copied from the browser).
  info = '//*[@id="productinfowrap"]/div[1]/div[1]/div[2]/div'

  puts "\##{$write_count} - #{url}...\n"

  # A page without an image link must not abort the whole run, so the
  # anchor is looked up defensively and a missing one is reported as 'N/A'.
  image_anchor = doc.xpath('//*[@id="imageTarget"]/a').first
  image_href = image_anchor && image_anchor['href']
  image = if image_href.nil?
            'N/A'
          else
            check_image "http://www.volksversand.de/#{image_href}"
          end

  # Values in the same order as the CSV header row.
  line = [
    url,                                                    # URL
    doc.xpath('//*[@id="cart_quantity"]/div/*/h1').text,    # TITLE (xpath changed manually)
    doc.xpath('//*[@id="cart_quantity"]/div/*/p').text,     # BRAND/COMPANY (xpath changed manually)
    doc.xpath("#{info}/div/p[1]").text,                     # QUANTITY
    no_label(doc.xpath("#{info}/div/p[2]").text),           # ART.NR.
    no_label(doc.xpath("#{info}/div/p[3]").text),           # PZN
    no_label(doc.xpath("#{info}/div/p[4]").text),           # EAN
    price(doc.xpath("#{info}/p[1]").text),                  # PRICE 1
    price(doc.xpath("#{info}/p[2]").text),                  # PRICE 2
    strip(doc.xpath("#{info}/p[5]").text),                  # DELIVERY TIME
    strip(doc.xpath("#{info}/p[6]").text),                  # DELIVERY INFO
    image                                                   # IMAGE
  ]

  # Write the line to csv and flush so finished rows survive a later crash
  $products << line
  $products.flush

  $write_count += 1
end

# Loop through pages
#
# For every key in `pages` the listing page is downloaded, each product link
# on it is followed and the parsed product page is handed to read_product.
# Responses that are not HTTP 200 are appended to the fail file as
# "<status>@<page key or product url>" instead of aborting the run.

# Appends one "<status>@<target>" line to the fail file and flushes it.
log_failure = lambda do |status, target|
  fail_file.write "#{status}@#{target}\n"
  fail_file.flush
end

# Downloads +url+ and returns the parsed HTML document, or nil (after logging
# the failure under +label+) when the response status is not '200'.
fetch_document = lambda do |url, label|
  begin
    # URI#open is provided by open-uri; Kernel#open no longer accepts URLs
    # on Ruby 3.
    URI.parse(url).open do |response|
      status = response.status[0]
      if status == '200'
        Nokogiri::HTML(response.read)
      else
        log_failure.call status, label
        nil
      end
    end
  rescue OpenURI::HTTPError => e
    # open-uri raises for error responses; the status is kept on the
    # error's io object.
    log_failure.call e.io.status[0], label
    nil
  end
end

begin
  pages.each do |alpha|
    # Interpolation also copes with non-String page keys.
    listing = fetch_document.call(
      "http://www.volksversand.de/shop_content.php?coID=116&alpha=#{alpha}", alpha
    )
    next if listing.nil?

    # Read all products links from current page
    listing.xpath('//*[@id="content"]/div/div[*]/a').each do |link|
      # get url from product link; anchors without a target are skipped
      product_url = link['href']
      next if product_url.nil? || product_url.empty?

      # read product page and save
      product_doc = fetch_document.call(product_url, product_url)
      read_product product_doc, product_url unless product_doc.nil?

      # pause between product requests
      sleep 1
    end
  end
ensure
  # close products file pointer
  $products.close

  # close fail file pointer
  fail_file.close
end
	require 'open-uri'
	require 'nokogiri' #You need to install this gem
	require 'net/http'
	require 'pp'
	require 'csv'

	# @author Albulescu Cosmin <cosmin@albulescu.ro>
	#
	# All XPaths were copied from the Chrome browser: inspect the element,
	# right-click it and choose "Copy XPath".

	# Name of the CSV file that receives one row per scraped product.
	products_file_name = 'products.csv'

	# File used to write non 200 http responses (opened in append mode, so
	# entries from earlier runs are kept).
	fail_file = File.open('fail_list', 'a+')

	# Keys of the index pages: 'A'..'Z' followed by '1'..'9'.
	# They are all Strings because each key is concatenated into the page URL;
	# Integer keys would raise a TypeError on String#+.
	pages = ('A'..'Z').to_a + ('1'..'9').to_a

	# Delete the products csv file left over from a previous run
	File.delete products_file_name if File.exist? products_file_name

	# products csv (global because read_product writes to it)
	$products = CSV.open(products_file_name, "wb")

	# Header row; read_product pushes its values in exactly this order.
	$products << ['URL', 'TITLE', 'BRAND/COMPANY', 'QUANTITY', 'ART.NR.',
	              'PZN', 'EAN', 'PRICE 1', 'PRICE 2', 'DELIVERY TIME',
	              'DELIVERY INFO', 'IMAGE']

	# counter to know where we are (1-based, printed in the progress line)
	$write_count = 1


	# Removes leading and trailing whitespace from +text+.
	# A nil argument is handed back untouched.
	def strip(text)
	  return text if text.nil?

	  text.strip
	end

	# Pulls the price out of a "label price ..." string.
	# The text is split on whitespace and the second token is returned.
	# Returns nil when there is no second token; a nil argument is handed
	# back untouched.
	def price(text)
	  return text if text.nil?

	  tokens = text.to_s.split(' ')
	  strip tokens[1]
	end


	# Drops the "Label:" prefix from a "Label: value" string.
	# The text is cut at every colon and the second piece is returned with
	# surrounding whitespace removed. Returns nil when there is no second
	# piece; a nil argument is handed back untouched.
	def no_label(text)
	  return text if text.nil?

	  _label, value = text.to_s.split(':')
	  strip value
	end

	# Maps one specific image URL (04.jpg in the shop's popup_images folder)
	# to 'N/A'. Any other value is returned unchanged.
	def check_image(image)
	  placeholder = 'http://www.volksversand.de/images/product_images/popup_images/04.jpg'
	  image.to_s == placeholder ? 'N/A' : image
	end

	#
	# Reads the product data from a parsed product page and appends it as one
	# row to the products CSV.
	#
	# doc - parsed product page; only #xpath is called on it (a Nokogiri
	#       document in this script).
	# url - URL of the product page; written as the first column.
	#
	# Side effects: prints a progress line, appends one row to $products,
	# flushes it and increments $write_count.
	# Returns the incremented $write_count.
	#
	def read_product(doc, url)
	  # Common prefix of all detail fields (XPath copied from the browser).
	  info = '//*[@id="productinfowrap"]/div[1]/div[1]/div[2]/div'

	  puts "\##{$write_count} - #{url}...\n"

	  # A page without an image link must not abort the whole run, so the
	  # anchor is looked up defensively and a missing one is reported as 'N/A'.
	  image_anchor = doc.xpath('//*[@id="imageTarget"]/a').first
	  image_href = image_anchor && image_anchor['href']
	  image = if image_href.nil?
	            'N/A'
	          else
	            check_image "http://www.volksversand.de/#{image_href}"
	          end

	  # Values in the same order as the CSV header row. The title and brand
	  # XPaths need the `*` wildcards: `//[@id=...]` is not a valid XPath.
	  line = [
	    url,                                                    # URL
	    doc.xpath('//*[@id="cart_quantity"]/div/*/h1').text,    # TITLE (xpath changed manually)
	    doc.xpath('//*[@id="cart_quantity"]/div/*/p').text,     # BRAND/COMPANY (xpath changed manually)
	    doc.xpath("#{info}/div/p[1]").text,                     # QUANTITY
	    no_label(doc.xpath("#{info}/div/p[2]").text),           # ART.NR.
	    no_label(doc.xpath("#{info}/div/p[3]").text),           # PZN
	    no_label(doc.xpath("#{info}/div/p[4]").text),           # EAN
	    price(doc.xpath("#{info}/p[1]").text),                  # PRICE 1
	    price(doc.xpath("#{info}/p[2]").text),                  # PRICE 2
	    strip(doc.xpath("#{info}/p[5]").text),                  # DELIVERY TIME
	    strip(doc.xpath("#{info}/p[6]").text),                  # DELIVERY INFO
	    image                                                   # IMAGE
	  ]

	  # Write the line to csv and flush so finished rows survive a later crash
	  $products << line
	  $products.flush

	  $write_count += 1
	end

	# Loop through pages
	#
	# For every key in `pages` the listing page is downloaded, each product link
	# on it is followed and the parsed product page is handed to read_product.
	# Responses that are not HTTP 200 are appended to the fail file as
	# "<status>@<page key or product url>" instead of aborting the run.

	# Appends one "<status>@<target>" line to the fail file and flushes it.
	log_failure = lambda do |status, target|
	  fail_file.write "#{status}@#{target}\n"
	  fail_file.flush
	end

	# Downloads +url+ and returns the parsed HTML document, or nil (after logging
	# the failure under +label+) when the response status is not '200'.
	fetch_document = lambda do |url, label|
	  begin
	    # URI#open is provided by open-uri; Kernel#open no longer accepts URLs
	    # on Ruby 3.
	    URI.parse(url).open do |response|
	      status = response.status[0]
	      if status == '200'
	        Nokogiri::HTML(response.read)
	      else
	        log_failure.call status, label
	        nil
	      end
	    end
	  rescue OpenURI::HTTPError => e
	    # open-uri raises for error responses; the status is kept on the
	    # error's io object.
	    log_failure.call e.io.status[0], label
	    nil
	  end
	end

	begin
	  pages.each do |alpha|
	    # Interpolation also copes with non-String page keys.
	    listing = fetch_document.call(
	      "http://www.volksversand.de/shop_content.php?coID=116&alpha=#{alpha}", alpha
	    )
	    next if listing.nil?

	    # Read all products links from current page
	    # (`div[*]` keeps only div elements that have a child element).
	    listing.xpath('//*[@id="content"]/div/div[*]/a').each do |link|
	      # get url from product link; anchors without a target are skipped
	      product_url = link['href']
	      next if product_url.nil? || product_url.empty?

	      # read product page and save
	      product_doc = fetch_document.call(product_url, product_url)
	      read_product product_doc, product_url unless product_doc.nil?

	      # pause between product requests
	      sleep 1
	    end
	  end
	ensure
	  # close products file pointer
	  $products.close

	  # close fail file pointer
	  fail_file.close
	end