Skip to content

Instantly share code, notes, and snippets.

/xtrafoo.rb Secret

Created November 6, 2014 02:30
Show Gist options
  • Save anonymous/3f0cfda8586fab859d09 to your computer and use it in GitHub Desktop.
Save anonymous/3f0cfda8586fab859d09 to your computer and use it in GitHub Desktop.
require 'nokogiri'
require 'open-uri'
require 'mechanize'
# Scrapes Tristar product pages and prints one semicolon-separated record per
# page to stdout.
#
# Usage: ruby xtrafoo.rb [URL ...]
# With no arguments the built-in DEFAULT_URL is scraped.

SITE_ROOT = 'http://www.tristar.eu'.freeze
DEFAULT_URL = "http://www.tristar.eu/pt/Electronica/Colunas_de_som/Speakers_Bluetooth/SK-1512/3/6193".freeze

# Returns the stripped text of every node matched by +xpath+ in +doc+.
#
# @param doc [Nokogiri::HTML::Document] parsed product page
# @param xpath [String] XPath expression to evaluate
# @return [Array<String>] one stripped text per matching node (may be empty)
def texts_at(doc, xpath)
  doc.xpath(xpath).map { |node| node.text.strip }
end

# Downloads and parses one product page and builds its output record.
#
# @param url [String] absolute URL of the product page
# @param agent [Mechanize] agent used to request each image link
# @return [String] the fields joined with ';'
# @raise [NoMethodError] if the page has no '#_c3890_uxHeaderBox' element
def product_record(url, agent)
  # Kernel#open no longer accepts URLs on Ruby 3.0+, so go through the URI
  # object that open-uri extends. The block form also closes the downloaded
  # body once Nokogiri has parsed it.
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

  # The header text is split by position: the first 7 characters are used as
  # the product code and the remainder as the product name.
  header = doc.at_css('#_c3890_uxHeaderBox').text.strip
  nomeprod = header[7..-1]
  codprod = header[0..6]

  desc = texts_at(doc, ' //*[(@id = "Gegevens")]//p ')
  specs = texts_at(doc, ' //*[(@id = "Gegevens")]//li')
  # Only the first matching cell is used for the weight (nil when none match).
  peso = texts_at(doc, ' //tr[(((count(preceding-sibling::*) + 1) = 5) and parent::*)]//td[(((count(preceding-sibling::*) + 1) = 4) and parent::*)]').first
  ean = texts_at(doc, ' //tr[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]//td[(((count(preceding-sibling::*) + 1) = 3) and parent::*)]')

  # Short description: the inspected specs array with every '",' replaced by
  # a literal backslash-n followed by a bullet. The surrounding brackets and
  # quotes of the inspect form are left in place.
  descshort = specs.to_s.gsub(/",/, '\\n•')

  # Categories. [2..-3] drops the leading '["' and trailing '"]' of the
  # inspected array.
  catmain = texts_at(doc, ' //*[(@id = "AlgNavigatie")]//h2 ').to_s[2..-3]
  # Sub-category: inspected list of "active" element texts with the main
  # category text blanked out and all double quotes removed.
  catsub = texts_at(doc, ' //*[contains(concat( " ", @class, " " ), concat( " ", "active", " " ))] ')
           .to_s.gsub(catmain, ' ').delete('"')

  # Each image link is requested through Mechanize and the URI of the page it
  # returns is recorded.
  image_urls = doc.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "image", " " ))]//a ').map do |link|
    agent.get("#{SITE_ROOT}#{link['href']}").uri
  end

  # NOTE(review): desc is an Array, and Array#join joins nested arrays with the
  # same separator, so every description paragraph becomes its own ';' field.
  # Kept as-is to preserve the existing output format - confirm it is intended.
  [
    1, 1000, 'Tristar', nomeprod, codprod, desc, descshort, peso,
    ean.to_s[2..-3], image_urls.join(', '), catmain, catsub[2..-5]
  ].join(';')
end

urls = ARGV.empty? ? [DEFAULT_URL] : ARGV
mechanize = Mechanize.new
urls.each { |url| puts product_record(url, mechanize) }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment