Created
December 13, 2011 08:50
-
-
Save rachid/1471273 to your computer and use it in GitHub Desktop.
Product Scraper for caloriecijfer.nl
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri'
require 'nokogiri'
# Pool of browser User-Agent strings; the scraper picks one at random for
# each request it sends. Frozen so the shared list cannot be mutated at runtime.
USER_AGENTS = [
  'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3',
  'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.70 Safari/533.4',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2) Gecko/20100323 Namoroka/3.6.2',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.9) Gecko/20100401 Ubuntu/9.10 (karmic) Firefox/3.5.9'
].freeze
# Scraper for the product catalogue on calorielijst.nl.
#
# Pages are downloaded with open-uri and parsed with Nokogiri. Category URLs
# and products are persisted through the TmpUrl and TmpProduct models, which
# are not defined in this section of the file.
class CalorieCijfer
  # Root URL that every relative path is resolved against.
  BASE_URL = 'http://www.calorielijst.nl'.freeze

  # Matches the parenthesised part of a product name, e.g. "(Pepsi)".
  BRAND_PATTERN = /\(+[\w\s\S]+\)/

  # Characters stripped from a matched brand. Besides "(" and ")" this
  # character class also matches a literal "|" (kept as in the original).
  BRAND_DELIMITERS = /[\(|\)]/

  def initialize
    @site = BASE_URL
  end

  # @return [String] a user agent chosen uniformly from USER_AGENTS
  def random_agent
    # Array#sample can return every element; the previous
    # rand(size - 1) index could never reach the last one.
    USER_AGENTS.sample
  end

  # Collects the pagination links of a category page.
  #
  # @param url [String] path of the category page, relative to the site root
  # @return [Array<String>] href values of the paging elements; elements
  #   without an href are dropped
  def page_numbers(url)
    @doc = fetch_document(url)
    # compact (not compact!) so an Array is returned even when no nil was removed.
    @doc.search('.paging a , .thispage').map { |link| link['href'] }.compact
  end

  # Walks every category listed on the front page and stores each paging URL
  # that does not contain "?p=" as a TmpUrl record.
  def categorielijst
    @doc = fetch_document('')
    @doc.search('#ll a , #llsel').each do |categorie|
      href = categorie['href']
      # An element without an href cannot be fetched; skip it instead of
      # raising a TypeError while building the URL.
      next if href.nil?

      page_numbers(href).each do |url|
        TmpUrl.create(:url => url) unless url.include?('?p=')
      end
    end
  end

  # Fetches the detail page of every TmpProduct whose `done` column is NULL,
  # stores its barcode and ingredients text and marks the record as done.
  # Missing page elements result in empty strings.
  def product_detail
    TmpProduct.where("done IS NULL").each do |product|
      @doc = fetch_document("/product/?calorie=#{product.external_id}")
      barcode_node = @doc.at_css('#bcTarget')
      ingredients_heading = @doc.search('#m_ingred').first

      update = {
        :barcode => (barcode_node && barcode_node['title']).to_s,
        # The ingredients text lives two siblings after the #m_ingred element.
        :ingredients => ingredients_heading&.next&.next&.text.to_s,
        :done => 1
      }
      # NOTE(review): if TmpProduct is an ActiveRecord model, update_attributes
      # no longer exists on Rails 6.1+ (use update) - confirm before upgrading.
      product.update_attributes(update)
      puts product.inspect
    end
  end

  # Visits every stored TmpUrl page, saves each named product found there as a
  # TmpProduct and prints its position on the page together with its attributes.
  def productlijst_import
    TmpUrl.all.each do |page|
      @doc = fetch_document(page.url)
      @doc.search('.calorielijstdiv').each_with_index do |product, index|
        attributes = product_attributes(product)
        next unless attributes

        TmpProduct.new(attributes).save
        puts "#{index} #{attributes}"
      end
    end
  end

  # Scrapes the products shown on the front page without persisting them.
  #
  # @return [Array<Hash, nil>] one attribute hash per product element, nil for
  #   elements that have no product name
  def productlijst
    @doc = fetch_document('')
    @doc.search('.calorielijstdiv').map { |product| product_attributes(product) }
  end

  # Scrapes the ingredient paragraphs of the page at item[:link].
  #
  # @param item [Hash] must contain the absolute page URL under :link
  # @return [Array<Hash, nil>] one entry per <p>; a hash with :summary, :name
  #   and :description when the paragraph starts with a linked element and
  #   contains a parenthesised summary, nil otherwise
  def ingredients(item)
    puts item[:link]
    @doc = fetch_url(item[:link])
    @doc.search('p').map do |paragraph|
      first_child = paragraph.children.first
      # Empty paragraphs have no first child; treat them as "no link".
      link = first_child && first_child['href']
      text = paragraph.content
      summary = text[/\(+.+\)/]
      next unless link && summary

      {
        :summary => summary.gsub(/[\(|\)]/, ''),
        :name => text[/^.+[\(]/].to_s.gsub(/[\(|\)]/, '').strip,
        :description => get_ingredient_detail("#{@site}#{link}").first
      }
    end
  end

  # @param url [String] absolute URL of an ingredient detail page
  # @return [Array<String>] text content of every `.attentionbottom` element
  def get_ingredient_detail(url)
    @doc = fetch_url(url)
    @doc.search('.attentionbottom').map(&:content)
  end

  private

  # Downloads a site-relative path with a random User-Agent and parses it.
  def fetch_document(path)
    fetch_url("#{BASE_URL}#{path}", 'User-Agent' => random_agent)
  end

  # Downloads an absolute URL and parses it as HTML. URI.open is used because
  # Kernel#open no longer accepts URLs on Ruby 3.0+; the block form closes the
  # underlying IO once Nokogiri has read it.
  def fetch_url(url, headers = {})
    URI.open(url, headers) { |io| Nokogiri::HTML(io) }
  end

  # Builds the attribute hash for one `.calorielijstdiv` element.
  # Returns nil when the element carries no product name.
  def product_attributes(product)
    name_links = product.search('.calorienamediv a')
    full_name = name_links.text
    return nil if full_name.empty?

    link = name_links.first
    tooltip = link['onmouseover']
    # The image path is embedded in the tooltip markup of the link; without a
    # tooltip the image URL is just the site root, as before.
    image_path = tooltip && tooltip.sub("Tip('<img src=\\'", '').sub("\\' />')", '')

    {
      :full_name => full_name,
      :name => full_name.gsub(BRAND_PATTERN, '').strip,
      :brand => full_name[BRAND_PATTERN].to_s.gsub(BRAND_DELIMITERS, ''),
      :external_id => link['href'].gsub(/\/+[\w\s\S]+\=/, ''),
      :group_name => product.search('.valuesLdiv a').first['title'].sub('Bekijk producten uit de productgroep', '').strip,
      :image_url => "#{BASE_URL}#{image_path}"
    }
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment