Skip to content

Instantly share code, notes, and snippets.

@rachid
Created December 13, 2011 08:50
Show Gist options
  • Save rachid/1471273 to your computer and use it in GitHub Desktop.
Save rachid/1471273 to your computer and use it in GitHub Desktop.
Product Scraper for caloriecijfer.nl
require 'open-uri'
require 'nokogiri'
# Browser User-Agent header values; CalorieCijfer#random_agent picks one of
# these per request. Frozen so the shared constant cannot be mutated at runtime.
USER_AGENTS = [
  'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3',
  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)',
  'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3',
  'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.70 Safari/533.4',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2) Gecko/20100323 Namoroka/3.6.2',
  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.9) Gecko/20100401 Ubuntu/9.10 (karmic) Firefox/3.5.9'
].freeze
# Scraper for product data published on http://www.calorielijst.nl.
#
# Pages are downloaded with open-uri and parsed with Nokogiri. Results are
# persisted through the TmpUrl and TmpProduct models, which are defined outside
# this file (they are used like ActiveRecord models -- confirm against the
# application).
class CalorieCijfer
  def initialize
    @site = "http://www.calorielijst.nl"
  end

  # Returns one entry of USER_AGENTS chosen at random. Every entry, including
  # the last one, can be selected.
  def random_agent
    USER_AGENTS.sample
  end

  # Collects the pagination links found on a category page.
  #
  # url - path (starting with "/") that is appended to the site root.
  #
  # Returns an Array with the non-nil "href" values of the elements matching
  # ".paging a , .thispage". The Array is empty (never nil) when nothing matches.
  def page_numbers(url)
    doc = fetch_document(@site + url, browser_headers)
    # compact (not compact!): the bang variant returns nil when no nil entry
    # was removed, which would break callers iterating over the result.
    doc.search('.paging a , .thispage').map { |categorie| categorie['href'] }.compact
  end

  # Walks the category links on the home page and stores, for each category,
  # every URL returned by #page_numbers that does not contain "?p=" as a TmpUrl.
  def categorielijst
    doc = fetch_document(@site, browser_headers)
    doc.search('#ll a , #llsel').each do |categorie|
      href = categorie['href']
      # "#llsel" may match an element without an href (assumption -- confirm
      # against the live markup); there is nothing to fetch for it.
      next if href.nil?

      page_numbers(href).each do |url|
        TmpUrl.create(:url => url) unless url.include?("?p=")
      end
    end
  end

  # Fetches the detail page of every TmpProduct whose "done" column is NULL,
  # stores its barcode and ingredients text, and marks the record as done.
  # Missing elements are stored as empty strings. Prints each updated record.
  def product_detail
    TmpProduct.where("done IS NULL").each do |product|
      doc = fetch_document(@site + '/product/?calorie=' + product.external_id, browser_headers)
      barcode = doc.at_css("#bcTarget")
      ingredients_heading = doc.search("#m_ingred").first
      update = {
        :barcode => barcode ? barcode['title'].to_s : "",
        # The text is read from the second sibling after "#m_ingred"; this
        # relies on the page layout and raises if those siblings are missing.
        :ingredients => ingredients_heading ? ingredients_heading.next.next.text.to_s : "",
        :done => 1
      }
      product.update_attributes(update)
      puts product.inspect
    end
  end

  # Visits every stored TmpUrl, saves one TmpProduct per named product found
  # on the page, and prints the row index together with the saved attributes.
  def productlijst_import
    TmpUrl.all.each do |page|
      doc = fetch_document(@site + page.url, browser_headers)
      doc.search('.calorielijstdiv').each_with_index do |product, index|
        attributes = product_attributes(product)
        next if attributes.nil?

        TmpProduct.new(attributes).save
        puts "#{index} #{attributes}"
      end
    end
  end

  # Scrapes the products listed on the home page without persisting anything.
  #
  # Returns an Array with one entry per ".calorielijstdiv" element: a Hash of
  # product attributes (see #product_attributes), or nil when the element has
  # no product name.
  def productlijst
    doc = fetch_document(@site, browser_headers)
    doc.search('.calorielijstdiv').map { |product| product_attributes(product) }
  end

  # Scrapes the ingredient paragraphs of the page at item[:link].
  #
  # item - Hash-like object whose :link entry is an absolute URL. The URL is
  #        printed before it is fetched. Only URLs that open-uri can fetch are
  #        accepted (see #fetch_document).
  #
  # Returns an Array with one entry per <p> element: a Hash with :summary,
  # :name and :description, or nil for paragraphs that do not start with a
  # linked element or do not contain a "name (summary)" text.
  def ingredients(item)
    puts item[:link]
    fetch_document(item[:link]).search('p').map do |paragraph|
      first_child = paragraph.children.first
      next if first_child.nil? # empty <p>

      href = first_child['href']
      text = paragraph.content
      summary = text[/\(+.+\)/]
      name = text[/^.+[\(]/]
      next unless href && summary && name

      {
        :summary => summary.gsub(/[\(|\)]/, ''),
        :name => name.gsub(/[\(|\)]/, '').strip,
        :description => get_ingredient_detail("#{@site}#{href}").first
      }
    end
  end

  # Returns an Array with the text content of every ".attentionbottom" element
  # on the page at +url+.
  def get_ingredient_detail(url)
    fetch_document(url).search('.attentionbottom').map(&:content)
  end

  private

  # Request headers that present the scraper as a randomly chosen browser.
  def browser_headers
    { "User-Agent" => random_agent }
  end

  # Downloads +url+ and parses the response into a Nokogiri HTML document.
  #
  # The request goes through the URI object's own #open (provided by open-uri)
  # rather than Kernel#open, so it keeps working on Ruby 3+ and a crafted
  # string can never be interpreted as a local file or a "|command" pipe.
  # The block form closes the response IO once parsing is done.
  #
  # Raises ArgumentError when +url+ is not a scheme open-uri can fetch.
  def fetch_document(url, headers = {})
    uri = URI.parse(url)
    raise ArgumentError, "not a fetchable URL: #{url.inspect}" unless uri.respond_to?(:open)

    uri.open(headers) { |io| Nokogiri::HTML(io) }
  end

  # Builds the attribute Hash for one ".calorielijstdiv" product node.
  #
  # The link text has the form "Name (Brand)": :name is the text with the
  # parenthesised part removed, :brand is the parenthesised part without the
  # parentheses ("" when absent). :image_url is the site root followed by the
  # image path embedded in the link's onmouseover tooltip (just the site root
  # when there is no tooltip).
  #
  # Returns nil when the node has no product name.
  def product_attributes(product)
    links = product.search('.calorienamediv a')
    full_name = links.text
    return nil if full_name.empty?

    link = links.first
    brand = full_name[/\(+[\w\s\S]+\)/]
    tooltip = link['onmouseover']
    image_path = tooltip ? tooltip.sub("Tip('<img src=\\'", "").sub("\\' />')", "") : ""

    {
      :full_name => full_name,
      :name => full_name.gsub(/\(+[\w\s\S]+\)/, '').strip,
      :brand => brand ? brand.gsub(/[\(|\)]/, '') : "",
      :external_id => link['href'].gsub(/\/+[\w\s\S]+\=/, ''),
      :group_name => product.search('.valuesLdiv a').first['title'].sub('Bekijk producten uit de productgroep', '').strip,
      :image_url => "#{@site}#{image_path}"
    }
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment