# rachid/calorie_cijfer.rb

## calorie_cijfer.rb
require 'open-uri'
require 'nokogiri'


  # Pool of browser User-Agent strings; CalorieCijfer#random_agent picks one
  # per request. Frozen so the shared list cannot be mutated by accident.
  USER_AGENTS = [
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.70 Safari/533.4',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2) Gecko/20100323 Namoroka/3.6.2',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.9) Gecko/20100401 Ubuntu/9.10 (karmic) Firefox/3.5.9'
  ].freeze


# Scraper for www.calorielijst.nl: collects category page URLs, product
# listings, per-product details and ingredient descriptions.
#
# Relies on names defined outside this class: the USER_AGENTS constant,
# Nokogiri, open-uri, and the TmpUrl / TmpProduct models (these look like
# ActiveRecord models -- confirm against their definitions).
class CalorieCijfer

  def initialize
    @site = "http://www.calorielijst.nl"
  end

  # Returns one User-Agent string chosen at random from USER_AGENTS.
  #
  # Every entry is eligible; indexing with `rand(size - 1)` could never
  # select the last one.
  def random_agent
    USER_AGENTS.sample
  end

  # Fetches the page at +url+ (a path appended to the site root) and returns
  # the href of every element matched by the pagination selector.
  #
  # Matched elements without an href are dropped. Always returns an Array:
  # `compact` is used instead of `compact!`, because the latter returns nil
  # when there was nothing to remove.
  def page_numbers(url)
    fetch_document(@site + url, agent_header)
      .search('.paging a , .thispage')
      .map { |node| node['href'] }
      .compact
  end

  # Walks the category links on the home page and stores, via TmpUrl.create,
  # every pagination URL that does not contain "?p=".
  def categorielijst
    fetch_document(@site, agent_header).search('#ll a , #llsel').each do |categorie|
      href = categorie['href']
      # A matched element without an href cannot be followed.
      next if href.nil?

      page_numbers(href).each do |url|
        TmpUrl.create(:url => url) unless url.include?("?p=")
      end
    end
  end

  # For each TmpProduct whose `done` column is NULL, fetches its detail page,
  # stores the barcode and ingredients text, and marks the record as done.
  # Missing page elements are stored as empty strings.
  def product_detail
    TmpProduct.where("done IS NULL").each do |product|
      doc = fetch_document(@site + '/product/?calorie=' + product.external_id, agent_header)
      barcode_node = doc.at_css("#bcTarget")

      product.update_attributes(
        :barcode => barcode_node ? barcode_node['title'].to_s : "",
        :ingredients => ingredients_text_after(doc.search("#m_ingred").first),
        :done => 1
      )
      puts product.inspect
    end
  end

  # Visits every stored TmpUrl page, saves a TmpProduct for each named
  # product on it, and prints the element index plus the saved attributes.
  def productlijst_import
    TmpUrl.all.each do |page|
      listing = fetch_document(@site + page.url, agent_header).search('.calorielijstdiv')
      listing.each_with_index do |product, index|
        next unless named_product?(product)

        attributes = product_attributes(product)
        TmpProduct.new(attributes).save
        puts "#{index} #{attributes}"
      end
    end
  end

  # Returns one entry per '.calorielijstdiv' element on the home page: an
  # attribute Hash for named products, nil for elements without a name.
  def productlijst
    fetch_document(@site, agent_header).search('.calorielijstdiv').map do |product|
      product_attributes(product) if named_product?(product)
    end
  end

  # Fetches item[:link] and returns one entry per <p> element: a Hash with
  # :summary, :name and :description when the paragraph starts with a link
  # and contains a parenthesised part, nil otherwise.
  #
  # Empty paragraphs are skipped instead of raising on a missing first child.
  def ingredients(item)
    puts item[:link]
    fetch_document(item[:link]).search('p').map do |paragraph|
      first_child = paragraph.children.first
      href = first_child && first_child['href']
      summary = paragraph.content.scan(/\(+.+\)/).first
      next unless href && summary

      {
        :summary => summary.gsub(/[\(|\)]/, ''),
        # to_s: the name pattern needs text before "(", which may be absent.
        :name => paragraph.content[/^.+[\(]/].to_s.gsub(/[\(|\)]/, '').strip,
        :description => get_ingredient_detail("#{@site}#{href}").first
      }
    end
  end

  # Returns the text content of every '.attentionbottom' element at +url+.
  def get_ingredient_detail(url)
    fetch_document(url).search('.attentionbottom').map { |node| node.content }
  end

  private

  # Request headers carrying a randomly chosen User-Agent.
  def agent_header
    { "User-Agent" => random_agent }
  end

  # Downloads +url+ and parses it with Nokogiri. The parsed document is
  # returned and also kept in @doc.
  #
  # Goes through URI#open (provided by open-uri) rather than a bare
  # Kernel#open: Kernel#open no longer fetches URLs on Ruby 3, and it would
  # treat a non-URL string as a file path or a "|command" pipe. The block
  # form closes the download handle once parsing is finished.
  def fetch_document(url, headers = {})
    @doc = URI.parse(url).open(headers) { |io| Nokogiri::HTML(io) }
  end

  # True when the product element has non-empty link text in its name cell.
  def named_product?(product)
    product.search('.calorienamediv a').text.size > 0
  end

  # Builds the attribute Hash for one '.calorielijstdiv' element.
  # :brand is "" without a parenthesised part in the name; :image_url is
  # just the site root when the link has no onmouseover attribute.
  def product_attributes(product)
    bracketed_part = /\(+[\w\s\S]+\)/
    links = product.search('.calorienamediv a')
    full_name = links.text
    bracketed = full_name.to_s.scan(bracketed_part).first
    tooltip = links.first['onmouseover']
    image_path = tooltip.nil? ? "" : tooltip.sub("Tip('<img src=\\'", "").sub("\\' />')", "")

    {
      :full_name => full_name,
      :name => full_name.to_s.gsub(bracketed_part, '').strip,
      :brand => bracketed.nil? ? "" : bracketed.gsub(/[\(|\)]/, ''),
      :external_id => links.first['href'].gsub(/\/+[\w\s\S]+\=/, ''),
      :group_name => product.search('.valuesLdiv a').first['title'].sub('Bekijk producten uit de productgroep', '').strip,
      :image_url => "#{@site}#{image_path}"
    }
  end

  # Text of the node two siblings after +heading+, or "" when the heading or
  # either following sibling is missing.
  def ingredients_text_after(heading)
    return "" if heading.nil?

    sibling = heading.next
    sibling = sibling.next if sibling
    sibling ? sibling.text.to_s : ""
  end

end
	require 'open-uri'
	require 'nokogiri'


	# Pool of browser User-Agent strings; CalorieCijfer#random_agent picks one
	# per request. Only assigned when the constant is not already defined, so
	# an earlier definition is not reassigned. Frozen so the shared list cannot
	# be mutated by accident.
	unless defined?(USER_AGENTS)
	  USER_AGENTS = [
	    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; en-US; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3',
	    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 2.0.50727)',
	    'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.3) Gecko/20100423 Ubuntu/10.04 (lucid) Firefox/3.6.3',
	    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.70 Safari/533.4',
	    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2) Gecko/20100323 Namoroka/3.6.2',
	    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.9) Gecko/20100401 Ubuntu/9.10 (karmic) Firefox/3.5.9'
	  ].freeze
	end


	# Scraper for www.calorielijst.nl: collects category page URLs, product
	# listings, per-product details and ingredient descriptions.
	#
	# Relies on names defined outside this class: the USER_AGENTS constant,
	# Nokogiri, open-uri, and the TmpUrl / TmpProduct models (these look like
	# ActiveRecord models -- confirm against their definitions).
	class CalorieCijfer

	  def initialize
	    @site = "http://www.calorielijst.nl"
	  end

	  # Returns one User-Agent string chosen at random from USER_AGENTS.
	  #
	  # Every entry is eligible; indexing with `rand(size - 1)` could never
	  # select the last one.
	  def random_agent
	    USER_AGENTS.sample
	  end

	  # Fetches the page at +url+ (a path appended to the site root) and returns
	  # the href of every element matched by the pagination selector.
	  #
	  # Matched elements without an href are dropped. Always returns an Array:
	  # `compact` is used instead of `compact!`, because the latter returns nil
	  # when there was nothing to remove.
	  def page_numbers(url)
	    fetch_document(@site + url, agent_header)
	      .search('.paging a , .thispage')
	      .map { |node| node['href'] }
	      .compact
	  end

	  # Walks the category links on the home page and stores, via TmpUrl.create,
	  # every pagination URL that does not contain "?p=".
	  def categorielijst
	    fetch_document(@site, agent_header).search('#ll a , #llsel').each do |categorie|
	      href = categorie['href']
	      # A matched element without an href cannot be followed.
	      next if href.nil?

	      page_numbers(href).each do |url|
	        TmpUrl.create(:url => url) unless url.include?("?p=")
	      end
	    end
	  end

	  # For each TmpProduct whose `done` column is NULL, fetches its detail page,
	  # stores the barcode and ingredients text, and marks the record as done.
	  # Missing page elements are stored as empty strings.
	  def product_detail
	    TmpProduct.where("done IS NULL").each do |product|
	      doc = fetch_document(@site + '/product/?calorie=' + product.external_id, agent_header)
	      barcode_node = doc.at_css("#bcTarget")

	      product.update_attributes(
	        :barcode => barcode_node ? barcode_node['title'].to_s : "",
	        :ingredients => ingredients_text_after(doc.search("#m_ingred").first),
	        :done => 1
	      )
	      puts product.inspect
	    end
	  end

	  # Visits every stored TmpUrl page, saves a TmpProduct for each named
	  # product on it, and prints the element index plus the saved attributes.
	  def productlijst_import
	    TmpUrl.all.each do |page|
	      listing = fetch_document(@site + page.url, agent_header).search('.calorielijstdiv')
	      listing.each_with_index do |product, index|
	        next unless named_product?(product)

	        attributes = product_attributes(product)
	        TmpProduct.new(attributes).save
	        puts "#{index} #{attributes}"
	      end
	    end
	  end

	  # Returns one entry per '.calorielijstdiv' element on the home page: an
	  # attribute Hash for named products, nil for elements without a name.
	  def productlijst
	    fetch_document(@site, agent_header).search('.calorielijstdiv').map do |product|
	      product_attributes(product) if named_product?(product)
	    end
	  end

	  # Fetches item[:link] and returns one entry per <p> element: a Hash with
	  # :summary, :name and :description when the paragraph starts with a link
	  # and contains a parenthesised part, nil otherwise.
	  #
	  # Empty paragraphs are skipped instead of raising on a missing first child.
	  def ingredients(item)
	    puts item[:link]
	    fetch_document(item[:link]).search('p').map do |paragraph|
	      first_child = paragraph.children.first
	      href = first_child && first_child['href']
	      summary = paragraph.content.scan(/\(+.+\)/).first
	      next unless href && summary

	      {
	        :summary => summary.gsub(/[\(|\)]/, ''),
	        # to_s: the name pattern needs text before "(", which may be absent.
	        :name => paragraph.content[/^.+[\(]/].to_s.gsub(/[\(|\)]/, '').strip,
	        :description => get_ingredient_detail("#{@site}#{href}").first
	      }
	    end
	  end

	  # Returns the text content of every '.attentionbottom' element at +url+.
	  def get_ingredient_detail(url)
	    fetch_document(url).search('.attentionbottom').map { |node| node.content }
	  end

	  private

	  # Request headers carrying a randomly chosen User-Agent.
	  def agent_header
	    { "User-Agent" => random_agent }
	  end

	  # Downloads +url+ and parses it with Nokogiri. The parsed document is
	  # returned and also kept in @doc.
	  #
	  # Goes through URI#open (provided by open-uri) rather than a bare
	  # Kernel#open: Kernel#open no longer fetches URLs on Ruby 3, and it would
	  # treat a non-URL string as a file path or a "|command" pipe. The block
	  # form closes the download handle once parsing is finished.
	  def fetch_document(url, headers = {})
	    @doc = URI.parse(url).open(headers) { |io| Nokogiri::HTML(io) }
	  end

	  # True when the product element has non-empty link text in its name cell.
	  def named_product?(product)
	    product.search('.calorienamediv a').text.size > 0
	  end

	  # Builds the attribute Hash for one '.calorielijstdiv' element.
	  # :brand is "" without a parenthesised part in the name; :image_url is
	  # just the site root when the link has no onmouseover attribute.
	  def product_attributes(product)
	    bracketed_part = /\(+[\w\s\S]+\)/
	    links = product.search('.calorienamediv a')
	    full_name = links.text
	    bracketed = full_name.to_s.scan(bracketed_part).first
	    tooltip = links.first['onmouseover']
	    image_path = tooltip.nil? ? "" : tooltip.sub("Tip('<img src=\\'", "").sub("\\' />')", "")

	    {
	      :full_name => full_name,
	      :name => full_name.to_s.gsub(bracketed_part, '').strip,
	      :brand => bracketed.nil? ? "" : bracketed.gsub(/[\(|\)]/, ''),
	      :external_id => links.first['href'].gsub(/\/+[\w\s\S]+\=/, ''),
	      :group_name => product.search('.valuesLdiv a').first['title'].sub('Bekijk producten uit de productgroep', '').strip,
	      :image_url => "#{@site}#{image_path}"
	    }
	  end

	  # Text of the node two siblings after +heading+, or "" when the heading or
	  # either following sibling is missing.
	  def ingredients_text_after(heading)
	    return "" if heading.nil?

	    sibling = heading.next
	    sibling = sibling.next if sibling
	    sibling ? sibling.text.to_s : ""
	  end

	end