Skip to content

Instantly share code, notes, and snippets.

@evenv
Created July 2, 2012 20:15
Show Gist options
  • Save evenv/3035416 to your computer and use it in GitHub Desktop.
Save evenv/3035416 to your computer and use it in GitHub Desktop.
Scrape Vinmonopolet web site
require 'rubygems'
require 'hpricot'
require 'open-uri'
require 'sqlite3'
require 'iconv'
require 'HTMLEntities.rb'
# Text-cleaning helpers added to String for the scraped HTML fragments below.
class String
  # Returns a copy of the receiver transcoded from UTF-8 to ISO-8859-1.
  #
  # Uses String#encode rather than Iconv: the Iconv library was removed from
  # Ruby's standard library in 2.0, so the previous implementation raised
  # NameError on any modern interpreter.
  #
  # Raises Encoding::UndefinedConversionError when a character has no
  # ISO-8859-1 equivalent.
  def to_iso
    encode('ISO-8859-1', 'UTF-8')
  end

  # Normalises a scraped HTML text fragment: decodes HTML entities, turns
  # non-breaking spaces into ordinary spaces and trims surrounding whitespace.
  #
  # Entities are decoded FIRST so that non-breaking spaces written as
  # "&nbsp;" / "&#160;" as well as raw U+00A0 characters are all normalised;
  # String#strip on its own does not remove U+00A0.
  #
  # Relies on String#decode_entities, which is provided by the HTMLEntities
  # library required at the top of the file.
  # NOTE(review): assumes the decoded text is UTF-8 compatible - confirm for
  # pages served in another encoding.
  def fixx
    decode_entities.tr("\u00A0", ' ').strip
  end

  # Returns the receiver lower-cased, with the first character and every
  # character that directly follows a single space upper-cased
  # ("CHATEAU de la TOUR" -> "Chateau De La Tour").
  def camelcase
    downcase.capitalize.gsub(/ (.)/) { " #{Regexp.last_match(1).upcase}" }
  end
end
# Scraper entry point.
#
# Starting from the list page given as the first command-line argument
# (joined onto `base`), walks the paginated product list, fetches the detail
# page of every product that is not yet stored, and inserts one row per
# product into the `vino` table of vino.db. Does nothing when no argument is
# given. Prints each list URL, "XX <polnumber>" for products already stored
# and "-- <polnumber>" for products about to be fetched.
db = SQLite3::Database.new('vino.db')
base = "http://www.vinmonopolet.no/is-bin/INTERSHOP.enfinity/WFS/store-vmp-Site/no_NO/-/NOK/"

# Substring of a detail-page label (<strong>) => column that receives the
# text of the <span> in the same list item.
detail_fields = {
  "distrikt"  => :region,
  "stoff"     => :varietals,
  "Produsent" => :winery,
  "Varetype"  => :type
}

nexturl = ARGV[0]
while nexturl
  puts nexturl
  # URI#open (open-uri) is used instead of Kernel#open: Kernel#open would run
  # a command for strings starting with "|" and no longer opens URLs on Ruby 3.
  # NOTE(review): assumes every nexturl (including pagination hrefs) is
  # relative to `base` - confirm against the live markup.
  doc = Hpricot(URI.parse([base, nexturl].join).open)

  doc.search("#productList tr").each do |row|
    next if row.at("th") # skip rows that contain a header cell

    wine = {}
    wine[:polnumber] = row.at("td.id > p").inner_html.match(/(([0-9]+))/)[1]
    wine[:volume] = row.at("td.price em").inner_html.fixx

    # Bound parameters instead of string interpolation: the values come from
    # scraped HTML and must not be able to alter the SQL statement.
    known = db.get_first_value(
      "SELECT 1 FROM vino WHERE polnumber = ? AND volume = ?;",
      wine[:polnumber], wine[:volume]
    )
    if known
      puts "XX %s" % wine[:polnumber]
      next
    end
    puts "-- %s" % wine[:polnumber]

    title_link = row.at("h3 a")
    wine[:link] = title_link.attributes['href']
    wine[:title] = title_link.inner_html.fixx

    # Fetch the product detail page for price and descriptive fields.
    wdoc = Hpricot(URI.parse(wine[:link].to_s).open)
    wine[:price] = wdoc.at("td.price strong").inner_html.gsub("Kr. ", "").fixx

    wdoc.search(".productData li").each do |data|
      label = data.at("strong")
      value = data.at("span")
      next unless label && value # ignore list items without a label/value pair

      heading = label.inner_html
      detail_fields.each do |needle, column|
        wine[column] = value.inner_html.fixx if heading.include?(needle)
      end
    end

    # The vintage is taken to be the last four-digit run in the title, if any.
    wine[:vintage] = wine[:title].scan(/[0-9]{4}/).last

    db.execute("INSERT INTO vino VALUES (:title,:volume,:price,:region,:varietals,:winery,:polnumber,:type,:vintage,:link);", wine)
  end

  # Follow the "Neste" (next) link when present; otherwise stop. Previously a
  # missing pagination link raised inside a `rescue ... next`, which restarted
  # the loop with the same URL and never terminated.
  lastlink = doc.at("table.pages tr > td:last a:last")
  nexturl =
    if lastlink && lastlink.inner_html.include?("Neste")
      lastlink.attributes['href']
    end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment