danielharan (owner)

Revisions

gist: 86333 Download_button fork
public
Public Clone URL: git://gist.github.com/86333.git
Embed All Files: show embed
extract data #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
require 'hpricot'
 
namespace :import do
  
  def x(path)
    (f = (@doc/('/html/body/div/div[4]/div'+path)).first) ? f.inner_html : ''
  end
  
  def __left_value_for_right_matching(exp)
    if (f = (@doc/"/html/body/div/div[4]/div/table[2]/tbody/tr/td[1]").detect {|e| e.inner_html =~ exp})
      (f.parent / 'td[2]').first.inner_html
    else
      nil
    end
  end
  
  def alcohol
    v = __left_value_for_right_matching(/alcool/)
    v ? v.to_f : nil
  end
  
  def producer
    __left_value_for_right_matching(/Fournisseur/)
  end
  
  def upc
    if (_upc = x("/table/tr/td[2]/p/strong[2]")).empty?
      nil
    else
      _upc.gsub(" ",'').match(/\d+/)[0]
    end
  end
  
  desc 'scrape the downloaded saq product pages'
  task :saq => :environment do
    Dir.glob("#{RAILS_ROOT}/saq/pages/*").each do |f|
      begin
        @doc = Hpricot(open(f))
        saq = x("/table/tr/td[2]/p/strong[1]")
        if saq == ''
          puts "NO SAQ CODE on: #{f}"
          next
        end
        saq = saq.match(/\d+/)[0]
        next unless Wine.find_by_saq_code(saq).nil?
        
        Wine.create :upc => upc,
                    :title => (@doc/"h2").first.inner_html, :saq_code => saq,
                    :colour => x("/table/tr/td[2]/table/tbody/tr[2]/td[2]"), :volume => x('/table/tr/td[2]/table/tbody/tr[4]/td[2]'),
                    :price => x("/table/tr/td[3]/p/span").gsub(",","").to_i / 100.0,
                    :country => x("/table[2]/tbody/tr/td[2]"), :alcohol => alcohol, :producer => producer
        
      rescue Exception => e
        puts "unable to process: #{f}" #exception doesn't help much
        #raise e
      end
    end
  end
end
Indexes #
1
2
3
4
5
6
7
8
9
10
11
12
# Script written a year ago to get SAQ wines...
#
# first, get all search pages
# Builds the SAQ catalogue search-result URL whose beginIndex parameter is +i+;
# every other query parameter is fixed (pageSize is 100).
def url(i)
  before_index = 'http://www.saq.com/webapp/wcs/stores/servlet/CatalogSearchResultView?storeId=10001&langId=-2&catalogId=10001&searchTerm=&resultCatEntryType=2&beginIndex='
  after_index = '&tri=RechercheUCIProdDescAttributeInfo&sensTri=AscOperator&searchType=400&codeReseau=&categoryId=11748&viewTaskName=SAQCatalogSearchResultView&catalogVenteId=&origineId=&codePrix=&pageSize=100'
  before_index + i.to_s + after_index
end
 
# Fetch search-result pages 0 through 71 with curl, storing each one under
# saq/<offset> where offset = page * 100, pausing 30 seconds after each request
# and printing the page number as progress.
0.upto(71) do |page|
  offset = page * 100
  `curl '#{url(offset)}' > saq/#{offset}`
  sleep(30)
  puts page.to_s
end
Products #
1
2
3
4
5
6
# get all product urls, save them to file
require 'hpricot'
# Collects, for each saved search page saq/search/<i*100> with i in 0..72, the
# unique hrefs that start with "ProductDisplay". wine_urls receives one array
# of hrefs per page that could be read.
# NOTE(review): nothing below persists wine_urls; if the URLs are meant to be
# saved to a file, that step is missing.
wine_urls = []
(0..72).each do |i|
  page_path = "saq/search/#{i*100}"
  unless File.exist?(page_path)
    # A missing page used to abort the whole run with Errno::ENOENT.
    warn "missing search page: #{page_path}"
    next
  end

  # File.read closes the file right away instead of leaving one open handle
  # per page, as open(...) without a block did.
  anchors = Hpricot(File.read(page_path)).search("a")
  wine_urls << anchors.collect {|e| e['href']}.uniq.select {|href| href =~ /^ProductDisplay/}
end