danielharan (owner)

Revisions

gist: 16810 Download_button fork
public
Public Clone URL: git://gist.github.com/16810.git
Embed All Files: show embed
scrape_cbc_ca.rb #
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
require 'rubygems'
require 'mechanize'
 
# Load the postal codes to fetch: one entry per line of postal_codes.txt.
# File.read closes the file once its contents are read, whereas a bare
# File.open(...).read leaves the handle open until it is garbage collected.
postal_codes = File.read("postal_codes.txt").split("\n")
# randomize to make the pattern slightly harder to see in logs
postal_codes = postal_codes.shuffle
 
# Returns the name (relative to pages/) under which the page for +postcode+
# is stored: the postal code with every whitespace character removed.
#
# All whitespace is stripped, not just the first space, so that codes with
# doubled or trailing whitespace (e.g. a stray "\r") map to the same file as
# their clean form; scrape splits on any run of whitespace in the same way.
def filename(postcode)
  postcode.gsub(/\s+/, '')
end
 
# Shared HTTP client used by scrape.
# Newer mechanize releases expose the class as top-level Mechanize and no
# longer provide the WWW:: namespace; older releases only have WWW::Mechanize.
# Pick whichever one the installed gem defines.
mechanize_class = defined?(::Mechanize) ? ::Mechanize : ::WWW::Mechanize

@agent = mechanize_class.new do |a|
  # Identify as a desktop browser rather than as a script.
  a.user_agent_alias = 'Mac Safari'
  # Each page is fetched exactly once, so keep only one page of history.
  a.max_history = 1
end
 
# Fetches the CBC "myriding" page for +postcode+ through @agent and writes
# the response body to pages/<filename(postcode)>.
#
# The postal code is split on whitespace; its first two parts, lowercased,
# together with the lowercased first character of the code, form the URL path.
# Any error raised by the HTTP request or the file write propagates to the
# caller.
def scrape(postcode)
  parts = postcode.split(' ').map { |part| part.downcase }
  fsa = parts[0]
  ldu = parts[1]
  initial = postcode[0, 1].downcase

  url = "http://www.cbc.ca/news/canadavotes/myriding/postalcodes/#{initial}/#{fsa}/#{ldu}.html"
  page = @agent.get(url)

  # The file is only created once the request has succeeded.
  target = "pages/#{filename(postcode)}"
  File.open(target, "w") { |out| out.puts page.body }
end
 
# Download the page for every postal code that is not already saved under
# pages/, pausing a randomized 2-5 seconds between successful requests.
#
# A failed download is retried after a 20 second pause, but only up to
# max_attempts times: a permanent failure (for example a postal code the
# site has no page for) would otherwise retry forever and block all
# remaining codes. Codes that are given up on have no file in pages/, so
# simply re-running the script attempts them again.
max_attempts = 5

postal_codes.each do |postcode|
  # File.exist? is used because File.exists? is gone from current Ruby.
  next if File.exist?("pages/#{filename(postcode)}")

  attempts = 0
  begin
    attempts += 1
    scrape(postcode)
    sleep(2 + (rand(3_000) / 1_000.0))
  rescue => e
    if attempts < max_attempts
      puts "could not get #{postcode}, sleeping for a bit"
      sleep 20
      retry
    end
    puts "giving up on #{postcode} after #{attempts} attempts: #{e.message}"
  end
end