Skip to content

Instantly share code, notes, and snippets.

@shawn0lds442
Created December 21, 2011 00:48
Show Gist options
  • Save shawn0lds442/1504001 to your computer and use it in GitHub Desktop.
Save shawn0lds442/1504001 to your computer and use it in GitHub Desktop.
Web Scraper for Data Mining
require 'mechanize'
require 'active_record'
require 'nokogiri'
require 'sqlite3'
require 'csv'
# Connect ActiveRecord to the SQLite database file named "db".
ActiveRecord::Base.establish_connection(
  :adapter  => "sqlite3",
  :database => "db"
)

# Define the two tables the script works with. :force => true drops and
# recreates each table, so every run starts from empty tables.
ActiveRecord::Schema.define do
  # Business listings collected by the scraper.
  create_table :locations, :force => true do |t|
    t.string   :name, :address, :city, :zip, :phone, :website
    t.datetime :created_at, :updated_at
  end

  # Zip code reference rows loaded from the CSV file.
  create_table :zip_codes, :force => true do |t|
    t.string   :zip
    t.decimal  :lat, :lng, :precision => 15, :scale => 10
    t.string   :city, :state, :the_type
    t.datetime :created_at, :updated_at
  end
end
# ActiveRecord model for a scraped business listing. By ActiveRecord naming
# convention it is backed by the "locations" table; the script creates records
# with :name, :address, :phone, :website, :city and :zip attributes.
class Location < ActiveRecord::Base
end
# ActiveRecord model for a zip code reference row. By ActiveRecord naming
# convention it is backed by the "zip_codes" table; records are created from
# the rows of zip_codes.csv.
class ZipCode < ActiveRecord::Base
end
# Load zip_codes.csv into the database, creating one ZipCode per CSV row.
# Fields are taken by position: zip, lat, lng, city, state, the_type.
# Fields missing from a short row come through as nil.
CSV.foreach("zip_codes.csv") do |zip, lat, lng, city, state, the_type|
  ZipCode.create(
    :zip      => zip,
    :lat      => lat,
    :lng      => lng,
    :city     => city,
    :state    => state,
    :the_type => the_type
  )
  # Progress output: one line per stored row.
  puts "stored #{zip}"
end
# Returns the whitespace-trimmed text of the first <td> inside +row+, or nil
# when the row contains no <td> at all.
def first_cell_text(row)
  cell = row.at_css("td")
  cell && cell.text.strip
end

# Builds one hash per business listing from the <tr> rows under +result_div+.
#
# Rows are recognised by their id attribute: a BusinessName row starts a new
# listing, FullAddress and Phone rows fill it in, and a Website row completes
# the listing and appends it to the result.
#
# @param result_div [Nokogiri::XML::Node] element containing the result rows
# @return [Array<Hash>] listings with :name and :website, plus :address and
#   :phone when those rows were present
def extract_listings(result_div)
  listings = []
  # Kept local to this call so a half-built listing can never carry over
  # into the rows of another page.
  current = nil

  result_div.css("tr").each do |row|
    id = row.attribute("id").to_s

    current = { :name => first_cell_text(row) } if id =~ /BusinessName/
    # Detail rows seen before any BusinessName row have nothing to attach to.
    next unless current

    current[:address] = first_cell_text(row) if id =~ /FullAddress/
    current[:phone] = first_cell_text(row) if id =~ /Phone/
    if id =~ /Website/
      current[:website] = first_cell_text(row)
      listings << current
    end
  end

  listings
end

# For every distinct city in zip_codes, post a city search to the site, parse
# the result rows and store each listing as a Location.
zip_codes = ZipCode.select("DISTINCT city")
zip_codes.each do |zip_code|
  city = zip_code.city.to_s
  # A blank city would submit a search with every location field empty.
  next if city.strip.empty?

  agent = Mechanize.new
  page = agent.post(
    'http://www.geosprawl.com/Default.aspx?alias=www.geosprawl.com/norvell',
    [
      ["Searching", 1],
      ["city", city],
      ["cname", "BizSearchResult"],
      ["country", ""],
      ["fullcrit", ""],
      ["m", "40"],
      ["state", ""],
      ["tag", "All"],
      ["zip", ""]
    ]
  )

  doc = Nokogiri::HTML.parse(page.body)
  result_div = doc.at_css(".repeaterscroll")
  # at_css returns nil when the page has no element with that class.
  unless result_div
    puts "no results for #{city}"
    next
  end

  listings = extract_listings(result_div)
  # Only the city column is selected above, so the city (not the zip) is the
  # one attribute available for the progress message.
  puts "scraped #{city}"

  listings.each_with_index do |listing, num|
    address = listing[:address].to_s
    # Assumes the address ends with "<city> <state> <zip>": the city is the
    # third token from the end (only the last word of a multi-word city name)
    # and the zip is the last run of five digits, so a five-digit street
    # number at the front is not mistaken for it.
    listing[:city] = address.split(' ')[-3]
    listing[:zip] = address.scan(/[0-9]{5}/).last.to_s
    Location.create(listing)
    puts "stored #{num}"
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment