Skip to content

Instantly share code, notes, and snippets.

@duonoid
Forked from elskwid/washoe_eats.rb
Last active December 24, 2015 06:39
Show Gist options
  • Save duonoid/6758038 to your computer and use it in GitHub Desktop.
Save duonoid/6758038 to your computer and use it in GitHub Desktop.
I played with it a little, attempting to understand their site...
#
# usage: ruby -Imechanize extract_facilities.rb
require "mechanize"
# Walks the facility search results one page at a time and prints every
# facility as a tab-separated line (name, facility type, address) on stdout.
#
# One instance handles one page. Without a page number it submits the search
# form to obtain the first page; otherwise it re-submits the form extracted
# from the previous page's contents, asking for +page_number+.
class FacilityScraper
  class << self
    # Scrapes starting from the first results page.
    def call
      new.call
    end

    # Agent shared by all scraper instances; built by MechanizeFactory on
    # first use and memoized on the class.
    def mechanize
      @mechanize ||= MechanizeFactory.call
    end
  end

  attr_accessor :page_number, :last_page_contents

  # @param last_page_contents contents of the previously fetched page (nil for
  #   the first page); the form to re-submit is extracted from it
  # @param page_number page argument to request (nil for the first page)
  def initialize(last_page_contents = nil, page_number = nil)
    @page_number = page_number
    @last_page_contents = last_page_contents
  end

  # we may need these later
  # form.add_field!("__EVENTTARGET",'ctl00$ContentPlaceHolder1$grdHealth')
  # form.add_field!("__EVENTARGUMENT", 'Page$2')

  # Prints the rows of this page, then, if the parser reports a next page,
  # pauses and hands over to a fresh instance for that page.
  def call
    Eats::Facilities::Parser::Results.call(page_contents).each do |row|
      STDERR.puts "#{self.class}.#{__method__} #{row.inspect}" if $DEBUG
      line = [row[:name], row[:facility_type], row[:address]].map(&:to_s).join("\t")
      # rows without any of the three fields produce only tabs; skip them
      next if line.strip.empty?

      puts line
    end

    return unless next_page_number

    throttle_limit
    self.class.new(page_contents, next_page_number).call
  end

  private

  # Page argument of the following results page, as reported by the parser.
  def next_page_number
    Eats::Facilities::Parser::NextPage.call(page_contents)
  end

  # Form taken from the previous page, used to request the current one.
  def extracted_form
    Eats::Html::FormExtractor.call(last_page_contents)
  end

  # Contents of the page this instance is responsible for (fetched once).
  def page_contents
    @page_contents ||= page_number ? fetch_listed_page : fetch_first_page
  end

  # Requests +page_number+ by re-submitting the previous page's form.
  def fetch_listed_page
    STDERR.puts "pulling page: #{page_number}..." if $VERBOSE
    Eats::Facilities::ListPage.call(extracted_form, page_number)
  end

  # Runs the initial search to obtain the first results page.
  def fetch_first_page
    STDERR.puts "pulling first page..." if $VERBOSE
    Eats::Search::Form.call(mechanize)
  end

  # Fixed pause between page requests.
  def throttle_limit
    sleep 3
  end

  def mechanize
    self.class.mechanize
  end
end
module Eats
# Helpers that fetch the site's landing page and pull its form out.
module Html
  # Fetches the landing page, e.g.,
  # http://eats.washoecounty.us/
  class FirstPage
    BASE_URL = "http://eats.washoecounty.us/"

    class << self
      # @param mechanize an agent responding to +get+
      # @return whatever +mechanize.get+ returns for BASE_URL
      def call(mechanize)
        mechanize.get(BASE_URL)
      end
    end
  end

  # Looks up the form identified by FORM_ID on a fetched page.
  class FormExtractor
    FORM_ID = "aspnetForm"

    class << self
      # @param page_contents an object responding to +form+
      # @return whatever +page_contents.form+ returns for FORM_ID
      def call(page_contents)
        page_contents.form(FORM_ID)
      end
    end
  end
end
module Search
# Submits the facility search from the site's first page.
#
# Fetches the first page with the given agent, extracts its form, applies the
# Search and City filters to it, and submits it.
class Form
  attr_reader :mechanize

  # Builds a Form around +mechanize+ and submits it.
  #
  # @param mechanize the agent handed to Html::FirstPage to fetch the page
  # @return whatever the extracted form's +submit+ returns
  # @raise [RuntimeError] when no form can be extracted from the first page
  def self.call(mechanize)
    new(mechanize).call
  end

  def initialize(mechanize)
    @mechanize = mechanize
  end

  # Submits the filtered search form.
  def call
    form.submit
  end

  private

  # The filtered search form, built once per instance.
  def form
    @form ||= build_form
  end

  # Extracts the form from the first page and applies the filters to it.
  def build_form
    extracted = Html::FormExtractor.(page_contents)
    # Previously a missing form left @form nil, so the memoized reader kept
    # re-entering itself until the stack overflowed; fail with a clear
    # message instead.
    raise "search form not found on the first page" unless extracted

    Filter::Search.(extracted)
    Filter::City.(extracted)
    extracted
  end

  # The first page, fetched once per instance.
  def page_contents
    @page_contents ||= Html::FirstPage.(mechanize)
  end
end
# Small callables that each set one or two fields on a form object
# (anything responding to +[]=+). Each returns the last value it assigned.
module Filter
  # Sets the search button field to its submit value.
  class Search
    SEARCH_BUTTON_ID = "ctl00$ContentPlaceHolder1$btnSearch"
    SEARCH_VALUE = "Search"

    class << self
      def call(form)
        form[SEARCH_BUTTON_ID] = SEARCH_VALUE
      end
    end
  end

  # Sets the city field; the city defaults to "RENO".
  class City
    CITY_FIELD_ID = "ctl00$ContentPlaceHolder1$txtCity"

    class << self
      def call(form, city = "RENO")
        form[CITY_FIELD_ID] = city
      end
    end
  end

  # Sets the facility-type field; the type defaults to "Snackbar".
  class FacilityType
    ID = "ctl00$ContentPlaceHolder1$txtFacilityType"

    class << self
      def call(form, facility_type = "Snackbar")
        form[ID] = facility_type
      end
    end
  end

  # Points the form's event target at the results grid and stores the page
  # argument (converted to a string) as the event argument.
  class PageNumber
    class << self
      def call(form, page_number)
        form["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$grdHealth"
        form["__EVENTARGUMENT"] = "#{page_number}"
      end
    end
  end
end
end
module Facilities
# Requests one page of the results list by re-submitting a form.
class ListPage
  class << self
    # Applies the page-number and city filters to +form+, then submits it.
    #
    # @param form the form to fill in and submit
    # @param page_number page argument passed to the PageNumber filter
    # @return whatever +form.submit+ returns
    def call(form, page_number)
      apply_filters(form, page_number)
      form.submit
    end

    private

    # Page filter first, then the city filter with its default city.
    def apply_filters(form, page_number)
      Search::Filter::PageNumber.call(form, page_number)
      Search::Filter::City.call(form)
    end
  end
end
module Parser
# Turns the rows of the results table into hashes keyed by column name.
class Results
  # Column names, in the order the cells appear in a row.
  FACILITY_COLS = %w[link name score facility_type address inspection_date].freeze

  # @param page_contents page handed to ResultsTable to locate the table
  # @return [Array<Hash>] one hash per row that is not centre-aligned
  def self.call(page_contents)
    new(page_contents).call
  end

  attr_reader :page_contents

  def initialize(page_contents)
    @page_contents = page_contents
  end

  # Parses every row except those with align="center".
  #
  # A row without +td+ cells yields an empty hash. Cells beyond the known
  # columns are ignored, and a first cell without an anchor yields an empty
  # link instead of raising.
  def call
    data_rows.map { |tr| parse_row(tr) }
  end

  private

  # Table rows minus the centre-aligned ones.
  def data_rows
    results.xpath('tr').reject { |tr| tr[:align] == "center" }
  end

  # Builds the hash for one row from its +td+ cells.
  def parse_row(tr)
    row = {}
    tr.xpath('td').each_with_index do |td, i|
      column = FACILITY_COLS[i]
      # more cells than known columns: nothing left to name them with
      break unless column

      row[column.to_sym] = i.zero? ? link_href(td) : td.text.to_s.strip
    end
    row
  end

  # The href of the cell's anchor, or "" when the cell has no anchor.
  def link_href(td)
    anchor = td.at_css("a")
    (anchor ? anchor[:href] : nil).to_s.strip
  end

  def results
    ResultsTable.(page_contents)
  end
end
# Locates the results table on a fetched page.
class ResultsTable
  # CSS selector of the results grid.
  RESULTS_TABLE_ID = "table#ctl00_ContentPlaceHolder1_grdHealth"

  class << self
    # @param page_contents an object responding to +search+
    # @return whatever +page_contents.search+ returns for the selector
    def call(page_contents)
      page_contents.search(RESULTS_TABLE_ID)
    end
  end
end
# Finds the pager link that follows the current page and returns its page
# argument.
class NextPage
  # The whole "Page$<digits>" token inside the pager link's href.
  PAGE_ARGUMENT = /Page\$\d+/

  # @param page_contents page handed to ResultsTable to locate the table
  # @return [String, nil] the "Page$N" token of the next page's link, or nil
  #   when there is no such link (e.g. on the last page)
  def self.call(page_contents)
    new(page_contents).call
  end

  attr_reader :page_contents

  def initialize(page_contents)
    @page_contents = page_contents
  end

  # Returns the next page's argument, or nil when there is none.
  def call
    next_page_number
  end

  private

  # The search result is a node set, which is truthy even when empty, so the
  # presence of a next page cannot be decided by truthiness. Read the href
  # (nil when nothing matched) and let the regexp lookup return nil when
  # there is no page token in it.
  def next_page_number
    next_page.attribute("href").to_s[PAGE_ARGUMENT]
  end

  # Link in the cell right after the pager cell that holds a <span>.
  def next_page
    results.search('//td/table/tr/td[span]/following-sibling::td[1]/a[@href]')
  end

  def results
    ResultsTable.(page_contents)
  end
end
end
end
end
# Builds the Mechanize agent used by the scraper.
class MechanizeFactory
  class << self
    # @return a new, configured Mechanize instance
    def call
      new.call
    end
  end

  # Creates a Mechanize instance and applies the settings below.
  def call
    setup_agent(Mechanize.new)
  end

  protected

  # Turns on keep-alive and retrying of change requests, sets a 60 second
  # read timeout, and selects the 'Mac Safari' user agent alias. HTTP debug
  # output goes to $stderr when Ruby runs with $DEBUG set.
  #
  # @return the same +mechanize+ object that was passed in
  def setup_agent(mechanize)
    http_agent = mechanize.agent
    http_agent.keep_alive = true
    http_agent.read_timeout = 60
    http_agent.retry_change_requests = true
    http_agent.http.debug_output = $stderr if $DEBUG
    mechanize.user_agent_alias = 'Mac Safari'
    mechanize
  end
end
# Script entry point: announce the run when Ruby is in verbose mode, then
# scrape every results page.
if $VERBOSE
  STDERR.puts ">> running scraper..."
end
FacilityScraper.call
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment