Skip to content

Instantly share code, notes, and snippets.

@tachiba
Created October 18, 2012 04:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tachiba/3909926 to your computer and use it in GitHub Desktop.
Save tachiba/3909926 to your computer and use it in GitHub Desktop.
parser base
require 'open-uri'
require 'timeout'
require 'resolv-replace'
require 'mechanize'
# Namespace for scraping helpers.
module Parser
  # Throttled, retried HTTP fetching and HTML/feed parsing helpers.
  #
  # The module defines +initialize+ and instance readers, so it appears to be
  # intended for inclusion into scraper classes. An including class that
  # defines its own +initialize+ must call +super()+ so that +@agent+ is set.
  #
  # Every network helper runs through #try_to, which applies a per-attempt
  # timeout and retries on failure, and sleeps INTERVAL seconds after each
  # successful request.
  module Base
    attr_reader :current_html, :agent

    # User-Agent header sent by #parse (the Mechanize agent is not configured
    # with it here).
    USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6".freeze
    # Shared log sink for all parser activity.
    LOGGER = ActiveSupport::BufferedLogger.new(Rails.root.join('log/parser.log'))
    # Seconds to sleep after each successful request.
    INTERVAL = 0.3
    # Seconds a single attempt may take before it is aborted.
    TIMEOUT_INTERVAL = 10
    # Default number of attempts made by #try_to (total attempts, not extra retries).
    RETRY = 3
    # Raised by #try_to when every attempt has failed.
    ParseFailed = Class.new(StandardError)
    # Default character set used by #parse.
    CHARSET = "UTF-8".freeze

    # Creates the Mechanize agent used by #get and #post and attaches the
    # shared logger to it.
    def initialize
      @agent = Mechanize.new
      @agent.log = logger
    end

    # Sends a POST request through the Mechanize agent and stores the
    # response's +root+ in #current_html.
    #
    # @param url [String] target URL
    # @param q [Hash] form parameters
    # @param h [Hash] extra request headers
    # @return the value of the trailing +sleep+; read the page via #current_html
    # @raise [ParseFailed] when all attempts fail
    def post(url, q = {}, h = {})
      try_to(url: url, queries: q, headers: h) do
        @current_html = @agent.post(url, q, h).root
        sleep(INTERVAL)
      end
    end

    # Sends a GET request through the Mechanize agent and stores the
    # response's +root+ in #current_html.
    #
    # A 'Referer' entry in +h+ is passed to Mechanize as the referer argument
    # rather than as a plain header.
    #
    # @param url [String] target URL
    # @param q [Hash, Array] query parameters
    # @param h [Hash] extra request headers; not modified
    # @return the value of the trailing +sleep+; read the page via #current_html
    # @raise [ParseFailed] when all attempts fail
    def get(url, q = {}, h = {})
      # Split the referer out once, on a copy: doing it inside the retry block
      # would mutate the caller's hash and drop the referer on every retry.
      headers = h.dup
      referer = headers.delete('Referer')
      # Empty parameters are passed to Mechanize as an empty Array.
      params = q.empty? ? [] : q

      try_to(url: url, queries: q, headers: h) do
        @current_html = @agent.get(url, params, referer, headers).root
        sleep(INTERVAL)
      end
    end

    # Downloads a URL with open-uri and returns the raw response body.
    #
    # @param url [String] target URL
    # @param q [Hash] query parameters appended to +url+ (serialized with +to_query+)
    # @return [String] response body
    # @raise [ParseFailed] when all attempts fail
    def read(url, q = {})
      # Only append a query string when there is one, and join with '&' when
      # the URL already carries parameters.
      url_with_param =
        if q.empty?
          url
        else
          separator = url.include?('?') ? '&' : '?'
          "#{url}#{separator}#{q.to_query}"
        end

      try_to(url: url_with_param, queries: q) do
        # URI#open (from open-uri) never spawns a subprocess for "|cmd"
        # strings, unlike Kernel#open, and the block form closes the stream.
        body = URI.parse(url_with_param).open { |f| f.read }
        sleep(INTERVAL)
        body
      end
    end

    # Downloads a URL (sending USER_AGENT) and parses it as HTML.
    #
    # @param url [String] target URL
    # @param opt [Hash] options; +:charset+ overrides CHARSET. Not modified.
    # @return the document produced by +Nokogiri::HTML+
    # @raise [ParseFailed] when all download attempts fail
    def parse(url, opt = {})
      charset = opt[:charset] || CHARSET

      # Read the body inside the block so the underlying stream is closed;
      # parsing happens afterwards, outside the timeout/retry wrapper.
      html = try_to(url: url) do
        body = URI.parse(url).open("User-Agent" => USER_AGENT) { |f| f.read }
        sleep(INTERVAL)
        body
      end

      Nokogiri::HTML(html, nil, charset)
    end

    # Fetches and parses a feed with Feedzirra.
    #
    # @param url [String] feed URL
    # @return whatever +Feedzirra::Feed.fetch_and_parse+ returns
    # @raise [ParseFailed] when all attempts fail
    def parse_feed(url)
      try_to(url: url) do
        feed = Feedzirra::Feed.fetch_and_parse(url)
        sleep(INTERVAL)
        feed
      end
    end

    # @return the shared LOGGER
    def logger
      LOGGER
    end

    # Runs the block with a timeout, retrying when it times out or raises.
    #
    # Each attempt is logged; failures are logged with their class and
    # message and then retried immediately.
    #
    # @param opt [Hash] context that is logged and embedded in the failure
    #   message; +:retry_count+ overrides RETRY and is left out of the logged
    #   context. Not modified.
    # @yieldparam i [Integer] zero-based attempt number
    # @return the block's result from the first attempt that succeeds
    # @raise [ArgumentError] when no block is given
    # @raise [ParseFailed] when every attempt fails
    def try_to(opt = {})
      raise ArgumentError, 'block required' unless block_given?

      opt = opt.dup
      retry_count = opt.delete(:retry_count) || RETRY

      retry_count.times do |i|
        begin
          logger.info "TRY##{i}".foreground(:blue) + " #{opt.inspect}"
          return Timeout.timeout(TIMEOUT_INTERVAL) { yield(i) }
        rescue Timeout::Error, StandardError => e
          # Timeout::Error is listed explicitly because it has not been a
          # StandardError on every Ruby version.
          logger.error e.class.to_s.foreground(:red)
          logger.error e.message.foreground(:red)
        end
      end

      raise ParseFailed, "Failed #{opt.inspect}"
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment