Skip to content

Instantly share code, notes, and snippets.

@giannafusaro
Last active June 27, 2017 20:10
Show Gist options
  • Save giannafusaro/99626c1a01f5598e982d to your computer and use it in GitHub Desktop.
Save giannafusaro/99626c1a01f5598e982d to your computer and use it in GitHub Desktop.
require 'capybara'
require 'capybara/poltergeist'
# Loads a single page in a headless PhantomJS browser (via Capybara's
# Poltergeist driver) and records what it found on the instance:
#
#   text           - text of the loaded document
#   document       - the Capybara document node
#   metas          - attributes of every <meta> element (hidden ones included)
#   title          - the page title
#   session        - the underlying Capybara::Session
#   globals        - names of the non-standard properties found on `window`
#   globals_script - the JavaScript used to collect `globals`
#
# Loading this class registers the :poltergeist driver and makes it
# Capybara's default driver for the whole process.
class Scraper
  include Capybara::DSL

  # JavaScript that returns the names of all properties on `window` which are
  # not in the blacklist of standard browser globals, are not BarProp or
  # Navigator instances, and do not contain 'webkit' in their name.
  #
  # The script is written across several lines for readability and collapsed
  # to a single line here. Every newline plus any indentation that follows it
  # is removed, so the result does not depend on how the source is indented.
  GLOBALS_SCRIPT =
    "(function getGlobals() {
      var globals = [],
      globalsBlacklist = ['__commandLineAPI','applicationCache','chrome','closed','console','crypto','CSS','defaultstatus',
      'defaultStatus','devicePixelRatio','document','external','frameElement','history','indexedDB','innerHeight',
      'innerWidth','length','localStorage','location','name','offscreenBuffering','opener','outerHeight','outerWidth',
      'pageXOffset','pageYOffset','performance','screen','screenLeft','screenTop','screenX','screenY','scrollX',
      'scrollY','sessionStorage','speechSynthesis','status','styleMedia', 'window'];
      for (var prop in window) {
        if ( !(window[prop] instanceof BarProp) && !(window[prop] instanceof Navigator) &&
        (prop.indexOf('webkit') == -1) && (globalsBlacklist.indexOf(prop)== -1) ) {
          globals.push(prop);
        }
      }
      return globals;
    }())".gsub(/\n\s*/, '').freeze

  attr_accessor :text, :document, :metas, :title, :session, :globals, :globals_script

  # Settings for the Poltergeist driver: JavaScript errors raised by the page
  # are ignored, images are not loaded, and SSL errors are ignored.
  Capybara.register_driver :poltergeist do |app|
    Capybara::Poltergeist::Driver.new(
      app,
      js_errors: false,
      phantomjs_options: ['--load-images=no', '--ignore-ssl-errors=yes'],
      debug: false,
      inspector: false
    )
  end
  Capybara.default_driver = :poltergeist

  # Creates a new Capybara session backed by the :poltergeist driver and sets
  # the User-Agent header sent with its requests.
  def initialize
    @session = Capybara::Session.new(:poltergeist)
    @session.driver.headers = { 'User-Agent' => "Mozilla/5.0 (Macintosh; Intel Mac OS X)" }
    # Hand out a private, mutable copy so callers using the accessor cannot
    # hit the frozen constant.
    @globals_script = GLOBALS_SCRIPT.dup
  end

  # Visits +url+ and populates document, title, metas, text and globals.
  # Also prints the collected globals to stdout.
  #
  # The driver is shut down when this method finishes, including when the
  # visit or any of the lookups raises, so a failed scrape does not leave the
  # browser process behind. Results are read through the accessors; the
  # return value is not meaningful.
  #
  # NOTE(review): because the driver is quit after every call, confirm that
  # Poltergeist restarts PhantomJS before calling `get` twice on one instance.
  #
  # @param url [String] address of the page to load
  def get(url)
    @session.visit url
    @document = @session.document
    @title = @session.title
    @metas = @session.find_all('meta', visible: false).map(&:native).map(&:attributes)
    # NOTE(review): 'body' looks like it is received as Capybara's text
    # `type` argument rather than as a selector -- confirm the intent.
    @text = @document.text 'body'
    @globals = @session.evaluate_script(@globals_script)
    puts "globals: #{@globals}"
  ensure
    # shut down driver, no need to hang around
    @session.driver.quit
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment