Last active
June 27, 2017 20:10
-
-
Save giannafusaro/99626c1a01f5598e982d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'capybara' | |
require 'capybara/poltergeist' | |
# Scrapes a single web page through a headless PhantomJS browser driven by
# Capybara's Poltergeist driver.
#
# After a call to #get, the scraped data is exposed through the accessors
# +title+, +metas+, +text+, +document+ and +globals+.
#
# NOTE(review): #get shuts the driver down when it finishes, so an instance is
# presumably single-use — confirm before calling #get twice on one Scraper.
class Scraper
  include Capybara::DSL

  # JavaScript evaluated in the page to list the enumerable properties of
  # +window+ that are not on the blacklist below, are not BarProp/Navigator
  # instances and do not contain "webkit" in their name.
  #
  # Every continuation line is indented by at least two spaces, so the gsub
  # removes all newlines plus indentation and collapses the script to one line.
  GLOBALS_SCRIPT =
    "(function getGlobals() {
      var globals = [],
      globalsBlacklist = ['__commandLineAPI','applicationCache','chrome','closed','console','crypto','CSS','defaultstatus',
      'defaultStatus','devicePixelRatio','document','external','frameElement','history','indexedDB','innerHeight',
      'innerWidth','length','localStorage','location','name','offscreenBuffering','opener','outerHeight','outerWidth',
      'pageXOffset','pageYOffset','performance','screen','screenLeft','screenTop','screenX','screenY','scrollX',
      'scrollY','sessionStorage','speechSynthesis','status','styleMedia', 'window'];
      for (var prop in window) {
        if ( !(window[prop] instanceof BarProp) && !(window[prop] instanceof Navigator) &&
        (prop.indexOf('webkit') == -1) && (globalsBlacklist.indexOf(prop)== -1) ) {
          globals.push(prop);
        }
      }
      return globals;
    }())".gsub(/\n\s{2,}/, '').freeze

  attr_accessor :text, :document, :metas, :title, :session, :globals, :globals_script

  # Settings for the Poltergeist driver: JavaScript errors on the page are not
  # raised, images are not loaded and SSL errors are ignored.
  Capybara.register_driver :poltergeist do |app|
    Capybara::Poltergeist::Driver.new(
      app,
      js_errors: false,
      phantomjs_options: ['--load-images=no', '--ignore-ssl-errors=yes'],
      debug: false,
      inspector: false
    )
  end

  # Global side effect: Poltergeist becomes Capybara's default driver as soon
  # as this class is loaded.
  Capybara.default_driver = :poltergeist

  # Creates a dedicated Capybara session backed by the Poltergeist driver and
  # sets the User-Agent header sent with its requests.
  def initialize
    @session = Capybara::Session.new(:poltergeist)
    @session.driver.headers = { 'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X)' }
    # Hand out a mutable copy so callers may still modify the script in place.
    @globals_script = GLOBALS_SCRIPT.dup
  end

  # Visits +url+ and stores the page's document, title, <meta> attributes,
  # text and global JavaScript property names on the instance.
  #
  # The driver is shut down afterwards, including when any step raises, so a
  # failed visit does not leave the browser process running.
  #
  # @param url [String] address of the page to scrape
  # @return [nil]
  def get(url)
    @session.visit(url)
    # NOTE(review): the document belongs to the session's driver, so it is
    # presumably unusable once the driver has quit — confirm.
    @document = @session.document
    @title = @session.title
    # <meta> tags are never rendered, hence visible: false.
    @metas = @session.find_all('meta', visible: false).map { |meta| meta.native.attributes }
    # NOTE(review): Capybara's #text takes a visibility type (:all/:visible),
    # not a selector; 'body' presumably falls through to "visible text" —
    # confirm that this is the intent.
    @text = @document.text('body')
    @globals = @session.evaluate_script(@globals_script)
    puts "globals: #{@globals}"
  ensure
    # Shut down the driver; there is no need for it to hang around.
    @session.driver.quit
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment