Skip to content

Instantly share code, notes, and snippets.

@arvinwiyono
Last active April 24, 2017 09:28
Show Gist options
  • Save arvinwiyono/6b10a087163a826c18f4dace800af564 to your computer and use it in GitHub Desktop.
Save arvinwiyono/6b10a087163a826c18f4dace800af564 to your computer and use it in GitHub Desktop.
FIT Research Methods
require 'capybara'
require 'capybara/dsl'
require 'capybara/poltergeist'
require 'csv'
include Capybara::DSL
# Register the :poltergeist driver and make it, together with XPath selectors,
# the Capybara default for the rest of the script.
#
# Driver options:
#   js_errors: false                 - per the Poltergeist README, JavaScript
#                                      errors on the page are not re-raised as
#                                      Ruby exceptions.
#   phantomjs_options: --load-images=no - command-line flag handed to PhantomJS.
Capybara.register_driver(:poltergeist) do |app|
  Capybara::Poltergeist::Driver.new(
    app,
    { js_errors: false, phantomjs_options: ['--load-images=no'] }
  )
end

# Every bare locator passed to find/all below is therefore an XPath expression.
Capybara.default_driver = :poltergeist
Capybara.default_selector = :xpath
# Rankings page that the scraper starts from.
URL = 'http://www.wtatennis.com/singles-rankings'.freeze

# <tbody> of the element with id "myTable"; each <tr> below it is one player row.
CONTAINER_PATH = '//*[@id="myTable"]/tbody'.freeze
BOUNDARY_PATH = './tr'.freeze

# Fixed value written into the "sex" column of every CSV row.
SEX = 'FEMALE'.freeze

# Cell locations, relative to a single ranking-table row.
WORLD_RANKING_PATH = './td[2]'.freeze
NAME_PATH = './td[3]/a'.freeze
COUNTRY_PATH = './td[4]'.freeze
BIRTH_YEAR_PATH = './td[5]'.freeze

# In the details page: each path selects the parent element of the <strong>
# whose text is exactly the given label.
HEIGHT_PATH = '//strong[.="Height"]/..'.freeze
HANDED_PATH = '//strong[.="Plays"]/..'.freeze
WEIGHT_PATH = '//strong[.="Weight"]/..'.freeze
# Scrape the WTA singles rankings and write one CSV row per player to
# women_ranking_2.csv.
#
# For every selected <option> of the rankings dropdown the ranking table is read
# row by row; each player's details page is then visited to pick up height,
# weight and playing hand.
session = Capybara::Session.new(:poltergeist)

# One reusable session for the player details pages, created once instead of
# once per table row so sessions do not pile up. It is kept separate from
# `session` -- presumably so the table nodes held in `rows` stay usable while
# the details pages are being visited.
details_session = Capybara::Session.new(:poltergeist)

puts "Visiting site with URL: #{URL}"
session.visit(URL)

# All <option> elements except the first two, taken from the first element whose
# class attribute is exactly "rankings-rank-change".
# NOTE(review): why the first two options are skipped is not visible here --
# confirm against the live page.
rank_options = session.all('//*[@class="rankings-rank-change"]').first.all('./option[position()>2]')

# The file is opened exactly once, outside the option loop, and in block form.
# Reopening it with 'wb' for every option truncated the rows already written,
# and a handle assigned inside the loop block is not visible after the loop, so
# it could never be closed. The block form closes the file even if scraping
# raises part-way through.
CSV.open('women_ranking_2.csv', 'wb') do |csv|
  csv << ["world_rank","lastname","firstname","country","birth_year","height","weight","handed", "sex"]

  rank_options.each do |option|
    puts 'Selecting option...'
    option.select_option
    # Fixed pause -- presumably gives the table time to reload after the
    # selection; TODO confirm and replace with an explicit wait if possible.
    sleep 5
    rows = session.find(CONTAINER_PATH).all(BOUNDARY_PATH)
    puts "Extracting rows..."

    rows.each do |row|
      world_rank = row.find(WORLD_RANKING_PATH).text.to_i

      # Look the name link up once; it supplies both the display name and the
      # details-page URL.
      name_link = row.find(NAME_PATH)
      # The link text is split on ',' with the part before the comma taken as
      # the last name.
      lastname, firstname = name_link.text.split(',').map(&:strip).map(&:capitalize)
      country = row.find(COUNTRY_PATH).text.capitalize
      # Trailing digits of the cell. String#[] returns nil (-> 0) when there are
      # none, instead of raising the way match(...)[1] does.
      birth_year = row.find(BIRTH_YEAR_PATH).text[/(\d+)$/, 1].to_i
      puts "Extracting #{firstname} #{lastname}"

      details_page_url = name_link[:href]
      puts "Visiting #{details_page_url}"
      details_session.visit details_page_url
      details_session.save_screenshot("./screenshots/#{firstname}_#{lastname}.png")

      # "(<metres> m)" -> centimetres; rounded to one decimal so binary float
      # noise (e.g. 114.99999999999999) never reaches the CSV.
      height = (details_session.find(HEIGHT_PATH).text[/\((.+) m\)/, 1].to_f * 100).round(1)
      weight = details_session.find(WEIGHT_PATH).text[/(\d+) kg/, 1].to_i
      # First whitespace-free token after ": "; anything not containing "left"
      # (including a missing value) is recorded as RIGHT.
      raw_handed = details_session.find(HANDED_PATH).text[/:\s([^\s]+)/, 1]
      handed = raw_handed =~ /left/i ? 'LEFT' : 'RIGHT'
      puts 'Finished details page extraction...'

      csv << [world_rank,lastname,firstname,country,birth_year,height,weight,handed, SEX]
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment