Skip to content

Instantly share code, notes, and snippets.

@arvinwiyono
Created April 24, 2017 09:09
Show Gist options
  • Save arvinwiyono/fde7ca9ddc54c467d748496ff7bbe83f to your computer and use it in GitHub Desktop.
Save arvinwiyono/fde7ca9ddc54c467d748496ff7bbe83f to your computer and use it in GitHub Desktop.
FIT Research Methods
require 'capybara'
require 'capybara/dsl'
require 'capybara/poltergeist'
require 'csv'
require 'date'
include Capybara::DSL
# Register the :poltergeist driver. The options hash is handed straight to
# Capybara::Poltergeist::Driver: `js_errors: false` and a PhantomJS
# command-line switch asking it not to load images.
Capybara.register_driver(:poltergeist) do |app|
  Capybara::Poltergeist::Driver.new(
    app,
    { js_errors: false, phantomjs_options: ['--load-images=no'] }
  )
end

# Use Poltergeist by default and make XPath the default selector type, so the
# bare `find`/`all` calls in this script take XPath expressions.
Capybara.configure do |config|
  config.default_driver = :poltergeist
  config.default_selector = :xpath
end
# Page holding the singles ranking table that is scraped.
URL = 'http://www.atpworldtour.com/en/rankings/singles'.freeze
# Constant value written to the "sex" column of every CSV row.
SEX = 'MALE'.freeze

# XPath selectors for the ranking page. The row and cell paths are relative
# (they start with "./") and are evaluated against the container / each row.
CONTAINER_PATH = '//table[@class="mega-table"]/tbody'.freeze
BOUNDARY_PATH = './tr'.freeze
WORLD_RANKING_PATH = './td[1]'.freeze
NAME_PATH = './td[4]/a'.freeze

# XPath selectors for a player's details page.
BIRTH_YEAR_PATH = '//span[@class="table-birthday"]'.freeze
HEIGHT_PATH = '//span[@class="table-height-cm-wrapper"]'.freeze
WEIGHT_PATH = '//span[@class="table-weight-kg-wrapper"]'.freeze
# The div immediately following the "Plays" label div.
HANDED_PATH = '//div[contains(., "Plays") and @class="table-label"]/following-sibling::div[1]'.freeze
COUNTRY_PATH = '//div[@class="player-flag-code"]'.freeze
# Scrape the singles ranking table at URL and write one CSV row per player to
# men_ranking.csv. For each table row the player's details page is visited, a
# screenshot is saved under ./screenshots/, and the profile fields are parsed.
session = Capybara::Session.new(:poltergeist)
puts "Visiting site with URL: #{URL}"
session.visit(URL)
rows = session.find(CONTAINER_PATH).all(BOUNDARY_PATH)
puts "Extracting rows..."

# Details pages are loaded in a separate session so that `session` is never
# navigated away from the ranking page while its rows are being iterated.
# A single session is reused for every player instead of creating (and never
# releasing) a fresh session and driver per row.
dp_session = Capybara::Session.new(:poltergeist)

# Block form closes the file even when a lookup raises part-way through.
CSV.open('men_ranking.csv', 'wb') do |csv|
  csv << ["world_rank","lastname","firstname","country","birth_year","height","weight","handed", "sex"]

  rows.each do |row|
    world_rank = row.find(WORLD_RANKING_PATH).text.to_i

    name_link = row.find(NAME_PATH)
    # First word is the first name; every remaining word belongs to the last
    # name, so multi-word surnames are kept rather than truncated.
    firstname, *surname_parts = name_link.text.split.map(&:capitalize)
    lastname = surname_parts.join(' ') unless surname_parts.empty?
    puts "Extracting #{firstname} #{lastname}"

    details_page_url = name_link[:href]
    puts "Visiting #{details_page_url}"
    dp_session.visit details_page_url
    # Spaces (from multi-word surnames) become underscores in the file name.
    dp_session.save_screenshot("./screenshots/#{firstname}_#{lastname}.png".tr(' ', '_'))

    # Height/weight are taken from the parenthesised metric value, i.e. text
    # shaped like "(NNNcm)" / "(NNkg)". No match yields nil, hence 0.0 / 0.
    height = dp_session.find(HEIGHT_PATH).text[/\((\d+)cm\)/i, 1].to_f
    weight = dp_session.find(WEIGHT_PATH).text[/\((\d+)kg\)/i, 1].to_i

    # Only the part before the first comma is inspected; `to_s` keeps an empty
    # cell from raising. Anything not mentioning "left" is recorded as RIGHT.
    raw_handed = dp_session.find(HANDED_PATH).text.split(',').first.to_s.strip
    handed = raw_handed =~ /left/i ? 'LEFT' : 'RIGHT'

    country = dp_session.find(COUNTRY_PATH).text
    # Captures everything between "(" and the first "." and converts it to an
    # integer (presumably a "(YYYY.MM.DD)" date - confirm against the page).
    birth_year = dp_session.find(BIRTH_YEAR_PATH).text.strip[/\(([^\.]+)/,1].to_i
    puts 'Finished details page extraction...'

    csv << [world_rank,lastname,firstname,country,birth_year,height,weight,handed, SEX]
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment