Last active
October 26, 2016 12:30
-
-
Save sumoward/469e655cc70dbf82b7ecebafc78a3a22 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require 'open-uri' | |
require 'nokogiri' | |
require 'mechanize' | |
require 'csv' | |
# Logs in to aircraftbluebook.com, loads the page for one aircraft, selects
# the wholesale option, and appends the resulting value table to a CSV file
# named after the first header cell of the table.
#
# A new Mechanize session is created (and a login performed) on every call.
#
# @param row [Array<String>] one line of the input CSV. row[0] and row[1] are
#   written as an identifying line in the output file; row[2] is the URL of
#   the page to fetch.
# @return [nil]
def scrape(row)
  puts '_________start_________'
  mechanize = Mechanize.new
  page = mechanize.get('http://www.aircraftbluebook.com/Tools/HVR/Calculate.do')

  # Find the login form and add credentials.
  # NOTE(review): 'USERNAME' / 'PASSWORD' look like placeholders to be
  # replaced with real credentials -- confirm before running.
  form = page.form
  form.field_with(name: 'email').value = 'USERNAME'
  form.field_with(name: 'password').value = 'PASSWORD'
  # Only the session established by the login matters; the returned page is
  # not used because we jump straight to the pre-built URL below.
  form.submit

  # For the moment just use the constructed link supplied in the input row.
  page = mechanize.get(row[2])

  # Check the wholesale radio button (the second one) and clear the first,
  # then submit the last form on the page.
  options_form = page.forms.last
  options_form.radiobuttons[1].checked = true
  options_form.radiobuttons[0].checked = false
  page = options_form.submit

  # Use Nokogiri to pick the value table out of the response body.
  doc = Nokogiri::HTML(page.body)
  rows = doc.css('table.idp-table').css('tr')

  # The first <tr> carries the table title, the second the column names.
  # shift returns nil when the set is empty, so guard against a missing
  # table instead of failing with NoMethodError on nil.
  title_row = rows.shift
  header_row = rows.shift
  unless title_row && header_row
    warn "No value table found for #{row[2]}; skipping"
    return
  end

  title = title_row.css('th').map(&:text)
  column_names = header_row.css('th').map(&:text)

  # Every remaining <tr> becomes an array with the text of its <td> cells.
  text_all_rows = rows.map { |tr| tr.css('td').map(&:text) }

  puts title
  puts column_names
  puts text_all_rows

  # Append the table to the CSV, preceded by a line identifying the aircraft.
  CSV.open("#{title[0]}.csv", 'a') do |csv|
    csv << [row[0], row[1]]
    csv << column_names
    text_all_rows.each { |values| csv << values }
  end
  puts '_________end_________'
end
# def plane
# product = "HVR"
# type_id = 7
# type = "Business+Jet"
# make_id = 12
# make = "Bombardier"
# model_id = 741
# model = "BOMBARDIER+GLOBAL+6000"
# model_instance_id = 6512
# model_instance = 'Global%206000'
# "http://www.aircraftbluebook.com/Navigation.do?product=#{product}&typeId=#{type_id}&type=#{type}&makeId=#{make_id}&make=#{make}&modelId=#{model_id}&model=#{model}&modelInstanceId=#{model_instance_id}&modelInstance=#{model_instance}"
# 'http://www.aircraftbluebook.com/Navigation.do?product=HVR&typeId=7&type=Business+Jet&makeId=12&make=Bombardier&modelId=741&model=BOMBARDIER+GLOBAL+6000&modelInstanceId=6512&modelInstance=Global%206000'
#
# end
# Reads aircraft.csv line by line, echoes each line, scrapes it, and waits
# five seconds before moving on to the next one.
def do_all
  CSV.foreach('aircraft.csv') do |aircraft|
    puts aircraft
    scrape(aircraft)
    sleep 5
  end
  puts 'ALL DONE'
end
# Script entry point: process every aircraft listed in the input CSV.
do_all
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment