Created
April 9, 2019 13:58
-
-
Save usmanasif/35f88bc91e6d5bbff8f213834b87f3df to your computer and use it in GitHub Desktop.
Scraper for Edmunds Site
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "selenium-webdriver" | |
require 'write_xlsx' | |
namespace :scraping do | |
# scraping:fetch
#
# Walks every year / make / model / trim / engine / transmission combination
# offered by the dropdowns on the Edmunds maintenance guide page, submits each
# combination once per mileage in +mileages+, and records the results in the
# workbook returned by create_worksheet: one row per vehicle on the second
# sheet and one row per scraped service line on the first sheet.
desc "Fetch all the data"
task fetch: :environment do
  guide_url = "https://www.edmunds.com/car-maintenance/guide-page.html"
  zip_code = "95131"
  years = (1990..2020)
  mileages = [5_000, 10_000, 15_000, 25_000, 50_000, 75_000, 100_000, 200_000, 300_000]
  row_index = 1   # row 0 of the services sheet holds the headers
  vehicle_id = 0
  workbook, worksheet, worksheet2 = create_worksheet
  driver = nil
  begin
    # The driver is created inside the protected region so that a failure to
    # start Chrome still closes the workbook.
    options = Selenium::WebDriver::Chrome::Options.new(args: ['headless', '--blink-settings=imagesEnabled=false'])
    driver = Selenium::WebDriver.for :chrome, options: options
    driver.navigate.to guide_url
    years.each do |year|
      wait_for_element("year", driver)
      reload_opts(driver, year: year)
      wait_for_element("makes", driver)
      sleep 1
      # Array(...) guards against a set_* helper handing back nil for a
      # dropdown that has no options at all.
      Array(set_make(driver)).each do |make_option|
        reload_opts(driver, make_option: make_option, year: year)
        wait_for_element("models", driver)
        Array(set_model(driver)).each do |models_option|
          reload_opts(driver, models_option: models_option, make_option: make_option, year: year)
          wait_for_element("styles", driver)
          Array(set_trim(driver)).each do |trim_option|
            reload_opts(driver, trim_option: trim_option, models_option: models_option,
                                make_option: make_option, year: year)
            wait_for_element("engines", driver)
            Array(set_engine(driver)).each do |engine_option|
              reload_opts(driver, engine_option: engine_option, trim_option: trim_option,
                                  models_option: models_option, make_option: make_option,
                                  year: year)
              wait_for_element("transmissions", driver)
              Array(set_transmission(driver)).each do |transmission_option|
                vehicle_id += 1
                insert_vehicle_info(vehicle_id, year, make_option, models_option,
                                    trim_option, engine_option, transmission_option, worksheet2)
                sleep 2
                selection = {
                  transmission_option: transmission_option, engine_option: engine_option,
                  trim_option: trim_option, models_option: models_option,
                  make_option: make_option, year: year
                }
                mileages.each do |mileage|
                  # The page is reloaded after every submission, so the whole
                  # selection is re-applied here. This is the only place it
                  # needs to happen: re-applying it once more right before the
                  # loop would just repeat the first iteration's work.
                  reload_opts(driver, **selection)
                  submit_maintenance_form(driver, mileage, zip_code)
                  extract_service_rows(driver.page_source).each do |labor_action, item, labor_cost, part_cost|
                    insert_services_info(vehicle_id, mileage, labor_action, item,
                                         labor_cost, part_cost, row_index, worksheet)
                    row_index += 1
                  end
                  driver.navigate.to guide_url
                end
              end
            end
          end
        end
      end
    end
  ensure
    # Shut the browser down first, but never let a failing quit prevent the
    # workbook from being closed.
    begin
      driver.quit if driver
    ensure
      workbook.close
    end
  end
end

# Fills in the mileage and zip fields of the guide form, submits it, and
# pauses two seconds before the caller reads the resulting page.
def submit_maintenance_form(driver, mileage, zip_code)
  driver.find_element(name: 'mileage').send_keys(mileage.to_s)
  driver.find_element(name: 'zip').send_keys(zip_code)
  driver.find_element(name: 'Go').submit
  sleep 2
end

# Parses a result page and returns one array per service row:
#   [labor_action, item, labor_cost, part_cost]
#
# Rows are the <ul> elements under ".item-list.three-column" whose class is
# neither "head" nor "last". part_cost is looked up only for rows whose labor
# action mentions "Replace" or "Change": it is the "li.right" text of the first
# row in the first ".item-list.two-column" block whose first cell equals the
# item name. It is nil when no such row (or no such block) exists.
def extract_service_rows(page_source)
  html = Nokogiri::HTML(page_source)
  skipped_classes = %w[head last]
  data_rows = ->(container) { container.css("ul").reject { |ul| skipped_classes.include?(ul["class"]) } }
  clean = ->(node) { node.text.delete("?").strip }

  # Build the item => price lookup once per page instead of once per row.
  prices = {}
  price_block = html.css(".item-list.two-column").first
  if price_block
    data_rows.call(price_block).each do |ul|
      name_cell = ul.at_css("li")
      next unless name_cell
      key = clean.call(name_cell)
      # First match wins, mirroring a linear search from the top.
      prices[key] = ul.css("li.right").text unless prices.key?(key)
    end
  end

  data_rows.call(html.css(".item-list.three-column")).map do |ul|
    labor_action, item, labor_cost = ul.css("li").map(&clean)
    priced = labor_action.to_s.match?(/Replace|Change/)
    [labor_action, item, labor_cost, priced ? prices[item] : nil]
  end
end
# Writes one vehicle record to the vehicles sheet.
#
# The record goes into row +vehicle_id+, with the values placed in columns
# 0..6 in this order: vehicle_id, year, make, models, trim, engine,
# transmission.
#
# worksheet2 - any object that responds to write(row, column, value).
def insert_vehicle_info(vehicle_id, year, make, models,
                        trim, engine, transmission, worksheet2)
  [vehicle_id, year, make, models, trim, engine, transmission].each_with_index do |value, column|
    worksheet2.write(vehicle_id, column, value)
  end
end
# Writes one service line to the services sheet.
#
# The values go into row +row_index+, columns 0..5, in this order:
# vehicle_id, mileage, labor_action, item, labor_cost, part_cost.
#
# worksheet - any object that responds to write(row, column, value).
def insert_services_info(vehicle_id, mileage, labor_action, item,
                         labor_cost, part_cost, row_index, worksheet)
  [vehicle_id, mileage, labor_action, item, labor_cost, part_cost].each_with_index do |value, column|
    worksheet.write(row_index, column, value)
  end
end
# Creates the output workbook with two sheets and writes their header rows.
#
# path - where the workbook is written; defaults to './ruby4.xlsx'.
#
# Returns [workbook, worksheet, worksheet2], where +worksheet+ (added first)
# carries the service-line headers and +worksheet2+ (added second) carries the
# vehicle headers. Closing the workbook is left to the caller.
def create_worksheet(path = './ruby4.xlsx')
  workbook = WriteXLSX.new(path)
  worksheet = workbook.add_worksheet
  worksheet2 = workbook.add_worksheet

  service_headers = ["Vehicle ID", "Service Interval", "Labor Action", "Item", "Labor Cost", "Part Cost"]
  vehicle_headers = ["Vehicle_Id", "year", "make", "model", "trim", "engine", "transmission"]

  service_headers.each_with_index { |title, column| worksheet.write(0, column, title) }
  vehicle_headers.each_with_index { |title, column| worksheet2.write(0, column, title) }

  [workbook, worksheet, worksheet2]
end
# Blocks until the element whose name attribute is +element+ reports itself
# as enabled, using Selenium::WebDriver::Wait.
#
# element - value of the element's name attribute (e.g. "makes").
# driver  - the Selenium driver to query.
# timeout - seconds handed to Selenium::WebDriver::Wait; defaults to 30.
#
# Returns whatever Wait#until returns; any error Wait#until raises when the
# timeout elapses propagates to the caller.
def wait_for_element(element, driver, timeout: 30)
  wait = Selenium::WebDriver::Wait.new(timeout: timeout)
  wait.until { driver.find_element(:name, element).enabled? }
end
# Re-applies a (partial) vehicle selection to the guide form.
#
# Only the options that are passed (truthy) are applied, always in dropdown
# order: year, make, model, trim, engine, transmission.
#
# - year: sleeps 5 seconds, then calls set_year.
# - every other option: waits for its dropdown via wait_for_element, sleeps
#   1 second, then calls the matching set_* helper with the option.
def reload_opts(driver, transmission_option: nil, engine_option: nil,
                trim_option: nil, models_option: nil,
                make_option: nil, year: nil)
  if year
    sleep 5
    set_year(driver, year)
  end

  # [dropdown name, requested option, setter] in the order they are applied.
  steps = [
    ["makes",         make_option,         :set_make],
    ["models",        models_option,       :set_model],
    ["styles",        trim_option,         :set_trim],
    ["engines",       engine_option,       :set_engine],
    ["transmissions", transmission_option, :set_transmission]
  ]

  steps.each do |field_name, option, setter|
    next unless option
    wait_for_element(field_name, driver)
    sleep 1
    send(setter, driver, option)
  end
end
# Works on the dropdown whose name attribute is "transmissions".
#
# When +transmission_option+ is given (not nil and not blank) the entry with
# that visible text is selected. Either way, returns the visible texts of the
# dropdown's options with the first entry left out (presumably the
# placeholder entry; confirm against the page).
def set_transmission(driver, transmission_option = nil)
  dropdown = Selenium::WebDriver::Support::Select.new(driver.find_element(:name, "transmissions"))
  dropdown.select_by(:text, transmission_option) unless transmission_option.to_s.strip.empty?
  # drop(1) instead of [1..-1]: an empty option list yields [] rather than
  # nil, so callers can always iterate the result.
  dropdown.options.map(&:text).drop(1)
end
# Works on the dropdown whose name attribute is "engines".
#
# When +engine_option+ is given (not nil and not blank) the entry with that
# visible text is selected. Either way, returns the visible texts of the
# dropdown's options with the first entry left out (presumably the
# placeholder entry; confirm against the page).
def set_engine(driver, engine_option = nil)
  dropdown = Selenium::WebDriver::Support::Select.new(driver.find_element(:name, "engines"))
  dropdown.select_by(:text, engine_option) unless engine_option.to_s.strip.empty?
  # drop(1) instead of [1..-1]: an empty option list yields [] rather than
  # nil, so callers can always iterate the result.
  dropdown.options.map(&:text).drop(1)
end
# Works on the dropdown whose name attribute is "styles".
#
# When +trim_option+ is given (not nil and not blank) the entry with that
# visible text is selected. Either way, returns the visible texts of the
# dropdown's options with the first entry left out (presumably the
# placeholder entry; confirm against the page).
def set_trim(driver, trim_option = nil)
  dropdown = Selenium::WebDriver::Support::Select.new(driver.find_element(:name, "styles"))
  dropdown.select_by(:text, trim_option) unless trim_option.to_s.strip.empty?
  # drop(1) instead of [1..-1]: an empty option list yields [] rather than
  # nil, so callers can always iterate the result.
  dropdown.options.map(&:text).drop(1)
end
# Works on the dropdown whose name attribute is "models".
#
# When +models_option+ is given (not nil and not blank) the entry with that
# visible text is selected. Either way, returns the visible texts of the
# dropdown's options with the first entry left out (presumably the
# placeholder entry; confirm against the page).
def set_model(driver, models_option = nil)
  dropdown = Selenium::WebDriver::Support::Select.new(driver.find_element(:name, "models"))
  dropdown.select_by(:text, models_option) unless models_option.to_s.strip.empty?
  # drop(1) instead of [1..-1]: an empty option list yields [] rather than
  # nil, so callers can always iterate the result.
  dropdown.options.map(&:text).drop(1)
end
# Works on the dropdown whose name attribute is "makes".
#
# When +make_option+ is given (not nil and not blank) the entry with that
# visible text is selected. Either way, returns the visible texts of the
# dropdown's options with the first entry left out (presumably the
# placeholder entry; confirm against the page).
def set_make(driver, make_option = nil)
  dropdown = Selenium::WebDriver::Support::Select.new(driver.find_element(:name, "makes"))
  dropdown.select_by(:text, make_option) unless make_option.to_s.strip.empty?
  # drop(1) instead of [1..-1]: an empty option list yields [] rather than
  # nil, so callers can always iterate the result.
  dropdown.options.map(&:text).drop(1)
end
# Selects +year+ in the dropdown whose name attribute is "year".
#
# year - the year to pick; it is converted with to_s and matched against the
#        visible text of the dropdown entries.
def set_year(driver, year)
  dropdown = Selenium::WebDriver::Support::Select.new(driver.find_element(:name, "year"))
  dropdown.select_by(:text, year.to_s)
end
end |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment