Created
April 10, 2016 17:14
-
-
Save gen1321/3dde82853990c2423f5c809ba235bd2b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#require 'rubygems' | |
#require 'Celerity' | |
require 'nokogiri' | |
require 'watir-webdriver' | |
require "open-uri" | |
#b.goto 'bit.ly/watir-webdriver-demo' | |
#b.text_field(:id => 'entry_1000000').set 'your name' | |
#b.select_list(:id => 'entry_1000001').select 'Ruby' | |
#b.select_list(:id => 'entry_1000001').selected? 'Ruby' | |
#b.button(:name => 'submit').click | |
#b.text.include? 'Thank you' | |
#puts "New attempt3" | |
#exit(1) | |
######################################################################################################################## | |
#ahmed@fetchopia.com | |
#Date: 30/05/2013
#Description:
#Navigates the BetterBidding forums to find the Priceline hotel list for each state, or group of states. Once navigation
#to a hotel list is complete, the page is scraped. Regions are links found on the left-hand side of the webpage. Zones
#and hotels are found on the right-hand side and are stored as images. They are produced by AJAX requests, so you must
#ensure the browser has loaded the images before saving them.
# Makes +text+ safe to embed in a file name by replacing every path separator
# (together with any whitespace around it) with "_", e.g.
# "North Dakota / South Dakota" becomes "North Dakota_South Dakota".
#
# @param text [#to_s] raw board, state or region name
# @return [String] the name without "/" or "\" characters
def file_name_part(text)
  text.to_s.gsub(%r{\s*[/\\]\s*}, "_")
end

# Downloads +img_src+ and stores the bytes, unmodified, at +path+.
#
# URI.open is used because a bare Kernel#open no longer fetches URLs on
# Ruby 3.0+, and the data is written with binwrite so that no newline is
# appended to the image.
#
# @param path [String] destination file
# @param img_src [String] image URL (a local path also works)
# @return [Integer] number of bytes written
def save(path, img_src)
  data = URI.open(img_src, "rb") { |remote| remote.read }
  File.binwrite(path, data)
end

# Saves the zone image and every hotel image of one zone and appends the
# matching <img> tags to +page+.
#
# @param browser [Watir::Browser] browser showing the hotel list
# @param zone [Watir::Anchor] zone link on the right hand side
# @param zonenum [Integer] 1-based position of the zone within its region
# @param click_to_open [Boolean] whether the zone must be clicked to load its
#   hotels (a lone zone opens by itself)
# @param out_dir [String] directory receiving the image files
# @param stem [String] "<state>_<region>" prefix shared by the file names
# @param page [IO] open HTML report
def scrape_zone(browser, zone, zonenum, click_to_open, out_dir, stem, page)
  zoneid = zone.parent.attribute_value('zone_id')
  puts "zoneid#{zoneid}"

  # The last image inside the first "con" div is saved as the zone image
  # (the image before it is transparent).
  zone_file = "#{stem}_zone_#{zonenum}.png"
  save(File.join(out_dir, zone_file), browser.div(:class, "con").images.last.src)
  page.write("<img src=\"#{zone_file}\">")
  page.write("<br/>")

  zone.click if click_to_open

  # Walk the images of this zone until the next one does not exist. The element
  # that passed the existence check is the one whose src is read.
  # NOTE(review): the block is addressed by position; an earlier draft tried to
  # address it by zone_id instead - confirm which one the page really supports.
  begin
    picnum = 1
    loop do
      hotel_img = browser.image(:xpath, "//*[@class=\"hotels_block\"][#{zonenum}]/div/div[#{picnum}]/img[2]")
      break unless hotel_img.exists?

      hotel_file = "#{stem}_zone_#{zonenum}_hotel_#{picnum}.png"
      save(File.join(out_dir, hotel_file), hotel_img.src)
      page.write("<img src=\"#{hotel_file}\">")
      page.write("<br/>")
      picnum += 1
    end
  rescue StandardError => e
    # Best effort: a broken hotel image must not stop the remaining zones.
    puts "stopped reading hotels of zone #{zonenum}: #{e.class}: #{e.message}"
  end
end

# Clicks one region, records it in +page+ and scrapes each of its zones.
#
# @param browser [Watir::Browser] browser showing the hotel list
# @param region [Watir::Anchor] region link on the left hand side
# @param out_dir [String] directory receiving the image files
# @param page [IO] open HTML report
def scrape_region(browser, region, out_dir, page)
  region.click
  region_text = region.text
  page.write("<h2>#{region_text}</h2>")
  page.write("<br/>")
  puts region_text

  zones = browser.links(:xpath => "//div[@class='con']/a")
  zone_count = zones.count
  puts "number of zones #{zone_count}"

  # Locate the state that the region lies in (a board may cover several states).
  region_state = browser.element(:xpath => "//*[@class='hotel_lists_left_box']//a[@region='#{region_text}']/preceding-sibling::span[1]").text
  puts "belongs to #{region_state}"

  stem = "#{file_name_part(region_state)}_#{file_name_part(region_text)}"
  zones.each_with_index do |zone, index|
    scrape_zone(browser, zone, index + 1, zone_count > 1, out_dir, stem, page)
  end
  page.write("<br/>")
end

# Scrapes the hotel list currently shown in +browser+.
#
# Dismisses the overlay link when it is visible, then creates the directory
# "BetterBidding-Priceline-<state>-<YYYY-MM-DD>" in the working directory and
# writes "<state>.html" into it. Zone and hotel images of every region are
# saved next to that file and referenced from it.
#
# @param browser [Watir::Browser] browser already navigated to a hotel list
# @param state [String] board name; "/" is replaced by "_" for file naming
# @return [void]
def scrape_state(browser, state)
  overlay = browser.link(:xpath, "//div[@class='overlay_link']/a")
  if overlay.visible?
    overlay.click
  else
    puts "overlay not visible"
  end

  # Regions are the links on the left hand side of the page.
  regions = browser.links(:class => "region_link")
  puts regions.count

  file_state = file_name_part(state)
  out_dir = "BetterBidding-Priceline-#{file_state}-#{Time.now.strftime('%Y-%m-%d')}"
  Dir.mkdir(out_dir) unless Dir.exist?(out_dir)

  # The block form closes the report even when scraping raises.
  File.open(File.join(out_dir, "#{file_state}.html"), "w") do |page|
    page.write("<h1>#{file_state}</h1>")
    page.write("<br/>")
    regions.each { |region| scrape_region(browser, region, out_dir, page) }
  end
rescue Watir::Exception::UnknownObjectException, Timeout::Error => e
  puts "couldn't find overlay link or page element: #{e.message}"
end
# Clicks through the BetterBidding.com boards to reach the hotel list for each
# state and extracts the zone, hotel and amenity images.
#
puts "Starting"
browser = Watir::Browser.new :chrome
#browser = Celerity::Browser.new(:resynchronize => true)
puts "Now i am here 1"
#browser.driver.manage.timeouts.implicit_wait = 5
#browser.css = false
#browser.javascript_enabled = false
home_url = "http://www.betterbidding.com/"
begin
  browser.goto(home_url)
  puts "Now i am here"
  # 1 Parse HTML for board links
  #
  doc = Nokogiri::HTML(browser.html)
  boards = doc.xpath("//li[@data-categoryid='441']//div[@class='ipsDataItem_main']//h4/a")
  #boards = doc.xpath("//div[@id='categories']/div[3]//h4/a")
  boards.each do |board|
    state = board.text
    puts state
    begin
      # Start every board from the homepage so that a failure on the previous
      # board cannot leave the browser on an unrelated page.
      browser.goto(home_url)
      browser.link(:text, state).click
      # 2 Check for special case, California
      #
      if state == "California"
        browser.link(:text, "Priceline - California (all other cities)").click
      else
        browser.link(:text, "Priceline - " + state).click
      end
      # 3 Click on hotel list link, relies on structure of the HTML
      #
      # Nevada is a special case because it is not the first link on the board
      #
      if state == "Nevada"
        browser.goto("http://www.betterbidding.com/index.php?app=hotel_lists&tid=85")
      else
        begin
          browser.link(:xpath, "//ol/li/div[@class='ipsDataItem_main']/a").click
        rescue StandardError => e
          # Best effort, as before: scraping is still attempted on whatever
          # page is showing, but the reason is no longer hidden.
          puts "could not open hotel list for #{state}: #{e.class}: #{e.message}"
        end
      end
      # 4 Call function to extract images
      #
      scrape_state(browser, state)
    rescue StandardError => e
      # One failing board is reported and skipped instead of silently ending
      # the whole run.
      puts "skipping #{state}: #{e.class}: #{e.message}"
    end
  end
ensure
  # Always release the browser, even when the run is interrupted.
  browser.close
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment