Skip to content

Instantly share code, notes, and snippets.

@gen1321
Created April 10, 2016 17:14
Show Gist options
  • Save gen1321/3dde82853990c2423f5c809ba235bd2b to your computer and use it in GitHub Desktop.
Save gen1321/3dde82853990c2423f5c809ba235bd2b to your computer and use it in GitHub Desktop.
#require 'rubygems'
#require 'Celerity'
require 'nokogiri'
require 'watir-webdriver'
require "open-uri"
#b.goto 'bit.ly/watir-webdriver-demo'
#b.text_field(:id => 'entry_1000000').set 'your name'
#b.select_list(:id => 'entry_1000001').select 'Ruby'
#b.select_list(:id => 'entry_1000001').selected? 'Ruby'
#b.button(:name => 'submit').click
#b.text.include? 'Thank you'
#puts "New attempt3"
#exit(1)
########################################################################################################################
#ahmed@fetchopia.com
#Date: 30/05/2013
#Description:
#Navigates the BetterBidding forums to find the Priceline hotel list for each state, or group of states. Once navigation
#to the hotel list is complete, the page is scraped. Regions are links found on the left-hand side of the webpage. Zones
#and hotels are found on the right-hand side and are stored as images. They are produced by AJAX requests, so
#you must ensure the browser has loaded the images before saving them.
# Downloads the resource at img_src and writes it, byte for byte, to path.
#
# path    - destination file path (String).
# img_src - absolute URL of the image (String); must use a scheme open-uri
#           can fetch (http, https or ftp).
def save(path, img_src)
  # URI#open (mixed in by open-uri) only performs network fetches. A bare
  # Kernel#open would also execute "|command" strings, and it no longer
  # accepts URLs at all on Ruby 3.
  URI.parse(img_src).open do |remote|
    # write, not puts: puts appends a newline, which corrupts binary data.
    File.open(path, "wb") { |file| file.write(remote.read) }
  end
end

# Saves every hotel/amenity image currently shown for one zone and appends a
# matching <img> tag to the HTML report for each of them.
#
# browser   - browser object positioned on the hotel list page.
# html      - open, writable IO for the HTML report.
# out_dir   - directory the image files are written to.
# base_name - file name prefix ("<state>_<region>_zone_<n>").
# zone_num  - 1-based position of the zone, used to pick its hotels_block.
#
# Failures are reported on stderr and otherwise ignored, so one broken zone
# does not stop the remaining zones from being processed.
def save_zone_hotels(browser, html, out_dir, base_name, zone_num)
  pic_num = 1
  loop do
    # The same locator is used for the existence check and for the download,
    # so the image that gets fetched is the one that was found.
    # NOTE(review): this assumes the hotels_block elements appear in the same
    # order as the zone links - confirm against the live page.
    xpath = '//*[@class="hotels_block"][' + zone_num.to_s + ']/div/div[' + pic_num.to_s + ']/img[2]'
    image = browser.image(:xpath, xpath)
    break unless image.exists?

    file_name = "#{base_name}_hotel_#{pic_num}.png"
    save(File.join(out_dir, file_name), image.src)
    html.write("<img src=\"#{file_name}\">")
    html.write("<br/>")
    pic_num += 1
  end
rescue StandardError => e
  warn "could not save hotel images for #{base_name}: #{e.class}: #{e.message}"
end

# Scrapes the hotel list page the browser is currently showing.
#
# Creates a directory named "BetterBidding-Priceline-<state>-<YYYY-MM-DD>" in
# the current working directory, writes "<state>.html" into it and downloads
# the zone and hotel images next to that file.
#
# browser - browser object already navigated to the hotel list page.
# state   - board name (String); any "/" is replaced by "_" so the name can
#           be used in file names.
#
# A missing page element or a timeout ends the scrape of this state with a
# message on stdout; other errors propagate to the caller.
def scrape_state(browser, state)
  overlay = browser.link(:xpath, "//div[@class='overlay_link']/a")
  if overlay.visible?
    overlay.click
  else
    puts "overlay not visible"
  end

  # Links for all regions on the left-hand side of the page.
  regions = browser.links(:class => "region_link")
  puts regions.count

  # Replace every "/" (with or without surrounding spaces) for file naming.
  state = state.gsub(%r{\s*/\s*}, "_")
  out_dir = "BetterBidding-Priceline-#{state}-#{Time.now.strftime('%Y-%m-%d')}"
  Dir.mkdir(out_dir) unless Dir.exist?(out_dir)

  # Block form closes the report even when scraping raises part-way through.
  File.open(File.join(out_dir, "#{state}.html"), "w") do |f|
    f.write("<h1>#{state}</h1>")
    f.write("<br/>")

    regions.each do |region|
      region.click
      region_name = region.text
      f.write("<h2>#{region_name}</h2>")
      f.write("<br/>")
      puts region_name

      # All zone links of the region that was just opened.
      zones = browser.links(:xpath => "//div[@class='con']/a")
      puts "number of zones #{zones.count}"

      # A board can cover several states; look up the one this region is
      # listed under and use it for the image file names.
      region_state = browser.element(:xpath => "//*[@class='hotel_lists_left_box']//a[@region='#{region_name}']/preceding-sibling::span[1]").text
      puts "belongs to #{region_state}"

      zone_num = 0
      zones.each do |zone|
        zone_num += 1
        zone_id = zone.parent.attribute_value('zone_id')
        puts "zoneid#{zone_id}"

        base_name = "#{region_state}_#{region_name}_zone_#{zone_num}"
        # The last image is saved (the first one is transparent).
        zone_src = browser.div(:class, "con").images.last.src
        save(File.join(out_dir, "#{base_name}.png"), zone_src)
        f.write("<img src=\"#{base_name}.png\">")
        f.write("<br/>")

        # A single zone opens automatically; several need a click to load.
        zone.click if zones.count > 1

        save_zone_hotels(browser, f, out_dir, base_name, zone_num)
      end
      f.write("<br/>")
    end
  end
rescue Watir::Exception::UnknownObjectException, Timeout::Error
  puts "couldn't find overlay link"
end
# Clicks through BetterBidding.com boards to reach the hotel list for each state
# and extract zone, hotel and amenity images
#
# Driver: opens BetterBidding.com in Chrome, walks every board listed on the
# home page to its Priceline hotel list and hands that page to scrape_state.
puts "Starting"
browser = Watir::Browser.new :chrome
#browser = Celerity::Browser.new(:resynchronize => true)
puts "Now i am here 1"
begin
  browser.goto("http://www.betterbidding.com/")
  puts "Now i am here"

  # 1 Parse HTML for board links
  #
  doc = Nokogiri::HTML(browser.html)
  boards = doc.xpath("//li[@data-categoryid='441']//div[@class='ipsDataItem_main']//h4/a")
  #boards = doc.xpath("//div[@id='categories']/div[3]//h4/a")

  begin
    boards.each do |board|
      state = board.text
      puts state
      browser.link(:text, state).click

      # 2 Check for special case, California
      #
      if state == "California"
        browser.link(:text, "Priceline - California (all other cities)").click
      else
        browser.link(:text, "Priceline - " + state).click
      end

      # 3 Click on hotel list link, relies on structure of the HTML
      #
      # Nevada is a special case because it is not the first link on the board
      #
      if state == "Nevada"
        browser.goto("http://www.betterbidding.com/index.php?app=hotel_lists&tid=85")
      else
        begin
          browser.link(:xpath, "//ol/li/div[@class='ipsDataItem_main']/a").click
        rescue StandardError => e
          # Best effort: carry on with whatever page is showing, but say so.
          warn "could not open hotel list for #{state}: #{e.class}: #{e.message}"
        end
      end

      # 4 Call function to extract images
      #
      scrape_state(browser, state)

      # 5 return to homepage so next board can be chosen
      #
      browser.goto("http://www.betterbidding.com/")
    end
  rescue StandardError => e
    # The run still stops at the first failing board, but no longer silently.
    warn "scrape aborted: #{e.class}: #{e.message}"
  end
ensure
  # Always shut the browser down so no driver process is left running.
  browser.close
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment