Skip to content

Instantly share code, notes, and snippets.

@LitvinenkoD89
Last active July 21, 2017 07:45
Show Gist options
  • Save LitvinenkoD89/564807c673f166457077a49860a1f5f3 to your computer and use it in GitHub Desktop.
Save LitvinenkoD89/564807c673f166457077a49860a1f5f3 to your computer and use it in GitHub Desktop.
booksamillion
# BOOKS A MILLION {source_name: 'booksamillion', batch_number: 5, request_id: 5000000041, request_name: 'BOOKS A MILLION'}
# Store-locator scraper for the BAMStoreFinder pages on bullseyelocations.com.
#
# Flow, as wired up below:
#   1. Seed one BAMStoreFinder URL per US postal code (radius: 10). The zip is
#      appended as a URL fragment and is also passed along with lat/long.
#   2. For every fetched finder page, read the `action` of `#form1` and queue a
#      POST of the search form (page_type 'search') for that page's zip.
#   3. For every search result page, store one document per `.resultsDetails`
#      element found on the page.
scraper_service.scrape do |_browser, scraper, init_vars|
  easy_seeder = Library.lib('EasySeeder')
  easy_extractor = Library.lib('EasyExtractor')
  fetcher_agent = Library.lib('FetcherAgent')

  # Step 1: one finder URL per postal code. The "##{zip}" part is a literal '#'
  # followed by the interpolated zip, i.e. the zip ends up in the URL fragment.
  easy_seeder.seed(source_name: init_vars[:source_name], with_priority: true) do
    loop_postal_codes 'us', radius: 10 do |zip, lat, long|
      queue_url "http://www.bullseyelocations.com/pages/BAMStoreFinder?f=1##{zip}",
                zip: zip, lat: lat, long: long, priority: 0
    end
  end

  easy_extractor.extract(
    source_name: init_vars[:source_name],
    scraper: scraper,
    batch_number: init_vars[:batch_number],
    request_id: init_vars[:request_id],
    request_name: init_vars[:request_name]
  ) do
    # Step 2: turn each finder page into a POST of its search form.
    find_pages page_format: :html do |url, parser_page, page|
      # Resolve the form's (possibly relative) action against the page URL.
      # Raises NoMethodError if the page has no `#form1` element.
      action = URI.join(url, page.at('#form1').attr('action')).to_s
      zip = parser_page[:zip]

      # Field names below are posted verbatim; everything except zip/lat/long is
      # hard-coded (presumably copied from a recorded browser request - TODO confirm).
      # NOTE(review): hfSearch hard-codes LanguageCode=ru although the seed covers
      # US postal codes - confirm this is intended.
      seeder.queue_url action, {
        method: :post,
        page_type: 'search',
        body: {
          '__EVENTTARGET' => '',
          '__EVENTARGUMENT' => '',
          '__LASTFOCUS' => '',
          '__SCROLLPOSITIONX' => 0,
          '__SCROLLPOSITIONY' => 0,
          'ctl00$ContentPlaceHolder1$txtCityStateZip' => zip,
          'ctl00$ToolkitScriptManager1' => 'ctl00$ContentPlaceHolder1$upLocator|ctl00$ContentPlaceHolder1$searchButton',
          'ctl00$ContentPlaceHolder1$hfLocation' => "#{zip}&country=United States",
          'ctl00$ContentPlaceHolder1$hfOrigCoords' => "#{parser_page[:lat]},#{parser_page[:long]}",
          'ctl00$ContentPlaceHolder1$hfMobile' => 'false',
          'ctl00$ContentPlaceHolder1$hfSearch' => "&PostalCode=#{zip}&CountryId=1&CategoryIds=87387,87389,87388&Radius=50&InterfaceID=5308&FindNearestForNoResults=true&GetHoursForUpcomingWeek=true&LanguageCode=ru",
          'ctl00$ContentPlaceHolder1$hfNext' => '',
          'ctl00$ContentPlaceHolder1$hfPrev' => '',
          'ctl00$ContentPlaceHolder1$searchButton' => 'Search'
        },
        priority: 100,
        headers: fetcher_agent.get_headers(parser_page)
      }
    end

    # Step 3: one stored document per `.resultsDetails` element.
    find_pages page_format: :html, page_type: 'search' do |_url, _parser_page, page|
      page.search('.resultsDetails').each do |el|
        name = el.css('[itemprop="name"]').children.text

        # Text that the flags below are matched against: taken from the node six
        # siblings after the result element, fourth child. This is purely
        # positional and raises NoMethodError if the markup is shorter.
        # NOTE(review): fragile - confirm against the current page layout.
        flag_node = 6.times.inject(el) { |node, _| node.next }
        flag_text = flag_node.children[3].children.text

        # The store link is used for both the store id and the location URL.
        href = el.css('[itemprop="url"]').attribute('href').value
        id = href.split('/')[2]
        # Drop everything from '&PostalCode' onwards from the stored URL.
        location_url = "http://www.bullseyelocations.com#{href}".split('&PostalCode').first

        city = el.css('[itemprop="addressLocality"]').children.text.gsub(', ', '')
        state = el.css('[itemprop="addressRegion"]').children.text
        zip_code = el.css('[itemprop="postalCode"]').children.text
        street1 = el.css('[itemprop="streetAddress"]').children.text

        latitude = el.css('[itemprop="latitude"]').attribute('content').value
        # Longitude is cut out of the raw inner HTML of the first `geo` element:
        # third chunk after splitting on 'meta itemprop=', keeping only digits and
        # the characters , . ( ) -
        # NOTE(review): presumably the longitude <meta> cannot be selected like
        # latitude on this markup - verify before simplifying.
        geo_html = el.css('[itemprop="geo"]').first.inner_html.delete("\r\n")
        longitude = geo_html.split('meta itemprop=')[2].gsub(/[^\d,\.,\(,\,\-)]/, '')

        # Any name not containing "MILLION" (case-insensitive) is BOOKS AND CO.
        brand = name =~ /MILLION/i ? 'BOOKS A MILLION' : 'BOOKS AND CO'

        store_doc({
          store_id: id,
          brand: brand,
          type: brand,
          name: name, # required
          address_1: street1,
          # address_2: street2,
          city: city,
          state: state,
          zipcode: zip_code,
          # NOTE(review): country is stored empty although only US postal codes
          # are seeded - confirm this is intended.
          country: '',
          lat: latitude,
          long: longitude,
          # map_link: page.at('a.map-it').attr('href'),
          # address_container_html: page.at('.rio-address-section').to_html,
          location_url: location_url,
          flags: {
            'JOE MUGS' => (flag_text =~ /JOE MUGS/i).present?,
            'KIDS-A-MILLION' => (flag_text =~ /KIDS-A-MILLION/i).present?,
            'YOGURT MOUNTAIN' => (flag_text =~ /YOGURT MOUNTAIN/i).present?
          }
        })
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment