Skip to content

Instantly share code, notes, and snippets.

@LitvinenkoD89
Last active July 17, 2017 14:34
Show Gist options
  • Save LitvinenkoD89/4387f1f8867ddd48fb51b6d13b5ba8b2 to your computer and use it in GitHub Desktop.
Save LitvinenkoD89/4387f1f8867ddd48fb51b6d13b5ba8b2 to your computer and use it in GitHub Desktop.
clarksusa
# Scrapes Clarks USA store locations in two phases:
#
# 1. Seed: queue the store-locator page once per US postal code yielded by
#    `loop_postal_codes`, carrying the zip/lat/long along as page vars.
# 2. Extract:
#    a. For each fetched locator page, read the search form's action URL and
#       CSRF token and queue a POST search (page_type 'search').
#    b. For each search result page, parse the JSON held in the `data-stores`
#       attribute and emit one document per store via `store_doc`.
#
# `Library`, `loop_postal_codes`, `queue_url`, `find_pages`, `seeder` and
# `store_doc` are provided by the hosting scraper framework (not defined here).
scraper_service.scrape do |browser, scraper, init_vars|
  easy_seeder = Library.lib('EasySeeder')
  easy_extractor = Library.lib('EasyExtractor')
  fetcher_agent = Library.lib('FetcherAgent')

  easy_seeder.seed(source_name: init_vars[:source_name], with_priority: true) do
    loop_postal_codes "us", radius: 10 do |zip, lat, long|
      queue_url "http://www.clarksusa.com/us/store-locator##{zip}", zip: zip, lat: lat, long: long, priority: 0
    end
  end

  easy_extractor.extract(
    source_name: init_vars[:source_name],
    scraper: scraper,
    batch_number: init_vars[:batch_number],
    request_id: init_vars[:request_id],
    request_name: init_vars[:request_name],
  ) do
    # Locator landing page: build the POST search request for this postal code.
    find_pages page_format: :html do |url, parser_page, page|
      action = URI.join(url, page.at('#storeSearchForm').attr('action')).to_s
      seeder.queue_url action, {
        method: :post,
        page_type: 'search',
        body: {
          geocoded: parser_page[:zip],
          # The seed step queues coordinates under :lat / :long, so fall back
          # to those keys when :latitude / :longitude are not present.
          latitude: parser_page[:latitude] || parser_page[:lat],
          longitude: parser_page[:longitude] || parser_page[:long],
          tab: '',
          # NOTE(review): this was hard-coded to 79072, which made every
          # request search the same postal code; it now uses the zip this
          # page was seeded with. Confirm against the site's search form.
          q: parser_page[:zip],
          radius: 50,
          _kids: 'on',
          CSRFToken: page.at("#storeSearchForm input[name=CSRFToken]")['value']
        },
        priority: 100,
        headers: fetcher_agent.get_headers(parser_page),
      }
    end

    # Search results page: one document per entry in the data-stores JSON.
    find_pages page_format: :html, page_type: 'search' do |url, parser_page, page|
      # Parentheses are required: without them `.each` is sent to the raw
      # attribute string and the do-block binds to JSON.parse instead.
      stores = JSON.parse(page.at('[data-stores]').attr('data-stores'))

      stores.each do |el|
        id = el['id']
        name = el['name']
        latitude = el['latitude']
        longitude = el['longitude']

        # Markup for this particular store (looked up by its own id rather
        # than a fixed store code).
        store_node = page.at("[data-code='#{id}']")
        location_url = URI.join(url, store_node.search('li.details').search('a').attribute('href').value).to_s

        if name =~ /outlet/i
          t = 'CLARKS BOSTONIAN OUTLET'
        else
          t = 'CLARKS'
        end

        # The last address line is split as "City, ST 12345"; the preceding
        # lines are taken as the street address.
        address_lines = store_node.search('.address-line').map(&:text)
        city_line = address_lines.pop
        # `to_s` keeps a missing line or a line without a comma from raising;
        # the affected fields simply end up nil.
        city, state_zip = city_line.to_s.split(',', 2).map(&:strip)
        state, zip = state_zip.to_s.split(/\s+/, 2)
        street1, street2 = address_lines
        country = ''

        store_doc({
          # store_id: id,
          brand: 'CLARK',
          type: t,
          property_id: id,
          name: name, # required
          address_1: street1,
          address_2: street2,
          city: city,
          state: state,
          zipcode: zip,
          country: country,
          lat: latitude,
          long: longitude,
          # map_link: page.at('a.map-it').attr('href'),
          # address_container_html: page.at('.rio-address-section').to_html,
          location_url: location_url,
          flags: {
          }
        })
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment