Skip to content

Instantly share code, notes, and snippets.

@LitvinenkoD89
Last active July 18, 2017 13:43
Show Gist options
  • Save LitvinenkoD89/24c34dfde6f87efdf701d5f51cb2c15f to your computer and use it in GitHub Desktop.
Save LitvinenkoD89/24c34dfde6f87efdf701d5f51cb2c15f to your computer and use it in GitHub Desktop.
calvin
# CALVIN KLEIN {source_name: 'calvinklein', batch_number: 5, request_id: 5000000042, request_name: 'CALVIN KLEIN'}
# Scraper for CALVIN KLEIN store locations served by the gotlocations.com
# locator page (ck.php).
#
# Flow, as far as this script shows:
#   1. Seed one locator URL per entry yielded by `loop_states "us"`.
#   2. On each seeded page, read the action of the form named "search" and
#      queue it together with the search fields, using the entry's full name
#      as the `address` field.
#   3. On each page of type 'search', store one document per listed store
#      whose type line contains a known store-type keyword.
scraper_service.scrape do |_browser, scraper, init_vars|
  easy_seeder = Library.lib('EasySeeder')
  easy_extractor = Library.lib('EasyExtractor')
  # NOTE(review): FetcherAgent is never referenced below; the call is kept in
  # case loading the library has side effects — confirm and drop if not.
  Library.lib('FetcherAgent')

  # Ordered keyword => canonical type label. The first matching keyword wins,
  # so the order of the entries is significant.
  store_types = [
    [/FLAGSHIP/i,  'CALVIN KLEIN FLAGSHIP STORE'],
    [/RETAIL/i,    'CALVIN KLEIN RETAIL STORE'],
    [/OUTLET/i,    'CALVIN KLEIN OUTLET STORE'],
    [/ACCESSORY/i, 'CALVIN KLEIN ACCESSORY STORE'],
    [/SPECIALTY/i, 'CALVIN KLEIN SPECIALTY STORE'],
    [/SHOPPING/i,  'CALVIN KLEIN SHOPPING MALL STORE']
  ]

  easy_seeder.seed(source_name: init_vars[:source_name], with_priority: true) do
    loop_states 'us', radius: 10 do |short_code, full_name|
      # The same locator page is queued once per entry; only the "#<short_code>"
      # fragment differs (presumably to keep the queued URLs distinct — confirm).
      queue_url "https://secure.gotwww.com/gotlocations.com/ck.com/ck.php##{short_code}",
                short_code: short_code, full_name: full_name, priority: 0
    end
  end

  easy_extractor.extract(
    source_name: init_vars[:source_name],
    scraper: scraper,
    batch_number: init_vars[:batch_number],
    request_id: init_vars[:request_id],
    request_name: init_vars[:request_name]
  ) do
    # Seeded locator page: resolve the search form's action against the page
    # URL and queue it with the search fields.
    # NOTE(review): nothing here tags the queued URL as page_type 'search';
    # presumably the framework routes it to the handler below — confirm.
    find_pages page_format: :html do |url, parser_page, page|
      action = URI.join(url, page.at("[name='search']").attr('action')).to_s
      seeder.queue_url action, {
        text: 2,
        address: parser_page[:full_name],
        ip_country: 'US',
        brand_array: '',
        product_categories: '',
        store_type: '',
        subcategory_array: '',
        metadata: '',
        language_user: 'en_US',
        diag: '',
        Submit: 'search'
      }
    end

    # Search results page: one `label.brandtextdisplay` per listed store.
    find_pages page_format: :html, page_type: 'search' do |_url, _parser_page, page|
      page.search('label.brandtextdisplay').each do |label|
        cell = label.parent

        # The last non-blank <br>-separated fragment of the label's parent is
        # matched against the store-type keywords; entries matching none of
        # them are skipped.
        raw_type = cell.inner_html.split('<br>').map(&:strip).select(&:present?).last
        _pattern, store_type = store_types.find { |pattern, _label| pattern =~ raw_type }
        next unless store_type

        # The address fragments live four siblings after the label's parent.
        address_lines = cell.next.next.next.next.inner_html.split('<br>')

        # Reduce the link's href to digits and ". , ( ) -", then take the
        # comma-separated pair inside the third "(" group.
        # NOTE(review): the first value is stored as longitude and the second
        # as latitude — confirm this matches the argument order in the href.
        map_href = cell.parent.next.next.at('a').attr('href')
        coordinates = map_href.gsub(/[^\d.,()\-]/, '').split('(')[2].split(')')[0]
        longitude, latitude = coordinates.split(',')

        street1 = address_lines[0].delete("\t\r\n")
        street2 = address_lines[1]
        # Third fragment is split as "<city>,<state> <zip>".
        city, state_and_zip = address_lines[2].split(',')
        state, zip_code = state_and_zip.split(' ')

        # store_id, property_id, map_link and location_url are not populated
        # by this scraper.
        store_doc({
          brand: 'CALVIN KLEIN',
          type: store_type,
          name: label.text, # required
          address_1: street1,
          address_2: street2,
          city: city,
          state: state,
          zipcode: zip_code,
          country: '',
          lat: latitude,
          long: longitude,
          flags: {}
        })
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment