Skip to content

Instantly share code, notes, and snippets.

@LitvinenkoD89
Last active August 1, 2017 14:37
Show Gist options
  • Save LitvinenkoD89/470ed4352f8745bbbb69ffefb8f631ab to your computer and use it in GitHub Desktop.
Save LitvinenkoD89/470ed4352f8745bbbb69ffefb8f631ab to your computer and use it in GitHub Desktop.
pappas
# PAPPAS BRANDS {source_name: 'pappas', batch_number: 7, request_id: 7000000048, request_name: 'PAPPAS BRANDS'}
# Scrapes the Pappas locations in two passes:
#   1. the locations-list page: one document is stored per '.locListState'
#      element and the entry's profile URL is queued as a 'profile' page;
#   2. each 'profile' page: brand/type (derived from the <title>) and zipcode
#      are added to the document stored in pass 1.
scraper_service.scrape do |_browser, scraper, init_vars|
  easy_seeder = Library.lib('EasySeeder')
  easy_extractor = Library.lib('EasyExtractor')
  # Not referenced below; the lookup is kept in case Library.lib has loading
  # side effects -- TODO confirm and remove if it does not.
  fetcher_agent = Library.lib('FetcherAgent')

  # Brand names looked for (case-insensitively) in the profile page title.
  # Order matters: the first match wins, so 'PAPPAS BROS STEAKHOUSE' has to be
  # listed before the shorter 'PAPPAS BROS'.
  # NOTE(review): a 'YIA YIA MARYS' title used to be labelled 'PAPPAS BROS',
  # which looks like a copy-paste slip; it now gets its own label -- confirm.
  brand_names = [
    'PAPPADEAUX SEAFOOD KITCHEN',
    'PAPPASITOS CANTINA',
    'PAPPAS BROS STEAKHOUSE',
    'PAPPAS SEAFOOD HOUSE',
    'PAPPAS BAR B Q',
    'PAPPAS BURGER',
    'PAPPAS BROS',
    'YIA YIA MARYS',
    'DOT'
  ].freeze

  easy_seeder.seed(source_name: init_vars[:source_name]) do
    queue_url "http://www.pappas.com/locations-list/?msg=noaddy"
  end

  easy_extractor.extract(
    source_name: init_vars[:source_name],
    scraper: scraper,
    batch_number: init_vars[:batch_number],
    request_id: init_vars[:request_id],
    request_name: init_vars[:request_name],
  ) do
    # Pass 1: locations list.
    find_pages page_format: :html do |_url, _parser_page, page|
      page.search('.locListState').each do |store|
        # '.locListLoc' is read as <br>-separated fragments once tabs, CRs and
        # newlines are removed.
        fragments = store.at('.locListLoc').inner_html.delete("\t\r\n").split('<br>')

        fragments.pop                      # last fragment is discarded
        link = fragments.pop               # next-to-last fragment: the link markup
        location_url = link.split('"')[1]  # text between the first pair of double quotes
        id = location_url.split('id=')[1]
        fragments.pop                      # fragment formerly read as `tel` (presumably the phone); unused

        # The first remaining fragment is skipped (presumably the location
        # name -- TODO confirm); what is left is treated as the address lines.
        address_lines = fragments.drop(1)
        address_container = address_lines.join('<br>')

        street1, city_and_st = address_lines
        city, state = city_and_st.split(', ')

        doc_id = store_doc({
          store_id: id,
          # brand and type are filled in from the profile page (pass 2)
          property_id: id,
          name: 'PAPPAS BRANDS', # required
          address_1: street1,
          address_2: '',
          city: city.strip,
          state: state,
          # zipcode is filled in from the profile page (pass 2)
          country: '',
          # lat: lat,
          # long: long,
          address_container_html: address_container,
          # map_link: store.parent.parent.at('.googlemap').attr('name'),
          location_url: location_url,
          flags: {
          }
        })

        # NOTE(review): `seeder` is not defined in this file; presumably it is
        # supplied by the extractor DSL -- verify.
        seeder.queue_url location_url, {
          page_type: 'profile',
          doc_id: doc_id
        }
      end
    end

    # Pass 2: profile pages queued above.
    find_pages page_format: :html, page_type: 'profile' do |_url, parser_page, page|
      doc = find_location(parser_page[:doc_id])
      next unless doc.present?

      # The brand is looked up in the second ' - '-separated part of the title.
      # `brand` stays nil when the title has no such part or matches no name.
      title = page.at('title').text.split(' - ')[1]
      brand = brand_names.find { |name| title =~ /#{Regexp.escape(name)}/i }

      # First <br>-separated fragment of the first <p> in '.profile_location'
      # is the address; its last comma-separated piece is split on spaces and
      # the second token is taken as the zipcode.
      address = page.at('.profile_location').at('p').inner_html.split('<br>').first
      _state, zip_code = address.split(',').map(&:strip).last.split(' ')

      doc[:brand] = brand
      doc[:type] = brand
      doc[:zipcode] = zip_code
      store_doc doc
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment