Skip to content

Instantly share code, notes, and snippets.

@dstanek
Created February 12, 2019 13:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dstanek/2e3448da010eb8950f765c12c9e8c0af to your computer and use it in GitHub Desktop.
Save dstanek/2e3448da010eb8950f765c12c9e8c0af to your computer and use it in GitHub Desktop.
import csv
import sys
import urllib
import lxml.html
import requests
import requests_cache
# Install a requests_cache cache named 'cache' before any request is issued.
# NOTE(review): presumably this stores fetched pages locally so repeated runs
# do not hit the site again -- confirm the backend and expiry in the
# requests_cache documentation.
requests_cache.install_cache('cache')
# Show landing page: the first page fetched by the script, and the base
# against which every scraped href is resolved with urljoin.
BASE = 'https://www.tvfoodmaps.com/show/Diners-Drive-Ins-Dives'
def get(url, timeout=30):
    """Fetch *url* with an HTTP GET and return the response.

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait on the server before giving up.  Without
            one, ``requests`` waits indefinitely, so a single stalled
            connection would hang the whole scrape.

    Returns:
        The ``requests`` response object for a successful request.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within *timeout*.
    """
    resp = requests.get(url, timeout=timeout)
    # Turn error statuses into exceptions so callers never parse an error page.
    resp.raise_for_status()
    return resp
def xpath(data, path):
    """Parse *data* as HTML and evaluate the XPath expression *path* on it.

    Args:
        data: HTML markup accepted by ``lxml.html.fromstring``.
        path: XPath expression, evaluated from the parsed root.

    Returns:
        Whatever ``root.xpath(path)`` returns for the parsed document.
    """
    return lxml.html.fromstring(data).xpath(path)
def iter_state_urls(data):
    """Yield the ``href`` of every per-state link found in *data*.

    A link is treated as a state link when its ``title`` attribute contains
    the text ``'Dives in '``.  The hrefs are yielded exactly as they appear
    in the markup; no URL resolution is done here.

    Args:
        data: HTML markup of the page to scan.

    Yields:
        The ``href`` attribute value of each matching ``<a>`` element.

    Raises:
        KeyError: A matching ``<a>`` element has no ``href`` attribute.
    """
    # xpath() parses the document itself, so no separate parse is needed
    # here (the earlier version built a second, unused tree).
    for a in xpath(data, '//a'):
        # Anchors without a title fall back to '' and never match.
        if 'Dives in ' in a.attrib.get('title', ''):
            yield a.attrib['href']
def iter_dives_in_a_state(state_url):
    """Yield one parsed record per restaurant listed for a state.

    The state page is fetched first; if it carries a link under
    ``div#paging-info/h4`` that link is followed and its page is parsed
    instead.  Otherwise the state page itself is parsed.

    Args:
        state_url: Absolute URL of the state's listing page.

    Yields:
        The dict produced by ``parse_dive`` for every
        ``div.inner-results`` element on the parsed page.
    """
    # Imported explicitly: a bare ``import urllib`` does not load the
    # ``parse`` submodule on its own, so relying on it only works when some
    # other package happens to have imported it already.
    from urllib.parse import urljoin

    resp = get(state_url)
    viewall_links = xpath(resp.content, '//div[@id="paging-info"]/h4/a')
    if viewall_links:
        viewall_url = urljoin(BASE, viewall_links[0].attrib['href'])
        resp = get(viewall_url)
    # With no such link, indexing [0] used to abort the whole run with an
    # IndexError.  NOTE(review): presumably a page without the link already
    # shows every result -- confirm against the live site.
    for div in xpath(resp.content, '//div[@class="inner-results"]'):
        yield parse_dive(div)
def parse_dive(div):
    """Extract one restaurant record from a search-result element.

    Args:
        div: Element exposing ``xpath()``; each matched node must expose
            ``.text`` and ``.attrib``.

    Returns:
        A dict with the keys ``name``, ``address``, ``image`` and ``desc``.
        ``image`` has any query string removed, ``address`` is cut at the
        first ``' ('``, and ``desc`` is ``''`` when the element has no
        ``p/i`` child.

    Raises:
        IndexError: The name link, image or address paragraph is missing.
    """
    name = div.xpath('h3/a')[0].text

    # Keep only the part of the image URL before any query string.
    image = div.xpath('div[@class="pull-left"]/img')[0].attrib['src']
    image = image.split('?', 1)[0]

    # ``.text`` is None when an element has no leading text; treat that as
    # empty so the substring handling below cannot raise TypeError.
    address = div.xpath('p[@class="searchResAddress"]')[0].text or ''
    # Drop the parenthesised suffix that follows the street address.
    address = address.split(' (', 1)[0]

    desc = ''
    desc_nodes = div.xpath('p/i')
    if desc_nodes:
        # Same None guard as above; non-breaking spaces become plain spaces.
        desc = (desc_nodes[0].text or '').strip().replace('\xa0', ' ')

    return {'name': name, 'address': address, 'image': image, 'desc': desc}
# Script driver: write a CSV to stdout with one row per restaurant, visiting
# every state linked from the show's landing page.
FIELDS = ['name', 'address', 'image', 'desc']

writer = csv.writer(sys.stdout)
writer.writerow(FIELDS)  # header row; column names double as record keys

resp = get(BASE)
for state_url in iter_state_urls(resp.content):
    # Scraped hrefs are resolved against the landing page before fetching.
    state_url = urllib.parse.urljoin(BASE, state_url)
    for dive in iter_dives_in_a_state(state_url):
        writer.writerow([dive[field] for field in FIELDS])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment