Skip to content

Instantly share code, notes, and snippets.

@tazjel
Forked from MarkDunne/scraper.py
Created September 13, 2017 20:24
Show Gist options
  • Save tazjel/1a7e0e97387cf5108d74577b3da98377 to your computer and use it in GitHub Desktop.
Save tazjel/1a7e0e97387cf5108d74577b3da98377 to your computer and use it in GitHub Desktop.
Scraper for food hygiene dataset
import requests
import pandas as pd
from bs4 import BeautifulSoup
# Business-type ids mapped to their human-readable labels. The id is what gets
# substituted into the ``{category}`` slot of ``base_url``; the label is what
# ends up in the output rows.
categories = {
    '7846': 'Mobile caterer',
    '7838': 'Farmers/growers',
    '14': 'Importers/Exporters',
    '7843': 'Pub/bar/nightclub',
    '4613': 'Retailers - other',
    '1': 'Restaurant/Cafe/Canteen',
    '7839': 'Manufacturers/packers',
    '7844': 'Takeaway/sandwich shop',
    '7': 'Distributors/Transporters',
    '7841': 'Other catering premises',
    '7845': 'School/college/university',
    '5': 'Hospitals/Childcare/Caring Premises',
    '7842': 'Hotel/bed & breakfast/guest house',
    '7840': 'Retailers - supermarkets/hypermarkets',
}

# Search URL template with two ``str.format`` fields: ``{category}`` (a key of
# ``categories``) and ``{page}`` (the page index).
# NOTE(review): the trailing ``1000`` is presumably the page size -- confirm.
base_url = (
    'http://ratings.food.gov.uk/enhanced-search/en-GB/%5E/%5E/Relevance/'
    '{category}/%5E/%5E/0/{page}/1000'
)
def _parse_result_row(row, category):
    """Build one output record from a ``ResultRow`` div of a results page.

    Args:
        row: The BeautifulSoup tag for a single ``div.ResultRow`` element.
        category: Human-readable category label to attach to the record.

    Returns:
        A dict with the keys ``category``, ``name``, ``result_id``,
        ``latitude``, ``longitude``, ``address``, ``postcode``, ``rating``
        and ``date``. All scraped values are kept as strings.
    """
    return {
        'category': category,
        'name': row.find('div', class_='ResultsBusinessName').text.strip(),
        'result_id': row.find('input', class_='ResultsFHRSID')['value'],
        'latitude': row.find('input', class_='ResultsLatitude')['value'],
        'longitude': row.find('input', class_='ResultsLongitude')['value'],
        'address': row.find('div', class_='ResultsBusinessAddress').text.strip(),
        'postcode': row.find('div', class_='ResultsBusinessPostcode').text.strip(),
        # The rating is read from the alt text of the rating image.
        'rating': row.find('div', class_='ratingColumnPadding').img['alt'],
        'date': row.find('div', class_='ResultsRatingDate').text.strip(),
    }


# Walk every category page by page, collecting one dict per result row.
results = []
for i, (cat_id, category) in enumerate(categories.items()):
    category_progress = '({x}/{y})'.format(x=i + 1, y=len(categories))
    page = 0
    while True:
        # Keep the URL in a name so the error branch can report it; the
        # previous code printed an undefined ``url`` and died with NameError.
        url = base_url.format(category=cat_id, page=page)
        response = requests.get(url)
        if not response.ok:
            # Give up on this category but carry on with the next one.
            print(' '.join(['Error', url]))
            break

        soup = BeautifulSoup(response.content, 'html5lib')

        # ``find`` returns None when the element is absent; fall back to an
        # empty summary instead of crashing and losing the rows gathered so far.
        total_tag = soup.find('div', id='pagingTotal')
        paging_total = total_tag.text if total_tag is not None else ''
        # Single-argument print() gives the same space-separated output on
        # both Python 2 and Python 3; split/join collapses whitespace runs.
        print(' '.join([
            'Category',
            category,
            category_progress,
            ' '.join(paging_total.split()),
        ]))

        results.extend(
            _parse_result_row(row, category)
            for row in soup.find_all('div', class_='ResultRow')
        )

        # Stop when the "next" pager button is disabled. A missing button is
        # treated the same way so pagination cannot fail with AttributeError.
        next_button = soup.find('input', id='SearchResults_uxPagerNext')
        if next_button is None or next_button.get('disabled') == 'disabled':
            break
        page += 1

# Drop exact duplicate rows across all scraped pages, then write the CSV.
dataset = pd.DataFrame.from_dict(results).drop_duplicates()
dataset.to_csv('dataset.csv', index=False, encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment