Scraper for the list of Muslim-friendly restaurants on Okayama Health Tourism / Okayama City
'''
This script scrapes okayamahealthtourism.com/food/okayama-city/ and
gathers every restaurant's name and address.

The result is printed to stdout. Progress and warnings go to stderr,
so redirect stderr if you find it noisy.

The result will need some hand-picking and hand-cleaning.

Though I haven't confirmed it, this should also work for the Maniwa City page.

LICENSE (all files in this Gist): CC0 1.0 Universal
'''
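# For reference, the JSON printed to stdout has roughly this shape
# (the key and value below are illustrative, not real scraped data):
#
#   {
#     "Some Restaurant": ["1-2-3 Somewhere, Kita-ku, Okayama City"],
#     "__len__": 1
#   }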
import json
import sys
import requests
from bs4 import BeautifulSoup

page = 'https://okayamahealthtourism.com/food/okayama-city/'

# Pretend to be a desktop browser, so the site doesn't serve a
# different page (or nothing) to a script-looking User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
try:
    res = requests.get(page, headers=headers)
except Exception as e:
    print(f'Failed to get the foods page: {e}', file=sys.stderr)
    sys.exit(1)

if res.status_code != 200:
    print(f'Failed to get the foods page: HTTP {res.status_code}', file=sys.stderr)
    sys.exit(1)
html = BeautifulSoup(res.text, 'html.parser')

# Collect the links to the individual restaurant pages. The class
# names come from the WPBakery page builder the site is built with.
urls = html.select('.vc_column-inner > .wpb_wrapper a')
urls = [u.attrs['href'] for u in urls]

# Maps restaurant title -> list of addresses found on its page.
addresses = dict()
for url in urls:
    print(f'>> {url}', file=sys.stderr)

    # Retry up to three times; exceptions and non-200 responses both
    # count as failed attempts.
    res = None
    for i in range(3):
        try:
            res = requests.get(url, headers=headers)
            if res.status_code != 200:
                print(f'Got non-200 for {url}', file=sys.stderr)
                continue
            break
        except Exception as e:
            print(f'Error: failed to get {url}: {e}', file=sys.stderr)

    # Skip pages that never returned 200.
    if res is None or res.status_code != 200:
        continue
    html = BeautifulSoup(res.text, 'html.parser')
    title = html.find('title').text.replace('Okayama Health Tourism | ', '').strip()

    # First pass: look for a <span> that mentions "address".
    spans = html.find_all('span')
    found = False
    for span in spans:
        text = span.text.lower()
        if 'address' in text:
            # Drop the "ADDRESS" label and strip colons, no-break
            # spaces (U+00A0), and plain spaces around the value.
            address = span.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
            addresses.setdefault(title, []).append(address)
            print(f'{title}: {address}', file=sys.stderr)
            found = True
    # Fallback: scan <p> tags for something that looks like an
    # Okayama address, skipping the copyright footer.
    if not found:
        ps = html.find_all('p')
        for p in ps:
            text = p.text.lower()
            if 'copyright' in text:
                continue
            if 'okayama' in text and ('city' in text or 'shi' in text):
                address = p.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
                addresses.setdefault(title, []).append(address)
                print(f'{title}: {address}', file=sys.stderr)
                found = True

    if not found:
        print(f'Warning: no address was found for {title}', file=sys.stderr)
        continue
# Record how many restaurants were found, then dump everything as JSON.
addresses['__len__'] = len(addresses)
print(json.dumps(addresses, indent=2, ensure_ascii=False))
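# Usage sketch (assumes this script is saved as scrape.py; the
# filename is not part of the Gist):
#
#   pip install -r requirements.txt
#   python scrape.py 2>/dev/null > restaurants.json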
requirements.txt:

requests
beautifulsoup4