Scraper for the list of Muslim-friendly restaurants on Okayama Health Tourism / Okayama City
'''
This script scrapes okayamahealthtourism.com/food/okayama-city/ and
gathers every restaurant's name and address.

The result is printed to stdout. Progress and warnings go to stderr,
so redirect stderr if you find it noisy.

The result will need some hand-picking and hand-cleaning.

Though I haven't confirmed it, this should also work for the Maniwa City page.

LICENSE (all files in this Gist): CC0 1.0 Universal
'''
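# For reference, the JSON printed to stdout has roughly this shape
# (the key and value below are illustrative, not real scraped data):
#
#   {
#     "Some Restaurant": ["1-2-3 Somewhere, Kita-ku, Okayama City"],
#     "__len__": 1
#   }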
import json
import sys
import requests
from bs4 import BeautifulSoup

page = 'https://okayamahealthtourism.com/food/okayama-city/'

# Pretend to be a desktop browser, so the site doesn't serve a
# different page (or nothing) to a script-looking User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
}
try:
    res = requests.get(page, headers=headers)
except Exception as e:
    print(f'Failed to get the foods page: {e}', file=sys.stderr)
    sys.exit(1)

if res.status_code != 200:
    print(f'Failed to get the foods page: HTTP {res.status_code}', file=sys.stderr)
    sys.exit(1)
html = BeautifulSoup(res.text, 'html.parser')

# Collect the links to the individual restaurant pages. The class
# names come from the WPBakery page builder the site is built with.
urls = html.select('.vc_column-inner > .wpb_wrapper a')
urls = [u.attrs['href'] for u in urls]

# Maps restaurant title -> list of addresses found on its page.
addresses = dict()
for url in urls:
    print(f'>> {url}', file=sys.stderr)

    # Retry up to three times; exceptions and non-200 responses both
    # count as failed attempts.
    res = None
    for i in range(3):
        try:
            res = requests.get(url, headers=headers)
            if res.status_code != 200:
                print(f'Got non-200 for {url}', file=sys.stderr)
                continue
            break
        except Exception as e:
            print(f'Error: failed to get {url}: {e}', file=sys.stderr)

    # Skip pages that never returned 200.
    if res is None or res.status_code != 200:
        continue
    html = BeautifulSoup(res.text, 'html.parser')
    title = html.find('title').text.replace('Okayama Health Tourism | ', '').strip()

    # First pass: look for a <span> that mentions "address".
    spans = html.find_all('span')
    found = False
    for span in spans:
        text = span.text.lower()
        if 'address' in text:
            # Drop the "ADDRESS" label and strip colons, no-break
            # spaces (U+00A0), and plain spaces around the value.
            address = span.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
            addresses.setdefault(title, []).append(address)
            print(f'{title}: {address}', file=sys.stderr)
            found = True
    # Fallback: scan <p> tags for something that looks like an
    # Okayama address, skipping the copyright footer.
    if not found:
        ps = html.find_all('p')
        for p in ps:
            text = p.text.lower()
            if 'copyright' in text:
                continue
            if 'okayama' in text and ('city' in text or 'shi' in text):
                address = p.text.replace('ADDRESS', '').split('\n')[0].strip(': \u00a0\u0020')
                addresses.setdefault(title, []).append(address)
                print(f'{title}: {address}', file=sys.stderr)
                found = True

    if not found:
        print(f'Warning: no address was found for {title}', file=sys.stderr)
        continue
# Record how many restaurants were found, then dump everything as JSON.
addresses['__len__'] = len(addresses)
print(json.dumps(addresses, indent=2, ensure_ascii=False))
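# Usage sketch (assumes this script is saved as scrape.py; the
# filename is not part of the Gist):
#
#   pip install -r requirements.txt
#   python scrape.py 2>/dev/null > restaurants.json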
requirements.txt:

requests
beautifulsoup4