Skip to content

Instantly share code, notes, and snippets.

@shinyaoguri
Last active January 14, 2021 02:39
Show Gist options
  • Save shinyaoguri/b9fb6244276cb2251a1bf6b1c190ad00 to your computer and use it in GitHub Desktop.
半田市の給食献立情報のスクレイピング
# encoding: utf-8
import urllib.request
import urllib.parse
import json
import os
import ftplib
import dotenv
import git
import shutil
import requests
import re
import datetime
from bs4 import BeautifulSoup
# Load FTPS credentials and paths from a .env file sitting next to this script.
dotenv_path = os.path.join(os.path.dirname(__file__), '.env')
dotenv.load_dotenv(dotenv_path)
# URL of the LinkData datapackage listing the school-lunch menu tables.
HANDA_ALL_TABLES_URL = 'http://linkdata.org/api/1/rdf1s4907i/datapackage.json'
# Local directory where CSVs downloaded from LinkData are saved.
MENUES_DIR = 'menues'
# FTPS connection settings (read from the environment / .env; may be None if unset).
HOST_NAME = os.environ.get("HOST_NAME")
USER_NAME = os.environ.get("USER_NAME")
PASSWORD = os.environ.get("PASSWORD")
FTPS_DATA_DIR = os.environ.get("FTPS_DATA_DIR")
# Name of the GitHub repository (also the local clone directory).
REPO_NAME = 'shocknine'
# GitHub repository clone URL.
GITHUB = 'git@github.com:shinyaoguri/shocknine.git'
# Directory inside the repository that holds the CSV data.
GH_DATA_DIR = 'data'
# Base URL of the Handa city website.
HANDA_BASE_URL = 'http://www.city.handa.lg.jp'
# URL of the Handa city "today's school-lunch menu" page.
HANDA_TODAY_MENU_URL = 'http://www.city.handa.lg.jp/kyushoku/kosodate/kyoiku/kyushoku/kyonokondate.html'
# Temporary directory for pictures downloaded from the city page.
MENU_PIC_TMP_DIR = 'tmp_pic'
def get_all_tables(url):
    """Fetch and parse the LinkData datapackage JSON describing all menu tables.

    Args:
        url: URL of the datapackage.json resource.

    Returns:
        The parsed JSON document as a dict.
    """
    # BUG FIX: the original ignored ``url`` and always fetched the module
    # constant HANDA_ALL_TABLES_URL, making the parameter dead.
    req = urllib.request.Request(url)
    with urllib.request.urlopen(req) as response:
        body = response.read().decode("utf-8")
        all_tables = json.loads(body)
    print('[LOG] get all table data.')
    return all_tables
def get_all_csvs_url(data):
    """Return the download URL of every CSV resource in *data*.

    Args:
        data: A LinkData datapackage dict containing a "resources" list,
            where each entry carries a "url" key.

    Returns:
        List of URL strings, one per resource, in document order.
    """
    urls = [resource["url"] for resource in data["resources"]]
    print('[LOG] get ' + str(len(urls)) + ' csv urls')
    return urls
def download_csv(url, name):
    """Download a menu CSV into MENUES_DIR unless it is already cached.

    Args:
        url: Remote URL of the CSV file.
        name: File name to store it under inside MENUES_DIR.
    """
    # exist_ok=True avoids the check-then-create race of the original
    # os.path.exists / os.mkdir pair (and the unused ``tmp`` local is gone).
    os.makedirs(MENUES_DIR, exist_ok=True)
    target = os.path.join(MENUES_DIR, name)
    if os.path.isfile(target):
        print('[LOG] ' + name + ' was already exist. so, do nothing')
    else:
        print('[LOG] ' + name + ' was not exist. so download this.')
        urllib.request.urlretrieve(url, target)
def put_ftp_pictures(pic_name, pic_path):
    """Upload one picture to the FTPS server's data directory.

    Args:
        pic_name: Remote file name to store under FTPS_DATA_DIR.
        pic_path: Local path of the picture file to upload.
    """
    with ftplib.FTP_TLS(HOST_NAME) as ftps:
        # BUG FIX: set_pasv expects a bool; the original passed the string
        # "true" — any non-empty string (even "false") is truthy, so the
        # call only worked by accident.
        ftps.set_pasv(True)
        ftps.login(USER_NAME, PASSWORD)
        ftps.prot_p()  # switch the data connection to TLS before STOR
        with open(pic_path, 'rb') as fp:
            log = ftps.storbinary('STOR ' + FTPS_DATA_DIR + '/' + pic_name, fp)
            print('[LOG] upload to ftp server ' + pic_name + ' ' + log)
def get_ftp_pictures():
    """List the files already present in FTPS_DATA_DIR on the FTPS server.

    Returns:
        List of path strings as reported by the server's NLST reply.
    """
    with ftplib.FTP_TLS(HOST_NAME) as ftps:
        ftps.login(USER_NAME, PASSWORD)
        # Consistency fix: NLST opens a data connection, which needs TLS
        # protection just like the upload path in put_ftp_pictures does.
        ftps.prot_p()
        pic_list = ftps.nlst(FTPS_DATA_DIR)
        print('[LOG] get ftp picture list. there are ' + str(len(pic_list)) + ' files')
        return pic_list
def get_github_resource():
    """Clone the GitHub repository fresh and list the CSV files it contains.

    Returns:
        File names found under GH_DATA_DIR inside the new clone.
    """
    # Start from a clean checkout: drop any leftover clone first.
    if os.path.exists(REPO_NAME):
        print('[LOG] Repository was already exist. so delete it.')
        shutil.rmtree(REPO_NAME)
    print('[LOG] git clone ' + GITHUB)
    git.Git().clone(GITHUB)
    data_dir = os.path.join(REPO_NAME, GH_DATA_DIR)
    csv_names = os.listdir(data_dir)
    print('[LOG] github repository has ' + str(len(csv_names)) + ' files')
    return csv_names
def get_handa_lunch_picture():
    """Scrape the city's today's-menu page and download each menu picture.

    Returns:
        Dict mapping the generated picture file name (``YYYYMMDD.jpg``) to
        the local temp path it was downloaded to.
    """
    picture_list = dict()
    response = requests.get(HANDA_TODAY_MENU_URL)
    soup = BeautifulSoup(response.content, 'html5lib')
    tmp_html = soup.find('div', id='tmp_contents')
    img_tag_list = tmp_html.find_all("img")
    # Hoisted out of the loop; exist_ok avoids the check-then-create race.
    os.makedirs(MENU_PIC_TMP_DIR, exist_ok=True)
    for c in img_tag_list:
        img_path = c['src']
        # ROBUSTNESS FIX: the original crashed with KeyError when an <img>
        # had no alt, and with AttributeError when the regex did not match.
        # Skip such decorative images instead.
        img_alt = c.get('alt', '')
        menu_txt = re.search(r"(?P<month>[0-9]+)月(?P<date>[0-9]+)日(?P<option>.*)", img_alt)
        if menu_txt is None:
            continue
        print('[LOG] find picture src=' + img_path + ' alt=' + img_alt + ' month=' + menu_txt.group('month') + ' date=' + menu_txt.group('date') + ' option=' + menu_txt.group('option'))
        # NOTE(review): uses the current year as the prefix — pictures
        # published around New Year may get the wrong year; confirm.
        pic_name = (str(datetime.date.today().year)
                    + menu_txt.group('month').zfill(2)
                    + menu_txt.group('date').zfill(2)
                    + '.jpg')
        local_path = os.path.join(MENU_PIC_TMP_DIR, pic_name)
        urllib.request.urlretrieve(HANDA_BASE_URL + img_path, local_path)
        print('[LOG] download picture from ' + HANDA_BASE_URL + img_path + ' as ' + pic_name)
        picture_list[pic_name] = local_path
    return picture_list
def remove_tmp_dir():
    """Delete the temporary clone and picture directories if they exist."""
    for tmp_dir in (REPO_NAME, MENU_PIC_TMP_DIR):
        if os.path.exists(tmp_dir):
            shutil.rmtree(tmp_dir)
            print('[LOG] remove ' + tmp_dir)
def main():
    """Sync today's school-lunch pictures from the Handa city site to the FTPS server.

    Scrapes the city's today's-menu page, compares the pictures against
    what is already on the FTPS server, uploads the missing ones, and
    cleans up the temporary directories.
    """
    # NOTE: the CSV/GitHub sync steps of the original pipeline
    # (get_all_tables, get_all_csvs_url, download_csv, get_github_resource)
    # were commented out and have been removed as dead code.
    # Latest pictures published on the city site: name -> local path.
    handa_pic_list = get_handa_lunch_picture()
    # Pictures already present on the FTPS server.
    ftp_pic_list = get_ftp_pictures()
    # PERF FIX: build the basename set once instead of re-mapping the whole
    # FTP listing on every loop iteration (was O(n*m)).
    existing = {os.path.basename(ftp_pic) for ftp_pic in ftp_pic_list}
    # Upload only the pictures the server does not have yet.
    for pic_name, pic_path in handa_pic_list.items():
        if pic_name not in existing:
            print('[LOG] ' + pic_name)
            put_ftp_pictures(pic_name, pic_path)
    # Remove temporary files and directories.
    remove_tmp_dir()


if __name__ == "__main__":
    main()
beautifulsoup4
GitPython
html5lib
python-dotenv
requests
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment