Last active
July 30, 2022 16:38
-
-
Save nac-39/48d38e37140ebfeb193564541295fa3a to your computer and use it in GitHub Desktop.
名大のHPで公開されている最新のPDFから感染者数を読み取って辞書形式で返すスクリプト
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pdfminer.high_level import extract_text | |
import re | |
import requests | |
import datetime | |
import os | |
import time | |
def get_pdf(max_lookback_days=365, timeout=30):
    """Fetch the newest infection-report PDF published by Nagoya University.

    Starting from today's date in JST, build the report file name
    ``<YYYYMMDD>_kansen.pdf`` and walk backwards one day at a time until a
    report is found, either already saved in the local ``./pdf/`` directory
    or downloadable from the university site.

    Args:
        max_lookback_days: How many days before today to try before giving
            up. Bounds the search so a permanently missing report cannot
            turn into an endless stream of requests.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        A ``(path, date)`` tuple: the local path of the PDF and the report
        date as a ``YYYYMMDD`` string.

    Raises:
        RuntimeError: No report was found within ``max_lookback_days`` days.
        requests.exceptions.RequestException: The request itself failed
            (connection error, timeout, ...). HTTP error statuses are not
            raised; they just move the search to the previous day.
    """
    # Directory the PDFs are saved in; created on demand so the first run
    # does not fail when it is missing.
    pdf_dir = "./pdf/"
    base_url = "https://www.nagoya-u.ac.jp/info/upload_images/"
    os.makedirs(pdf_dir, exist_ok=True)

    # File names are based on the date in Japan Standard Time (UTC+9).
    jst = datetime.timezone(datetime.timedelta(hours=9), 'JST')
    request_date = datetime.datetime.now(jst)

    for _ in range(max_lookback_days + 1):
        date = request_date.strftime('%Y%m%d')
        filename = date + "_kansen.pdf"
        path = os.path.join(pdf_dir, filename)

        # Reuse a previously saved copy without contacting the server.
        if os.path.isfile(path):
            return (path, date)

        res = requests.get(base_url + filename, timeout=timeout)
        try:
            # Check the HTTP response status.
            res.raise_for_status()
        except requests.exceptions.HTTPError as exc:
            print("Error:{}".format(exc))
        else:
            # Write to a temporary name first so that an interrupted
            # download is never mistaken for a complete cached file.
            tmp_path = path + ".part"
            with open(tmp_path, "wb") as f:
                for chunk in res.iter_content(100000):
                    f.write(chunk)
            os.replace(tmp_path, path)
            return (path, date)

        # No report for this date: try the previous day, pausing briefly
        # to go easy on the university's server.
        request_date -= datetime.timedelta(days=1)
        time.sleep(1)

    raise RuntimeError(
        "no infection report PDF found within the last {} days".format(
            max_lookback_days
        )
    )
# (Japanese campus name as written in the PDF, key used in the output dict)
_CAMPUSES = (
    ("東山", "higashiyama"),
    ("鶴舞", "tsurumai"),
    ("大幸", "daiko"),
)

# Maps the full-width ASCII range (U+FF01..U+FF5E) onto plain ASCII.
_FULLWIDTH_TO_ASCII = str.maketrans(
    {chr(0xFF01 + i): chr(0x21 + i) for i in range(94)}
)


def _parse_infection_counts(text):
    """Extract per-campus student/staff counts from the report text.

    Args:
        text: Raw text extracted from the report PDF.

    Returns:
        A dict keyed by campus (``higashiyama``, ``tsurumai``, ``daiko``).
        Each value is ``{"student": <str>, "teacher": <str>}`` holding the
        matched digits, or an empty string when that figure (or the whole
        campus sentence) does not appear in the text.
    """
    # Convert full-width alphanumerics to half-width, then drop newlines
    # and spaces so each campus sentence is one contiguous string.
    text = text.translate(_FULLWIDTH_TO_ASCII)
    text = re.sub(r'\n| ', "", text)

    data = {}
    for name_ja, name_en in _CAMPUSES:
        # Counts are at most three digits, hence {1,3}.
        pattern = re.compile(
            name_ja
            + r"キャンパスに通う(学生([0-9]{1,3})名)?、?(教職員([0-9]{1,3})名)?"
        )
        found = pattern.search(text)
        # A campus that is not mentioned yields empty counts rather than
        # an IndexError.
        student = (found.group(2) or "") if found else ""
        teacher = (found.group(4) or "") if found else ""
        data[name_en] = {"student": student, "teacher": teacher}
    return data


def main():
    """Fetch the latest report PDF and print its infection counts as a dict.

    The printed value has the form
    ``{"date": "<YYYYMMDD>", "data": {<campus>: {"student": ..., "teacher": ...}}}``.
    """
    path, date = get_pdf()  # local path of the saved PDF and its date
    text = extract_text(path)  # text content of the PDF
    ans = {"date": date, "data": _parse_infection_counts(text)}
    print(ans)
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
requirements.txt代わりに