Skip to content

Instantly share code, notes, and snippets.

@nac-39
Last active July 30, 2022 16:38
Show Gist options
  • Save nac-39/48d38e37140ebfeb193564541295fa3a to your computer and use it in GitHub Desktop.
Save nac-39/48d38e37140ebfeb193564541295fa3a to your computer and use it in GitHub Desktop.
名大のHPで公開されている最新のPDFから感染者数を読み取って辞書形式で返すスクリプト
from pdfminer.high_level import extract_text
import re
import requests
import datetime
import os
import time
def get_pdf(max_lookback_days=365, timeout=30):
    """Download the most recent infection-report PDF and return where it is.

    Starting from the current date in JST, this requests
    ``<YYYYMMDD>_kansen.pdf`` from the Nagoya University upload directory
    and steps back one day at a time until a request succeeds. The PDF is
    stored under ``./pdf/`` unless a file with that name already exists.

    Args:
        max_lookback_days: How many days before today may be tried before
            giving up. Pass ``None`` to keep searching without a limit.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        A ``(path, date)`` tuple, where ``path`` is the local path of the
        PDF and ``date`` is the ``YYYYMMDD`` string the PDF was found under.

    Raises:
        RuntimeError: If no PDF was found within ``max_lookback_days``.
        requests.RequestException: If the request itself fails (connection
            error, timeout, ...). Only HTTP error statuses are treated as
            "no report for this day".
    """
    # Directory the PDFs are saved to.
    PDF_DIR = "./pdf/"
    base_url = "https://www.nagoya-u.ac.jp/info/upload_images/"

    # The directory must exist before the first download can be written.
    os.makedirs(PDF_DIR, exist_ok=True)

    # File names on the server are keyed by the date in Japan Standard Time.
    JST = datetime.timezone(datetime.timedelta(hours=9), 'JST')
    request_date = datetime.datetime.now(JST)

    days_back = 0
    while max_lookback_days is None or days_back <= max_lookback_days:
        if days_back:
            # Be polite to the university's server between attempts.
            time.sleep(1)

        date = request_date.strftime('%Y%m%d')
        filename = date + "_kansen.pdf"
        res = requests.get(base_url + filename, timeout=timeout)

        try:
            res.raise_for_status()
        except requests.HTTPError as exc:
            # No PDF under this date: report it and try the previous day.
            print("Error:{}".format(exc))
            request_date = request_date - datetime.timedelta(days=1)
            days_back += 1
            continue

        path = os.path.join(PDF_DIR, filename)
        if not os.path.isfile(path):
            # Write to a temporary name first so an interrupted download is
            # never mistaken for a complete, cached PDF on the next run.
            partial_path = path + ".part"
            with open(partial_path, "wb") as f:
                for chunk in res.iter_content(100000):
                    f.write(chunk)
            os.replace(partial_path, path)
        return (path, date)

    raise RuntimeError(
        "No infection-report PDF found within the last {} days".format(
            max_lookback_days
        )
    )
def _parse_infection_counts(text):
    """Extract per-campus infection counts from the text of a report PDF.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        A dict keyed by campus (``"higashiyama"``, ``"tsurumai"``,
        ``"daiko"``); each value is ``{"student": ..., "teacher": ...}``
        holding the matched digit strings, or ``""`` where the report gives
        no number (including when the campus is not mentioned at all).
    """
    # Convert full-width ASCII characters (U+FF01..U+FF5E) to half-width so
    # the digits can be matched with [0-9].
    text = text.translate(
        str.maketrans({chr(0xFF01 + i): chr(0x21 + i) for i in range(94)})
    )
    # Remove newlines and spaces so each sentence is contiguous.
    text = re.sub(r'\n| ', "", text)

    campuses = (
        ("東山", "higashiyama"),
        ("鶴舞", "tsurumai"),
        ("大幸", "daiko"),
    )
    data = {}
    for name_ja, name_en in campuses:
        # Counts are assumed to have at most three digits, hence {1,3}.
        pattern = re.compile(
            name_ja
            + r"キャンパスに通う(学生([0-9]{1,3})名)?、?(教職員([0-9]{1,3})名)?"
        )
        match = pattern.search(text)
        if match is None:
            # Campus not mentioned in this report: report empty counts
            # instead of failing with IndexError.
            student, teacher = "", ""
        else:
            # Optional groups that did not participate are None; use "" to
            # keep the same value shape findall() would give.
            student = match.group(2) or ""
            teacher = match.group(4) or ""
        data[name_en] = {"student": student, "teacher": teacher}
    return data


def main():
    """Fetch the latest report PDF, print the parsed counts and return them.

    Returns:
        A dict ``{"date": <YYYYMMDD string>, "data": <per-campus counts>}``.
    """
    path, date = get_pdf()  # local path of the PDF and the date it is for
    text = extract_text(path)  # pull the text out of the PDF
    ans = {"date": date, "data": _parse_infection_counts(text)}
    print(ans)
    return ans
# Run the scraper only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
@nac-39
Copy link
Author

nac-39 commented Jan 22, 2022

requirements.txt代わりに

pdfminer.six==20211012
requests==2.27.1

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment