Skip to content

Instantly share code, notes, and snippets.

@Na0ki
Last active August 22, 2018 14:13
Show Gist options
  • Save Na0ki/b6846567d03b8fc837247c4cbb86ba3b to your computer and use it in GitHub Desktop.
Save Na0ki/b6846567d03b8fc837247c4cbb86ba3b to your computer and use it in GitHub Desktop.
とある小説サイトのページをスクレイピングして新着があればslackに通知するやつ
import datetime
import json
import requests
from bs4 import BeautifulSoup
# URL of the page to scrape; passed to requests.get() below.
# NOTE(review): the value looks like a placeholder to be replaced with a real URL — confirm.
TARGET = 'SCRAPING_TARGET_URL'
# URL that the notification payload is POSTed to via requests.post() below.
# NOTE(review): the value looks like a placeholder for a Slack incoming-webhook URL — confirm.
HOOK = 'SLACK_WEB_HOOK_URL'
class MyConfig:
    """
    Persist and retrieve the "last updated" timestamp in a text file.

    ``last_updated`` is exposed as a property so that reading it or
    assigning to it goes straight through to the file at ``path``.
    """

    # Text layout of the timestamp stored in the file.
    _STAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, path):
        """
        Parameters
        ----------
        path : str
            Path of the settings file that holds the timestamp.
        """
        self.path = path

    def _store(self, moment):
        """Overwrite the settings file with ``moment`` in the stored format."""
        with open(self.path, mode='w') as handle:
            handle.write(moment.strftime(self._STAMP_FORMAT))

    @property
    def last_updated(self):
        """
        Return the timestamp stored in the settings file.

        When the file does not exist or its content cannot be parsed,
        the current local time is written to the file and returned.
        """
        try:
            with open(self.path) as handle:
                raw = handle.read()
                return datetime.datetime.strptime(
                    raw.strip(), self._STAMP_FORMAT)
        except (FileNotFoundError, ValueError):
            fallback = datetime.datetime.now()
            self._store(fallback)
            return fallback

    @last_updated.setter
    def last_updated(self, value):
        """Write ``value`` (a datetime) to the settings file."""
        self._store(value)
def to_index_box_dict(dl) -> dict:
    """
    Parse one ``dl`` element into a dict describing a chapter entry.

    Parameters
    ----------
    dl
        Element exposing ``select_one``; it is queried for a
        ``dd.subtitle > a`` node and a ``dt.long_update`` node.

    Returns
    -------
    dict
        ``{'subtitle': str, 'updated_at': datetime.datetime}``.

    Raises
    ------
    AttributeError
        If ``select_one`` returns ``None`` for either selector.
    ValueError
        If the date text does not match ``%Y/%m/%d %H:%M``.
    """
    subtitle = dl.select_one('dd.subtitle > a').text.strip()
    # Drop the '(改)' marker (presumably "revised") so that only the
    # date/time text remains for parsing.
    stamp = dl.select_one('dt.long_update').text.replace('(改)', '').strip()
    updated_at = datetime.datetime.strptime(stamp, '%Y/%m/%d %H:%M')
    return {'subtitle': subtitle, 'updated_at': updated_at}
# Fetch the index page, collect the entries newer than the stored
# timestamp, and post them to the webhook.
config = MyConfig('./lastupdated.txt')

# A timeout keeps an unresponsive server from hanging the script forever,
# and raise_for_status() stops us from parsing an HTTP error page.
r = requests.get(TARGET, timeout=10)
r.raise_for_status()
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')
sublist = soup.find_all('dl', class_='novel_sublist2')
mappedlist = map(to_index_box_dict, sublist)

# Read the stored timestamp once: each access to the property re-reads
# (and, on a missing/invalid file, rewrites) the settings file.
last_seen = config.last_updated
filteredlist = [
    item for item in mappedlist if item['updated_at'] > last_seen]

if filteredlist:
    entries = ''.join(
        f"{item['subtitle']}({item['updated_at']})\n"
        for item in filteredlist)
    text = "[更新]\n" + entries + 'が投稿されました!'
    response = requests.post(
        HOOK,
        data=json.dumps({'text': text, 'username': 'shosetu watch'}),
        timeout=10,
    )
    # Advance the stored timestamp only after the webhook accepted the
    # message; otherwise a failed post would silently drop these entries.
    response.raise_for_status()
    config.last_updated = max(item['updated_at'] for item in filteredlist)
else:
    print('nothing new')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment