Created
June 10, 2024 12:58
-
-
Save zhullyb/4b608ce88da8ef62b3e8378d4556f1f3 to your computer and use it in GitHub Desktop.
针对 fwwb.org.cn 的新公告爬虫
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import os | |
import requests | |
import random | |
from bs4 import BeautifulSoup as bs | |
import time | |
import pytz | |
import datetime | |
# Page that is polled for announcement headings.
url = 'http://www.fwwb.org.cn/public/index'
# Timezone used when timestamping log output.
tz = pytz.timezone('Asia/Shanghai')
def get_time():
    """Return the current time in the module timezone ``tz``.

    The result is a string formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    now = datetime.datetime.now(tz)
    return now.strftime('%Y-%m-%d %H:%M:%S')
def get_ua():
    """Return a randomly chosen Firefox User-Agent string.

    A Firefox major version between 100 and 127 (inclusive) is drawn first,
    then one of two platform variants (Linux or Windows) is picked.
    """
    version = random.randint(100, 127)
    platforms = ("X11; Linux x86_64", "Windows NT 10.0; Win64; x64")
    candidates = [
        f"Mozilla/5.0 ({platform}; rv:{version}.0) Gecko/20100101 Firefox/{version}.0"
        for platform in platforms
    ]
    return random.choice(candidates)
def get_html(timeout=15):
    """Download the page at ``url`` and return its body as text.

    Args:
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx HTTP status.
    """
    headers = {'User-Agent': get_ua()}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors so an error page is never parsed as if it
    # were the real page.
    response.raise_for_status()
    return response.text
def get_title(t):
    """Extract the text of every ``<h3>`` element from an HTML document.

    Args:
        t: HTML source as a string.

    Returns:
        A list of heading texts in document order. Runs of whitespace
        (including newlines) are collapsed to a single space and surrounding
        whitespace is removed, so each title fits on one line and compares
        equal to the copy that is later read back from the state file.
        Headings with no text are skipped.
    """
    soup = bs(t, 'html.parser')
    titles = []
    for heading in soup.find_all('h3'):
        # split()/join normalises leading, trailing and embedded whitespace.
        text = ' '.join(heading.get_text().split())
        if text:
            titles.append(text)
    return titles
def read_from_file():
    """Load the previously saved titles from ``fwwb.txt``.

    Returns:
        A list with one whitespace-stripped entry per line of the file, or an
        empty list when the file does not exist.
    """
    path = 'fwwb.txt'
    if os.path.exists(path):
        with open(path, 'r') as handle:
            return [line.strip() for line in handle]
    return []
def save_to_file(t):
    """Overwrite ``fwwb.txt`` with the given titles, one per line.

    Args:
        t: Iterable of title strings.
    """
    with open('fwwb.txt', 'w') as handle:
        handle.writelines(title + '\n' for title in t)
def notify_me(text):
    """Notification hook; currently a no-op placeholder.

    Args:
        text: Message that would be sent. It is ignored by this stub.
    """
    return None
def main():
    """Run one poll: fetch the page, compare with saved titles, and report.

    The current titles are compared (as sets) with those stored by the
    previous run. When titles that were not stored before are present, the
    first of them in page order is passed to ``notify_me`` and printed, and
    the stored list is replaced with the current one.

    If the page yields no titles at all, nothing is saved or notified, so a
    temporarily broken page cannot wipe the stored state or raise an
    ``IndexError``.
    """
    titles = get_title(get_html())
    if not titles:
        print(f"{get_time()}: No titles found on the page; saved state left untouched.")
        return

    old_titles = read_from_file()
    if set(old_titles) == set(titles):
        print(f"{get_time()}: No new title found.")
        return

    known = set(old_titles)
    new_titles = [title for title in titles if title not in known]
    if not new_titles:
        # Titles only disappeared from the page: refresh the stored list but
        # do not announce an existing title as new.
        save_to_file(titles)
        print(f"{get_time()}: No new title found.")
        return

    newest = new_titles[0]
    notify_me(newest)
    save_to_file(titles)
    print("=" * 50)
    print(get_time())
    print("New title found: " + newest)
    print("=" * 50)
if __name__ == '__main__':
    # Seconds to wait between two consecutive polls.
    poll_interval = 600
    while True:
        try:
            main()
        except Exception as exc:
            # Top-level boundary of a long-running poller: a transient failure
            # (network error, bad response, ...) must not stop future polls.
            # KeyboardInterrupt is not an Exception, so Ctrl+C still exits.
            print(f"{get_time()}: Check failed: {exc!r}")
        time.sleep(poll_interval)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment