Created
February 16, 2021 15:27
-
-
Save jasongzy/6aaff0cedba7ecc6e2f4b8be118b3fd4 to your computer and use it in GitHub Desktop.
NJUST 年级网爬虫(selenium 版)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
import datetime | |
import re | |
from time import sleep | |
import PyRSS2Gen | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
# Notice-list page of the NJUST EEOE student-affairs site (JS-rendered).
# Main page, for reference: http://dgxg.njust.edu.cn/_t689/main.htm
url = "http://dgxg.njust.edu.cn/_t689/njtz/list.htm"

options = webdriver.ChromeOptions()
# Spoof a regular desktop Chrome UA so the site does not serve a bot page.
options.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"'
)
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# FIX: `chrome_options=` is deprecated (removed in Selenium 4); use `options=`.
driver = webdriver.Chrome(options=options)
# The old automation-hiding switches (excludeSwitches/useAutomationExtension)
# stopped working on Chrome 78+, so instead inject a script that masks
# navigator.webdriver on every new document.
# See https://mp.weixin.qq.com/s/U2aAC6K6RuQDRqfb8m35_w
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {
        "source": """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
        """
    },
)
driver.get(url)
sleep(2)  # crude fixed wait for the JS-rendered list to finish loading
# Parse the rendered notice list; each notice is an <li> whose class starts
# with "list_item i".
# driver.save_screenshot("chromedriver.png")  # debugging aid
soup = BeautifulSoup(driver.page_source, "lxml")
item_list = soup.find_all("li", {"class": re.compile("^list_item i")})
rssitem_list = []  # accumulates one PyRSS2Gen.RSSItem per notice
# Visit each notice, pull its body, and build an RSS item for it.
# FIX: wrap in try/finally so the browser is shut down even when a page
# fails to load or parse (the original leaked the Chrome process on error).
try:
    for item in item_list:
        # The title span's first child is the <a> holding href + title text.
        title_span = item.find("span", {"class": "Article_Title"})
        anchor = title_span.contents[0]
        link = "http://dgxg.njust.edu.cn" + str(anchor.attrs["href"])
        title = str(anchor.contents[0])
        publish_date = str(
            item.find("span", {"class": "Article_PublishDate"}).contents[0]
        )
        print(title + "\n" + publish_date + "\n" + link)
        # Fetch the notice body itself.
        driver.get(link)
        tmp = BeautifulSoup(driver.page_source, "lxml")
        contents = tmp.find("div", {"class": "Article_Content"}).contents
        # Join the HTML fragments and absolutize /_upload attachment links.
        # FIX: the original generator variable shadowed the builtin `id`.
        description = "".join(str(node) for node in contents).replace(
            r"/_upload", r"http://dgxg.njust.edu.cn/_upload"
        )
        print("*" * 10)
        rssitem_list.append(
            PyRSS2Gen.RSSItem(
                title=title,
                link=link,
                description=description,
                guid=PyRSS2Gen.Guid(link),
                # Site dates are naive local CST (UTC+8); shift back 8 hours
                # so the feed's pubDate is effectively UTC.
                pubDate=datetime.datetime.strptime(publish_date, "%Y-%m-%d")
                + datetime.timedelta(hours=-8),
            ),
        )
finally:
    driver.quit()
# Assemble the feed; only write it when at least one item was scraped, so a
# failed scrape does not clobber a previously good feed file.
rss = PyRSS2Gen.RSS2(
    title="电光 17 年级通知",
    link="http://dgxg.njust.edu.cn/_t689/main.htm",
    description="Made by jasongzy",
    lastBuildDate=datetime.datetime.now(),
    items=rssitem_list,
)
if rss.items:
    # FIX: the original passed an open() handle that was never closed;
    # use a context manager so the file is flushed and closed reliably.
    with open("dg17.xml", "wb") as f:
        rss.write_xml(f)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment