Skip to content

Instantly share code, notes, and snippets.

@jasongzy
Created February 16, 2021 15:27
Show Gist options
  • Save jasongzy/6aaff0cedba7ecc6e2f4b8be118b3fd4 to your computer and use it in GitHub Desktop.
Save jasongzy/6aaff0cedba7ecc6e2f4b8be118b3fd4 to your computer and use it in GitHub Desktop.
NJUST 级网爬虫(selenium 版)
#!/usr/bin/python3
import datetime
import re
from time import sleep
import PyRSS2Gen
from bs4 import BeautifulSoup
from selenium import webdriver
# url = "http://dgxg.njust.edu.cn/_t689/main.htm"
# Listing page that enumerates the grade notices to be turned into RSS items.
url = "http://dgxg.njust.edu.cn/_t689/njtz/list.htm"

# Configure a headless Chrome instance that presents a regular desktop
# user agent. The sandbox / dev-shm / GPU switches are the usual flags for
# running Chrome inside a container or on a display-less server.
options = webdriver.ChromeOptions()
options.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"'
)
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--headless")
options.add_argument("--disable-gpu")

# `options=` replaces the deprecated `chrome_options=` keyword, which later
# Selenium 4 releases no longer accept.
driver = webdriver.Chrome(options=options)

try:
    # On Chrome 78+ the older way of hiding automation (the
    # "excludeSwitches": ["enable-automation"] and "useAutomationExtension":
    # False experimental options) stopped working, so `navigator.webdriver`
    # is overridden on every new document instead.
    # See https://mp.weixin.qq.com/s/U2aAC6K6RuQDRqfb8m35_w
    driver.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {
            "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
        },
    )
    driver.get(url)
    # Fixed pause before reading the DOM; presumably gives the page's
    # client-side scripts time to finish rendering the list.
    sleep(2)
    # Rendered HTML of the listing page, consumed by the parsing step below.
    response = driver.page_source
except Exception:
    # Do not leave an orphaned headless Chrome process behind if the setup
    # or the initial page load fails.
    driver.quit()
    raise
# Scheme + host used to turn the site's root-relative paths into absolute URLs.
base_url = "http://dgxg.njust.edu.cn"

# Parse the listing page, visit every notice and collect one RSS item per
# notice in `rssitem_list`. The browser is shut down in `finally` so that a
# parsing or navigation error does not leave a Chrome process running.
try:
    soup = BeautifulSoup(response, "lxml")
    # Each notice is an <li> whose class starts with "list_item i".
    item_list = soup.find_all("li", {"class": re.compile("^list_item i")})
    rssitem_list = []
    for item in item_list:
        # The first child of the title span is expected to be the <a> that
        # carries both the link target and the visible title.
        anchor = item.find("span", {"class": "Article_Title"}).contents[0]
        link = base_url + str(anchor.attrs["href"])
        title = str(anchor.contents[0])
        publish_date = str(
            item.find("span", {"class": "Article_PublishDate"}).contents[0]
        )
        print(title + "\n" + publish_date + "\n" + link)

        # Fetch the body of the notice.
        driver.get(link)
        detail = BeautifulSoup(driver.page_source, "lxml")
        content_div = detail.find("div", {"class": "Article_Content"})
        if content_div is None:
            # A page without the standard content container should not abort
            # the whole feed; publish the item with an empty body instead.
            description = ""
        else:
            # Serialize the children back to HTML and make upload paths
            # (images, attachments) absolute so they resolve in feed readers.
            description = "".join(
                str(node) for node in content_div.contents
            ).replace("/_upload", base_url + "/_upload")
        print("*" * 10)

        rssitem_list.append(
            PyRSS2Gen.RSSItem(
                title=title,
                link=link,
                description=description,
                guid=PyRSS2Gen.Guid(link),
                # PyRSS2Gen labels every datetime as GMT; the 8-hour shift
                # presumably converts the site's Beijing-time date to GMT.
                pubDate=datetime.datetime.strptime(publish_date, "%Y-%m-%d")
                - datetime.timedelta(hours=8),
            ),
        )
finally:
    driver.quit()
# Assemble the feed from the collected items.
rss = PyRSS2Gen.RSS2(
    title="电光 17 年级通知",
    link="http://dgxg.njust.edu.cn/_t689/main.htm",
    description="Made by jasongzy",
    # PyRSS2Gen formats every datetime with a literal "GMT" suffix, so the
    # build time must be taken in UTC rather than in the machine's local zone.
    lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
    items=rssitem_list,
)

# Only overwrite the feed when at least one notice was scraped, so a failed
# or empty crawl keeps the previously generated file intact.
if rss.items:
    # The context manager closes (and flushes) the file even if writing fails.
    with open("dg17.xml", "wb") as feed_file:
        rss.write_xml(feed_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment