jasongzy/selenium_chrome_linux.py

## selenium_chrome_linux.py
#!/usr/bin/python3
import datetime
import re
from time import sleep
from urllib.parse import urljoin

import PyRSS2Gen
from bs4 import BeautifulSoup
from selenium import webdriver

# Notice list page that is scraped to build the feed.
# url = "http://dgxg.njust.edu.cn/_t689/main.htm"
url = "http://dgxg.njust.edu.cn/_t689/njtz/list.htm"

# Chrome is started headless (no display needed) with a desktop user agent.
options = webdriver.ChromeOptions()
options.add_argument(
    'user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"'
)
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--headless")
options.add_argument("--disable-gpu")
# options.add_experimental_option("excludeSwitches", ["enable-automation"])
# options.add_experimental_option("useAutomationExtension", False)

# Pass the options via `options=`: the older `chrome_options=` keyword is
# deprecated and has been removed from recent Selenium 4 releases.
driver = webdriver.Chrome(options=options)

# On Chrome versions above 78 the earlier ways of hiding automation stopped
# working, so this approach is used instead.
# See https://mp.weixin.qq.com/s/U2aAC6K6RuQDRqfb8m35_w
# The script makes `navigator.webdriver` read as undefined; it is registered
# through CDP so that it is evaluated on every new document.
_HIDE_WEBDRIVER_JS = """
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument", {"source": _HIDE_WEBDRIVER_JS}
)

# Open the notice list and pause briefly before reading the rendered HTML
# (presumably to let the page finish loading -- confirm the delay is enough).
driver.get(url)
sleep(2)
response = driver.page_source

# Each notice is an <li> whose class attribute starts with "list_item i".
soup = BeautifulSoup(response, "lxml")
item_list = soup.find_all("li", class_=re.compile("^list_item i"))

# Build one RSS item per notice found on the list page.
rssitem_list = []

# The browser is shut down in `finally` so that a page that fails to load or
# parse does not leave a headless Chrome process behind.
try:
    for item in item_list:
        # The first child of the title span is the <a> element; it carries both
        # the target href and the title text.
        title_anchor = item.find("span", {"class": "Article_Title"}).contents[0]
        # urljoin keeps already-absolute hrefs intact and resolves relative
        # ones against the list page, instead of blindly prefixing the host.
        Link = urljoin(url, str(title_anchor.attrs["href"]))
        Title = str(title_anchor.contents[0])
        PublishDate = str(
            item.find("span", {"class": "Article_PublishDate"}).contents[0]
        )
        print(Title + "\n" + PublishDate + "\n" + Link)

        # Fetch the body of the notice.
        driver.get(Link)
        tmp = BeautifulSoup(driver.page_source, "lxml")
        content_div = tmp.find("div", {"class": "Article_Content"})
        if content_div is None:
            # A page without the expected content container (e.g. an external
            # link) gets an empty description rather than aborting the feed.
            Description = ""
        else:
            # Re-serialize the child nodes and point site-relative upload
            # paths at the absolute host so they resolve inside a feed reader.
            Description = "".join(str(node) for node in content_div.contents).replace(
                "/_upload", "http://dgxg.njust.edu.cn/_upload"
            )
        print("*" * 10)

        rssitem_list.append(
            PyRSS2Gen.RSSItem(
                title=Title,
                link=Link,
                description=Description,
                guid=PyRSS2Gen.Guid(Link),
                # The list page only provides a date. The 8 hour shift
                # presumably converts local (UTC+8) midnight to the GMT value
                # PyRSS2Gen expects -- confirm.
                pubDate=datetime.datetime.strptime(PublishDate, "%Y-%m-%d")
                - datetime.timedelta(hours=8),
            ),
        )
finally:
    driver.quit()

# Assemble the feed. PyRSS2Gen writes its dates with a GMT label, so the build
# timestamp is taken in UTC rather than in the machine's local time.
rss = PyRSS2Gen.RSS2(
    title="电光 17 年级通知",
    link="http://dgxg.njust.edu.cn/_t689/main.htm",
    description="Made by jasongzy",
    lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
    items=rssitem_list,
)

# Only (over)write the feed file when at least one notice was collected; with
# no items the existing file is left untouched. The context manager makes
# sure the file is flushed and closed.
if rss.items:
    with open("dg17.xml", "wb") as feed_file:
        rss.write_xml(feed_file)
	#!/usr/bin/python3
	import datetime
	import re
	from time import sleep

	import PyRSS2Gen
	from bs4 import BeautifulSoup
	from selenium import webdriver

	# NOTE(review): from here on the file appears to repeat the script above
	# with its indentation flattened -- confirm whether this copy is needed.
	# Notice list page that is scraped to build the feed.
	# url = "http://dgxg.njust.edu.cn/_t689/main.htm"
	url = "http://dgxg.njust.edu.cn/_t689/njtz/list.htm"

	# Chrome is started headless (no display needed) with a desktop user agent.
	options = webdriver.ChromeOptions()
	options.add_argument(
	    'user-agent="Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"'
	)
	options.add_argument("--no-sandbox")
	options.add_argument("--disable-dev-shm-usage")
	options.add_argument("--headless")
	options.add_argument("--disable-gpu")
	# options.add_experimental_option("excludeSwitches", ["enable-automation"])
	# options.add_experimental_option("useAutomationExtension", False)

	# Pass the options via `options=`: the older `chrome_options=` keyword is
	# deprecated and has been removed from recent Selenium 4 releases.
	driver = webdriver.Chrome(options=options)

	# On Chrome versions above 78 the earlier ways of hiding automation stopped
	# working, so this approach is used instead.
	# See https://mp.weixin.qq.com/s/U2aAC6K6RuQDRqfb8m35_w
	# The script makes `navigator.webdriver` read as undefined; it is registered
	# through CDP so that it is evaluated on every new document.
	_HIDE_WEBDRIVER_JS = """
	Object.defineProperty(navigator, 'webdriver', {
	get: () => undefined
	})
	"""
	driver.execute_cdp_cmd(
	    "Page.addScriptToEvaluateOnNewDocument", {"source": _HIDE_WEBDRIVER_JS}
	)

	# Open the notice list and pause briefly before reading the rendered HTML
	# (presumably to let the page finish loading -- confirm the delay is enough).
	driver.get(url)
	sleep(2)
	response = driver.page_source

	# Each notice is an <li> whose class attribute starts with "list_item i".
	soup = BeautifulSoup(response, "lxml")
	item_list = soup.find_all("li", class_=re.compile("^list_item i"))

	# Build one RSS item per notice found on the list page.
	rssitem_list = []

	# The browser is shut down in `finally` so that a page that fails to load or
	# parse does not leave a headless Chrome process behind.
	try:
	    for item in item_list:
	        # The first child of the title span is the <a> element; it carries
	        # both the target href and the title text.
	        title_anchor = item.find("span", {"class": "Article_Title"}).contents[0]
	        # urljoin keeps already-absolute hrefs intact and resolves relative
	        # ones against the list page, instead of blindly prefixing the host.
	        Link = urljoin(url, str(title_anchor.attrs["href"]))
	        Title = str(title_anchor.contents[0])
	        PublishDate = str(
	            item.find("span", {"class": "Article_PublishDate"}).contents[0]
	        )
	        print(Title + "\n" + PublishDate + "\n" + Link)

	        # Fetch the body of the notice.
	        driver.get(Link)
	        tmp = BeautifulSoup(driver.page_source, "lxml")
	        content_div = tmp.find("div", {"class": "Article_Content"})
	        if content_div is None:
	            # A page without the expected content container (e.g. an
	            # external link) gets an empty description rather than
	            # aborting the feed.
	            Description = ""
	        else:
	            # Re-serialize the child nodes and point site-relative upload
	            # paths at the absolute host so they resolve in a feed reader.
	            Description = "".join(str(node) for node in content_div.contents).replace(
	                "/_upload", "http://dgxg.njust.edu.cn/_upload"
	            )
	        print("*" * 10)

	        rssitem_list.append(
	            PyRSS2Gen.RSSItem(
	                title=Title,
	                link=Link,
	                description=Description,
	                guid=PyRSS2Gen.Guid(Link),
	                # The list page only provides a date. The 8 hour shift
	                # presumably converts local (UTC+8) midnight to the GMT
	                # value PyRSS2Gen expects -- confirm.
	                pubDate=datetime.datetime.strptime(PublishDate, "%Y-%m-%d")
	                - datetime.timedelta(hours=8),
	            ),
	        )
	finally:
	    driver.quit()

	# Assemble the feed. PyRSS2Gen writes its dates with a GMT label, so the
	# build timestamp is taken in UTC rather than in the machine's local time.
	rss = PyRSS2Gen.RSS2(
	    title="电光 17 年级通知",
	    link="http://dgxg.njust.edu.cn/_t689/main.htm",
	    description="Made by jasongzy",
	    lastBuildDate=datetime.datetime.now(datetime.timezone.utc),
	    items=rssitem_list,
	)

	# Only (over)write the feed file when at least one notice was collected;
	# with no items the existing file is left untouched. The context manager
	# makes sure the file is flushed and closed.
	if rss.items:
	    with open("dg17.xml", "wb") as feed_file:
	        rss.write_xml(feed_file)