import os
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from UI_Base_Page.basepage import BasePage
from save_excel import *
from config import *
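# Note: the two star imports above are assumed to provide everything this module relies on,
# i.e. all_essays_url and keywords from config, plus Workbook / load_workbook (openpyxl)
# and the load_existing_excel, get_max, write_excel and save_excel helpers from save_excel.
# If the actual names differ, adjust accordingly.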
def start():
    # Launch the browser
    driver = BasePage()
    driver.open_browser()
    return driver
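# BasePage (UI_Base_Page.basepage) is assumed to wrap a Selenium WebDriver and expose
# open_browser(), get_url(), get_outerHTML(locator) and get_page_source(); those are the
# only methods this module calls on it.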
# Note: this function overwrites any existing Excel file
def get_subject_list(driver):
    # Dictionary used to store the results
    categories_dict = defaultdict(dict)
    # Visit all_essays_url
    driver.get_url(all_essays_url)
    # Fetch the subject list
    subject_list_loc = (By.CLASS_NAME, 'subject-list')
    subject_list_html = driver.get_outerHTML(subject_list_loc)
    # Parse the HTML document
    soup = BeautifulSoup(subject_list_html, 'lxml')
    # Find the outermost ul tag
    subject_list = soup.find('ul', class_='subject-list')
    # Iterate over the direct li children of the ul
    for main_category in subject_list.find_all('li', recursive=False):
        main_category_link = main_category.find('a', class_='subject-list__link')
        main_category_name = main_category_link.get('title')
        # main_category_href = main_category_link.get('href')
        # Create or load the corresponding .xlsx file
        file_name = f"{main_category_name}.xlsx"
        if os.path.exists(f'save_path/{file_name}'):
            workbook = load_workbook(f'save_path/{file_name}')
        else:
            workbook = Workbook()
            workbook.remove(workbook.active)  # Remove the default sheet

        # Look for sub-categories
        sub_category_list = main_category.find('ul')
        if sub_category_list:
            for sub_category in sub_category_list.find_all('li'):
                sub_category_link = sub_category.find('a', class_='subject-list__link')
                sub_category_name = sub_category_link.get('title')
                sub_category_href = sub_category_link.get('href')
                # Truncate the worksheet name if needed (Excel allows at most 31 characters)
                sheet_name = sub_category_name if len(sub_category_name) <= 31 else sub_category_name[:28] + "..."
                # Check whether the sheet already exists
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                else:
                    sheet = workbook.create_sheet(title=sheet_name)
                    # Add the header row
                    sheet.append(["Sub Category", "Href"])
                # Append the sub-category data
                sheet.append([sub_category_name, sub_category_href])
                # Record the sub-category in the result dictionary
                categories_dict[main_category_name][sheet_name] = sub_category_href

        # Save the file
        workbook.save(f'save_path/{file_name}')
    return categories_dict
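# Assumed shape of the returned mapping (illustrative only):
# {main_category_name: {sheet_name: sub_category_href, ...}, ...}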
def get_article_detail_links(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all article tags
    articles = soup.find_all('article')
    # List used to collect every href link
    links = []
    # Walk through each article tag and extract its href links
    for article in articles:
        a_tags = article.find_all('a', href=True)
        for a_tag in a_tags:
            href = a_tag['href']
            links.append(href)
    return links
def get_article_url(driver, categories_dict, start_url=None):
    for main_category, sub_categories_dict in categories_dict.items():
        for sub_categories_name, sub_categories_href in sub_categories_dict.items():
            # Visit the sub-category listing page
            driver.get_url(sub_categories_href)
            try:
                # Get the pagination block
                page_loc = (By.XPATH, '/html/body/div[1]/main/div/div[3]/div/nav/div')
                page = driver.get_outerHTML(page_loc)
                # Parse the HTML content
                soup = BeautifulSoup(page, 'html.parser')
                # Find every a tag and extract its href attribute
                links = [a.get('href') for a in soup.find_all('a', href=True)]
                # TODO: handle the case where the initial count exceeds 120  # if max >= 120:

                # Collect all article detail links on the current page
                html = driver.get_page_source()
                article_detail_links = get_article_detail_links(html)
                if start_url is not None:
                    start_index = article_detail_links.index(start_url)
                    article_detail_links = article_detail_links[start_index + 1:]
                    start_url = None
                # Specific URL to drop
                url_to_remove = 'https://service.ivypanda.com/writing-help'
                # Drop every link containing '#' as well as the specific URL
                article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                # Print the filtered list
                print(article_detail_links)

                # Visit each link and scrape the essay
                for url in article_detail_links:
                    print('Flow 1')
                    print(url)
                    # Safeguard: skip the help page and anchor links (already filtered above)
                    if url == 'https://service.ivypanda.com/writing-help' or '#' in url:
                        continue
                    file_name = f'{main_category}.xlsx'
                    file_path = f'save_path/{file_name}'
                    # Truncate the worksheet name if needed
                    sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                    get_essays(driver, url, file_name, file_path, sheet_name)

                # Walk through the remaining pagination pages
                for link in links:
                    driver.get_url(link)
                    # Collect all article detail links on this page
                    html = driver.get_page_source()
                    article_detail_links = get_article_detail_links(html)
                    # Specific URL to drop
                    url_to_remove = 'https://service.ivypanda.com/writing-help'
                    # Drop every link containing '#' as well as the specific URL
                    article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                    # Print the filtered list
                    print(article_detail_links)
                    # Visit each link and scrape the essay
                    for url in article_detail_links:
                        file_name = f'{main_category}.xlsx'
                        file_path = f'save_path/{file_name}'
                        # Truncate the worksheet name if needed
                        sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                        get_essays(driver, url, file_name, file_path, sheet_name)
            except Exception:
                # Pagination block not found, so the sub-category has only one page
                # Collect all article detail links
                html = driver.get_page_source()
                article_detail_links = get_article_detail_links(html)
                if start_url is not None:
                    start_index = article_detail_links.index(start_url)
                    article_detail_links = article_detail_links[start_index + 1:]
                    start_url = None
                # Specific URL to drop
                url_to_remove = 'https://service.ivypanda.com/writing-help'
                # Drop every link containing '#' as well as the specific URL
                article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                # Print the filtered list
                print(article_detail_links)
                # Visit each link and scrape the essay
                for url in article_detail_links:
                    print('Flow 2')
                    print(url)
                    # Safeguard: skip the help page and anchor links (already filtered above)
                    if url == 'https://service.ivypanda.com/writing-help' or '#' in url:
                        continue
                    file_name = f'{main_category}.xlsx'
                    file_path = f'save_path/{file_name}'
                    # Truncate the worksheet name if needed
                    sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                    get_essays(driver, url, file_name, file_path, sheet_name)
def check_words(text):
    # Split the text into a list of words
    words = text.split()
    # Count the words
    word_count = len(words)
    # Return True when the word count is between 200 and 2000 (inclusive)
    return 200 <= word_count <= 2000
def get_essays(driver, url, file_name, file_path, sheet_name=None):
    try:
        # Visit the essay detail page
        driver.get_url(url)
        time.sleep(5)
        # Get the article content block (also contains the Table of Contents)
        table_of_contents_loc = (By.CLASS_NAME, 'article__content')
        html = driver.get_outerHTML(table_of_contents_loc)
        # Parse the HTML content
        soup = BeautifulSoup(html, 'lxml')
        # Strip out tableOfContentBody
        table_of_content_body = soup.find(id='tableOfContentBody')
        if table_of_content_body:
            table_of_content_body.decompose()
        # Extract every h2, p and li tag
        elements = soup.find_all(['h2', 'p', 'li'])
        # Accumulated essay text
        content = ""
        # Walk through the extracted elements in document order
        for element in elements:
            # Pull out the tag name and its text
            element_name = element.name
            element_text = element.text.strip()
            # Stop at the works-cited section (keywords comes from config)
            if any(keyword in element_text for keyword in keywords):
                break
            # Insert a blank line before each subtitle
            if element_name == 'h2':
                content += '\n'
            # Append the body text
            content += element_text + '\n'
        # Word-count check (between 200 and 2000 words)
        result = check_words(content)
        if result is True:
            # Write the essay into the workbook
            print(file_path, sheet_name)
            wb, ws = load_existing_excel(file_path, sheet_name)
            max_row, max_column = get_max(ws)
            print(max_row + 1, max_column)
            write_excel(ws, max_row + 1, 1, content)
            save_excel(wb, file_name)
            print('Saved successfully!')
        else:
            print('Word count out of range, skipping save!')
    except Exception:
        print('Save failed!')
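# Minimal usage sketch, not part of the original flow: it assumes all_essays_url in config
# points at the essay index page and that the 'save_path' directory already exists.
if __name__ == '__main__':
    driver = start()
    categories = get_subject_list(driver)
    get_article_url(driver, categories)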