
@YANGoio010
Created July 2, 2024 01:51
Python Web Scraping in Practice: How to Scrape Website Articles and Save Them to Excel
# config.py -- shared configuration used by the scripts below
all_essays_url = 'https://ivypanda.com/essays/all/'
excel_save_path = r"<enter the path where your Excel files should be saved>"
keywords = ['FAQ', 'Reference', 'Bibliography', 'Works Cited']  # sections to filter out
# Main crawler script (uses config.py above and save_excel.py at the end of this gist)
import os
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from UI_Base_Page.basepage import BasePage
from save_excel import *
from config import *


def start():
    # Launch the browser
    driver = BasePage()
    driver.open_browser()
    return driver

# Note: this function overwrites any Excel files that already exist
def get_subject_list(driver):
    # Dictionary for storing the results
    categories_dict = defaultdict(dict)
    # Open all_essays_url
    driver.get_url(all_essays_url)
    # Grab the subject list
    subject_list_loc = (By.CLASS_NAME, 'subject-list')
    subject_list_html = driver.get_outerHTML(subject_list_loc)
    # Parse the HTML document
    soup = BeautifulSoup(subject_list_html, 'lxml')
    # Find the outermost ul tag
    subject_list = soup.find('ul', class_='subject-list')
    # Iterate over the top-level li tags inside the ul
    for main_category in subject_list.find_all('li', recursive=False):
        main_category_link = main_category.find('a', class_='subject-list__link')
        main_category_name = main_category_link.get('title')
        # main_category_href = main_category_link.get('href')
        # Create or load the corresponding .xlsx file
        file_name = f"{main_category_name}.xlsx"
        if os.path.exists(f'save_path/{file_name}'):
            workbook = load_workbook(f'save_path/{file_name}')
        else:
            workbook = Workbook()
            workbook.remove(workbook.active)  # remove the default sheet
        # Look for sub-categories
        sub_category_list = main_category.find('ul')
        if sub_category_list:
            for sub_category in sub_category_list.find_all('li'):
                sub_category_link = sub_category.find('a', class_='subject-list__link')
                sub_category_name = sub_category_link.get('title')
                sub_category_href = sub_category_link.get('href')
                # Truncate the worksheet name if needed (Excel caps sheet names at 31 characters)
                sheet_name = sub_category_name if len(sub_category_name) <= 31 else sub_category_name[:28] + "..."
                # Check whether the sheet already exists
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                else:
                    sheet = workbook.create_sheet(title=sheet_name)
                    # Add the header row
                    sheet.append(["Sub Category", "Href"])
                # Append the sub-category data
                sheet.append([sub_category_name, sub_category_href])
                # Record the sub-category in the dictionary
                categories_dict[main_category_name][sheet_name] = sub_category_href
        # Save the file
        workbook.save(f'save_path/{file_name}')
    return categories_dict

def get_article_detail_links(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all article tags
    articles = soup.find_all('article')
    # List for storing all href links
    links = []
    # Walk through every article tag and extract its href links
    for article in articles:
        a_tags = article.find_all('a', href=True)
        for a_tag in a_tags:
            href = a_tag['href']
            links.append(href)
    return links

def get_article_url(driver, categories_dict, start_url=None):
    for main_category, sub_categories_dict in categories_dict.items():
        for sub_categories_name, sub_categories_href in sub_categories_dict.items():
            # Open the sub-category listing page
            driver.get_url(sub_categories_href)
            try:
                # Get the pagination block
                page_loc = (By.XPATH, '/html/body/div[1]/main/div/div[3]/div/nav/div')
                page = driver.get_outerHTML(page_loc)
                # Parse the HTML content
                soup = BeautifulSoup(page, 'html.parser')
                # Find all a tags and extract their href attributes (links to the remaining pages)
                links = [a.get('href') for a in soup.find_all('a', href=True)]
                # TODO: handle the case where the initial number of results is greater than 120  # if max >= 120:
                # Get all article detail links on the first page
                html = driver.get_page_source()
                article_detail_links = get_article_detail_links(html)
                if start_url is not None:
                    start_index = article_detail_links.index(start_url)
                    article_detail_links = article_detail_links[start_index + 1:]
                    start_url = None
                # Specific URL that should be removed
                url_to_remove = 'https://service.ivypanda.com/writing-help'
                # Drop every link containing '#' as well as the specific URL
                article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                # Print the filtered list
                print(article_detail_links)
                # Visit each link and scrape the essay
                for url in article_detail_links:
                    print('Step 1')
                    print(url)
                    # Skip service links and anchors (already filtered above, kept as a safeguard)
                    if url == 'https://service.ivypanda.com/writing-help' or '#' in url:
                        continue
                    file_name = f'{main_category}.xlsx'
                    file_path = f'save_path/{file_name}'
                    # Truncate the worksheet name if needed
                    sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                    get_essays(driver, url, file_name, file_path, sheet_name)
                # Visit every remaining pagination page
                for link in links:
                    driver.get_url(link)
                    # Get all article detail links on this page
                    html = driver.get_page_source()
                    article_detail_links = get_article_detail_links(html)
                    # Specific URL that should be removed
                    url_to_remove = 'https://service.ivypanda.com/writing-help'
                    # Drop every link containing '#' as well as the specific URL
                    article_detail_links = [url for url in article_detail_links
                                            if '#' not in url and url != url_to_remove]
                    # Print the filtered list
                    print(article_detail_links)
                    # Visit each link and scrape the essay
                    for url in article_detail_links:
                        file_name = f'{main_category}.xlsx'
                        file_path = f'save_path/{file_name}'
                        # Truncate the worksheet name if needed
                        sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                        get_essays(driver, url, file_name, file_path, sheet_name)
            except Exception:
                # No pagination block was found (or it failed to load): scrape just the current listing page
                html = driver.get_page_source()
                article_detail_links = get_article_detail_links(html)
                if start_url is not None:
                    start_index = article_detail_links.index(start_url)
                    article_detail_links = article_detail_links[start_index + 1:]
                    start_url = None
                # Specific URL that should be removed
                url_to_remove = 'https://service.ivypanda.com/writing-help'
                # Drop every link containing '#' as well as the specific URL
                article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                # Print the filtered list
                print(article_detail_links)
                # Visit each link and scrape the essay
                for url in article_detail_links:
                    print('Step 2')
                    print(url)
                    # Skip service links and anchors (already filtered above, kept as a safeguard)
                    if url == 'https://service.ivypanda.com/writing-help' or '#' in url:
                        continue
                    file_name = f'{main_category}.xlsx'
                    file_path = f'save_path/{file_name}'
                    # Truncate the worksheet name if needed
                    sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                    get_essays(driver, url, file_name, file_path, sheet_name)

def check_words(text):
    # Split the text into a list of words
    words = text.split()
    # Count the words
    word_count = len(words)
    # The word count must be between 200 and 2000
    return 200 <= word_count <= 2000

def get_essays(driver, url, file_name, file_path, sheet_name=None):
    try:
        # Open the essay detail page
        driver.get_url(url)
        time.sleep(5)
        # Grab the article content block
        table_of_contents_loc = (By.CLASS_NAME, 'article__content')
        html = driver.get_outerHTML(table_of_contents_loc)
        # Parse the HTML content
        soup = BeautifulSoup(html, 'lxml')
        # Strip out tableOfContentBody
        table_of_content_body = soup.find(id='tableOfContentBody')
        if table_of_content_body:
            table_of_content_body.decompose()
        # Extract all h2, p and li tags
        elements = soup.find_all(['h2', 'p', 'li'])
        # Accumulate the article text here
        content = ""
        # Walk through the extracted elements in order
        for element in elements:
            # Pull out the tag name and its text
            element_name = element.name
            element_text = element.text.strip()
            # Stop at FAQ / reference / works-cited sections
            if any(keyword in element_text for keyword in keywords):
                break
            # Insert a blank line before each subtitle
            if element_name == 'h2':
                content += '\n'
            # Append to the body text
            content += element_text + '\n'
        # Word-count check on the body (more than 200 and fewer than 2000 words)
        result = check_words(content)
        if result is True:
            # Write the essay into the spreadsheet
            print(file_path, sheet_name)
            wb, ws = load_existing_excel(file_path, sheet_name)
            max_row, max_column = get_max(ws)
            print(max_row + 1, max_column)
            write_excel(ws, max_row + 1, 1, content)
            save_excel(wb, file_name)
            print('Saved successfully!')
        else:
            print('Word count out of range, skipping this essay!')
    except Exception:
        print('Save failed!')

Requirements:

  • Scrape the site's articles and save them to Excel

Constraints:

  • Only save articles with a word count between 200 and 2000
  • Filter out sections such as ['FAQ', 'Reference', 'Bibliography', 'Works Cited'] (a minimal sketch of this filtering follows the list)
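
A minimal, self-contained sketch of the two constraints above. The standalone filter_essay helper and its signature are illustrative, not part of the original script, which applies the same checks inside get_essays and check_words:

    def filter_essay(text, keywords=('FAQ', 'Reference', 'Bibliography', 'Works Cited')):
        # Drop everything from the first filtered section (FAQ, references, works cited) onwards
        kept_lines = []
        for line in text.splitlines():
            if any(keyword in line for keyword in keywords):
                break
            kept_lines.append(line)
        body = '\n'.join(kept_lines)
        # Keep the essay only if its word count falls in the 200-2000 range
        return body if 200 <= len(body.split()) <= 2000 else None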

Author's note:

The script is rough in many places and not heavily optimized. Crawler scripts are essentially one-off: every website has its own page structure and data format, so each new target site means rewriting the code. Development therefore prioritized speed over polish.

I hope the code gives those of you learning web scraping some fresh ideas and inspiration.

Third-party libraries the code depends on:

  1. Install selenium:

    pip install selenium
  2. Install BeautifulSoup:

    pip install beautifulsoup4
  3. Install openpyxl:

    pip install openpyxl
  4. Install lxml (BeautifulSoup is invoked with the 'lxml' parser):

    pip install lxml

(defaultdict needs no installation; it ships with Python's standard-library collections module.)
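
For convenience, all of the above can also be installed in one command:

    pip install selenium beautifulsoup4 openpyxl lxml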

if __name__ == '__main__':
    driver = start()
    # categories_dict = get_subject_list(driver)  # Note: running this overwrites every previously saved essay
    # print(categories_dict)  # Copy the extracted dictionary out and split it across several scripts to run multiple crawlers in parallel
    categories_dict = {
        'Sociology': {
            'Animal Rights': 'https://ivypanda.com/essays/subject/animal-rights/',
            'Communications': 'https://ivypanda.com/essays/subject/communications/',
            'Drug and Alcohol Addiction': 'https://ivypanda.com/essays/subject/drug-and-alcohol-addiction/',
            'Ethics': 'https://ivypanda.com/essays/subject/ethics/',
            'Everyday Interactions': 'https://ivypanda.com/essays/subject/everyday-interactions/',
            'Feminism': 'https://ivypanda.com/essays/subject/feminism/',
            'Gender Identity': 'https://ivypanda.com/essays/subject/gender-identity/',
            'Gender Inequality': 'https://ivypanda.com/essays/subject/gender-inequality/',
            'Gender Studies': 'https://ivypanda.com/essays/subject/gender-studies/',
            'Human Rights': 'https://ivypanda.com/essays/subject/human-rights/',
            'Identity': 'https://ivypanda.com/essays/subject/identity/',
            'Immigration': 'https://ivypanda.com/essays/subject/immigration/',
            'Overcoming Difficulties': 'https://ivypanda.com/essays/subject/overcoming-difficulties/',
            'Overpopulation': 'https://ivypanda.com/essays/subject/overpopulation/',
            'Poverty': 'https://ivypanda.com/essays/subject/poverty/',
            'Racism': 'https://ivypanda.com/essays/subject/racism/',
            'Rhetoric': 'https://ivypanda.com/essays/subject/rhetoric/',
            'Slavery': 'https://ivypanda.com/essays/subject/slavery/',
            'Social Movements': 'https://ivypanda.com/essays/subject/social-movements/',
            'Socialization': 'https://ivypanda.com/essays/subject/socialization/',
            "Society's Imperfections": 'https://ivypanda.com/essays/subject/societys-imperfections/',
            'Sociological Issues': 'https://ivypanda.com/essays/subject/sociological-issues/',
            'Sociological Theories': 'https://ivypanda.com/essays/subject/sociological-theories/',
            'Violence': 'https://ivypanda.com/essays/subject/violence/',
        },
    }
    # start_url is a fault-tolerance hook added while debugging: the first listing page holds only 120 entries,
    # so if the script failed within those 120 you could pass in the URL where it stopped and resume right after it.
    # After the filtering logic was improved the script almost never fails, so this hook should no longer be needed.
    # start_url = 'https://ivypanda.com/essays/torts-and-crimes-liability-for-traffic-accidents/'
    get_article_url(driver, categories_dict, start_url=None)
    driver.quit()
# save_excel.py -- Excel helper functions
from openpyxl import Workbook, load_workbook
from config import excel_save_path
import openpyxl


def create_excel():
    # Create a new workbook and worksheet
    wb = Workbook()
    ws = wb.active
    return wb, ws


def load_existing_excel(file_path, sheet_name=None):
    """
    Load an existing Excel file and get the requested worksheet.
    :param file_path: path of the existing Excel file.
    :param sheet_name: worksheet name; if None, the active worksheet is returned.
    :return: the workbook (wb) and worksheet (ws) objects.
    """
    try:
        # Load the existing Excel file
        wb = load_workbook(file_path)
        # Get the requested worksheet, or the active one if no name was given
        if sheet_name:
            ws = wb[sheet_name]
        else:
            ws = wb.active
        return wb, ws
    except Exception as e:
        print(f"Error while loading the Excel file: {e}")
        return None, None


def write_excel(ws, row, col, value):
    # Write the value into the given cell
    ws.cell(row=row, column=col).value = value


def save_excel(wb, file_name):
    # Save the workbook to disk
    wb.save(f'{excel_save_path}/{file_name}')


def get_max_row(categories_dict):
    # Walk through the category dictionary
    for main_category, sub_categories_dict in categories_dict.items():
        file_path = excel_save_path + rf"\{main_category}.xlsx"
        try:
            # Try to open the workbook
            workbook = openpyxl.load_workbook(file_path)
        except FileNotFoundError:
            print(f"File {file_path} not found.")
            continue
        # Go through the sub-categories and get the max row of the first matching sheet
        for sub_category_name, _ in sub_categories_dict.items():
            if sub_category_name in workbook.sheetnames:
                sheet = workbook[sub_category_name]
                return sheet.max_row - 2


# Get the maximum row and column counts of a worksheet
def get_max(ws):
    max_row = ws.max_row
    max_column = ws.max_column
    return max_row, max_column
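
One caveat for anyone trying to run this: the gist does not include UI_Base_Page.basepage, so BasePage and the methods the crawler calls (open_browser, get_url, get_outerHTML, get_page_source, quit) have to be supplied by the reader. A rough stand-in built on plain Selenium might look like the sketch below; the class body, the Chrome driver choice, and the 10-second wait are assumptions, not the author's actual implementation.

    # Hypothetical replacement for UI_Base_Page.basepage.BasePage
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC


    class BasePage:
        def open_browser(self):
            # Launch a local Chrome instance
            self.driver = webdriver.Chrome()

        def get_url(self, url):
            self.driver.get(url)

        def get_outerHTML(self, locator, timeout=10):
            # Wait for the element located by the (By, value) tuple and return its outerHTML
            element = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located(locator))
            return element.get_attribute('outerHTML')

        def get_page_source(self):
            return self.driver.page_source

        def quit(self):
            self.driver.quit()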