
@YANGoio010
Created July 2, 2024 01:51
Python Web Scraping in Practice: How to Scrape Website Articles and Save Them to Excel
# config.py -- shared configuration used by the scripts below
all_essays_url = 'https://ivypanda.com/essays/all/'
excel_save_path = r"<enter the path where your Excel files should be saved>"
keywords = ['FAQ', 'Reference', 'Bibliography', 'Works Cited']  # sections to filter out
# Main crawler script (uses config.py above and save_excel.py at the end of this gist)
import os
import time
from collections import defaultdict

from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

from UI_Base_Page.basepage import BasePage
from save_excel import *
from config import *


def start():
    # Launch the browser
    driver = BasePage()
    driver.open_browser()
    return driver

# Note: this function overwrites any Excel files that already exist
def get_subject_list(driver):
    # Dictionary for storing the results
    categories_dict = defaultdict(dict)
    # Open all_essays_url
    driver.get_url(all_essays_url)
    # Grab the subject list
    subject_list_loc = (By.CLASS_NAME, 'subject-list')
    subject_list_html = driver.get_outerHTML(subject_list_loc)
    # Parse the HTML document
    soup = BeautifulSoup(subject_list_html, 'lxml')
    # Find the outermost ul tag
    subject_list = soup.find('ul', class_='subject-list')
    # Iterate over the top-level li tags inside the ul
    for main_category in subject_list.find_all('li', recursive=False):
        main_category_link = main_category.find('a', class_='subject-list__link')
        main_category_name = main_category_link.get('title')
        # main_category_href = main_category_link.get('href')
        # Create or load the corresponding .xlsx file
        file_name = f"{main_category_name}.xlsx"
        if os.path.exists(f'save_path/{file_name}'):
            workbook = load_workbook(f'save_path/{file_name}')
        else:
            workbook = Workbook()
            workbook.remove(workbook.active)  # remove the default sheet
        # Look for sub-categories
        sub_category_list = main_category.find('ul')
        if sub_category_list:
            for sub_category in sub_category_list.find_all('li'):
                sub_category_link = sub_category.find('a', class_='subject-list__link')
                sub_category_name = sub_category_link.get('title')
                sub_category_href = sub_category_link.get('href')
                # Truncate the worksheet name if needed (Excel caps sheet names at 31 characters)
                sheet_name = sub_category_name if len(sub_category_name) <= 31 else sub_category_name[:28] + "..."
                # Check whether the sheet already exists
                if sheet_name in workbook.sheetnames:
                    sheet = workbook[sheet_name]
                else:
                    sheet = workbook.create_sheet(title=sheet_name)
                    # Add the header row
                    sheet.append(["Sub Category", "Href"])
                # Append the sub-category data
                sheet.append([sub_category_name, sub_category_href])
                # Record the sub-category in the dictionary
                categories_dict[main_category_name][sheet_name] = sub_category_href
        # Save the file
        workbook.save(f'save_path/{file_name}')
    return categories_dict

def get_article_detail_links(html_content):
    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')
    # Find all article tags
    articles = soup.find_all('article')
    # List for storing all href links
    links = []
    # Walk through every article tag and extract its href links
    for article in articles:
        a_tags = article.find_all('a', href=True)
        for a_tag in a_tags:
            href = a_tag['href']
            links.append(href)
    return links

def get_article_url(driver, categories_dict, start_url=None):
    for main_category, sub_categories_dict in categories_dict.items():
        for sub_categories_name, sub_categories_href in sub_categories_dict.items():
            # Open the sub-category listing page
            driver.get_url(sub_categories_href)
            try:
                # Get the pagination block
                page_loc = (By.XPATH, '/html/body/div[1]/main/div/div[3]/div/nav/div')
                page = driver.get_outerHTML(page_loc)
                # Parse the HTML content
                soup = BeautifulSoup(page, 'html.parser')
                # Find all a tags and extract their href attributes (links to the remaining pages)
                links = [a.get('href') for a in soup.find_all('a', href=True)]
                # TODO: handle the case where the initial number of results is greater than 120  # if max >= 120:
                # Get all article detail links on the first page
                html = driver.get_page_source()
                article_detail_links = get_article_detail_links(html)
                if start_url is not None:
                    start_index = article_detail_links.index(start_url)
                    article_detail_links = article_detail_links[start_index + 1:]
                    start_url = None
                # Specific URL that should be removed
                url_to_remove = 'https://service.ivypanda.com/writing-help'
                # Drop every link containing '#' as well as the specific URL
                article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                # Print the filtered list
                print(article_detail_links)
                # Visit each link and scrape the essay
                for url in article_detail_links:
                    print('Step 1')
                    print(url)
                    # Skip service links and anchors (already filtered above, kept as a safeguard)
                    if url == 'https://service.ivypanda.com/writing-help' or '#' in url:
                        continue
                    file_name = f'{main_category}.xlsx'
                    file_path = f'save_path/{file_name}'
                    # Truncate the worksheet name if needed
                    sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                    get_essays(driver, url, file_name, file_path, sheet_name)
                # Visit every remaining pagination page
                for link in links:
                    driver.get_url(link)
                    # Get all article detail links on this page
                    html = driver.get_page_source()
                    article_detail_links = get_article_detail_links(html)
                    # Specific URL that should be removed
                    url_to_remove = 'https://service.ivypanda.com/writing-help'
                    # Drop every link containing '#' as well as the specific URL
                    article_detail_links = [url for url in article_detail_links
                                            if '#' not in url and url != url_to_remove]
                    # Print the filtered list
                    print(article_detail_links)
                    # Visit each link and scrape the essay
                    for url in article_detail_links:
                        file_name = f'{main_category}.xlsx'
                        file_path = f'save_path/{file_name}'
                        # Truncate the worksheet name if needed
                        sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                        get_essays(driver, url, file_name, file_path, sheet_name)
            except Exception:
                # No pagination block was found (or it failed to load): scrape just the current listing page
                html = driver.get_page_source()
                article_detail_links = get_article_detail_links(html)
                if start_url is not None:
                    start_index = article_detail_links.index(start_url)
                    article_detail_links = article_detail_links[start_index + 1:]
                    start_url = None
                # Specific URL that should be removed
                url_to_remove = 'https://service.ivypanda.com/writing-help'
                # Drop every link containing '#' as well as the specific URL
                article_detail_links = [url for url in article_detail_links if '#' not in url and url != url_to_remove]
                # Print the filtered list
                print(article_detail_links)
                # Visit each link and scrape the essay
                for url in article_detail_links:
                    print('Step 2')
                    print(url)
                    # Skip service links and anchors (already filtered above, kept as a safeguard)
                    if url == 'https://service.ivypanda.com/writing-help' or '#' in url:
                        continue
                    file_name = f'{main_category}.xlsx'
                    file_path = f'save_path/{file_name}'
                    # Truncate the worksheet name if needed
                    sheet_name = sub_categories_name if len(sub_categories_name) <= 31 else sub_categories_name[:28] + "..."
                    get_essays(driver, url, file_name, file_path, sheet_name)

def check_words(text):
    # Split the text into a list of words
    words = text.split()
    # Count the words
    word_count = len(words)
    # The word count must be between 200 and 2000
    return 200 <= word_count <= 2000

def get_essays(driver, url, file_name, file_path, sheet_name=None):
    try:
        # Open the essay detail page
        driver.get_url(url)
        time.sleep(5)
        # Grab the article content block
        table_of_contents_loc = (By.CLASS_NAME, 'article__content')
        html = driver.get_outerHTML(table_of_contents_loc)
        # Parse the HTML content
        soup = BeautifulSoup(html, 'lxml')
        # Strip out tableOfContentBody
        table_of_content_body = soup.find(id='tableOfContentBody')
        if table_of_content_body:
            table_of_content_body.decompose()
        # Extract all h2, p and li tags
        elements = soup.find_all(['h2', 'p', 'li'])
        # Accumulate the article text here
        content = ""
        # Walk through the extracted elements in order
        for element in elements:
            # Pull out the tag name and its text
            element_name = element.name
            element_text = element.text.strip()
            # Stop at FAQ / reference / works-cited sections
            if any(keyword in element_text for keyword in keywords):
                break
            # Insert a blank line before each subtitle
            if element_name == 'h2':
                content += '\n'
            # Append to the body text
            content += element_text + '\n'
        # Word-count check on the body (more than 200 and fewer than 2000 words)
        result = check_words(content)
        if result is True:
            # Write the essay into the spreadsheet
            print(file_path, sheet_name)
            wb, ws = load_existing_excel(file_path, sheet_name)
            max_row, max_column = get_max(ws)
            print(max_row + 1, max_column)
            write_excel(ws, max_row + 1, 1, content)
            save_excel(wb, file_name)
            print('Saved successfully!')
        else:
            print('Word count out of range, skipping this essay!')
    except Exception:
        print('Save failed!')

Requirements:

  • Scrape the site's articles and save them to Excel

Constraints:

  • Only save articles with a word count between 200 and 2000
  • Filter out sections such as ['FAQ', 'Reference', 'Bibliography', 'Works Cited'] (a minimal sketch of this filtering follows the list)
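
A minimal, self-contained sketch of the two constraints above. The standalone filter_essay helper and its signature are illustrative, not part of the original script, which applies the same checks inside get_essays and check_words:

    def filter_essay(text, keywords=('FAQ', 'Reference', 'Bibliography', 'Works Cited')):
        # Drop everything from the first filtered section (FAQ, references, works cited) onwards
        kept_lines = []
        for line in text.splitlines():
            if any(keyword in line for keyword in keywords):
                break
            kept_lines.append(line)
        body = '\n'.join(kept_lines)
        # Keep the essay only if its word count falls in the 200-2000 range
        return body if 200 <= len(body.split()) <= 2000 else None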

Author's note:

The script is rough in many places and not heavily optimized. Crawler scripts are essentially one-off: every website has its own page structure and data format, so each new target site means rewriting the code. Development therefore prioritized speed over polish.

I hope the code gives those of you learning web scraping some fresh ideas and inspiration.

Third-party libraries the code depends on:

  1. Install selenium:

    pip install selenium
  2. Install BeautifulSoup:

    pip install beautifulsoup4
  3. Install openpyxl:

    pip install openpyxl
  4. Install lxml (BeautifulSoup is invoked with the 'lxml' parser):

    pip install lxml

(defaultdict needs no installation; it ships with Python's standard-library collections module.)
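
For convenience, all of the above can also be installed in one command:

    pip install selenium beautifulsoup4 openpyxl lxml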

if __name__ == '__main__':
    driver = start()
    # categories_dict = get_subject_list(driver)  # Note: running this overwrites every previously saved essay
    # print(categories_dict)  # Copy the extracted dictionary out and split it across several scripts to run multiple crawlers in parallel
    categories_dict = {
        'Sociology': {
            'Animal Rights': 'https://ivypanda.com/essays/subject/animal-rights/',
            'Communications': 'https://ivypanda.com/essays/subject/communications/',
            'Drug and Alcohol Addiction': 'https://ivypanda.com/essays/subject/drug-and-alcohol-addiction/',
            'Ethics': 'https://ivypanda.com/essays/subject/ethics/',
            'Everyday Interactions': 'https://ivypanda.com/essays/subject/everyday-interactions/',
            'Feminism': 'https://ivypanda.com/essays/subject/feminism/',
            'Gender Identity': 'https://ivypanda.com/essays/subject/gender-identity/',
            'Gender Inequality': 'https://ivypanda.com/essays/subject/gender-inequality/',
            'Gender Studies': 'https://ivypanda.com/essays/subject/gender-studies/',
            'Human Rights': 'https://ivypanda.com/essays/subject/human-rights/',
            'Identity': 'https://ivypanda.com/essays/subject/identity/',
            'Immigration': 'https://ivypanda.com/essays/subject/immigration/',
            'Overcoming Difficulties': 'https://ivypanda.com/essays/subject/overcoming-difficulties/',
            'Overpopulation': 'https://ivypanda.com/essays/subject/overpopulation/',
            'Poverty': 'https://ivypanda.com/essays/subject/poverty/',
            'Racism': 'https://ivypanda.com/essays/subject/racism/',
            'Rhetoric': 'https://ivypanda.com/essays/subject/rhetoric/',
            'Slavery': 'https://ivypanda.com/essays/subject/slavery/',
            'Social Movements': 'https://ivypanda.com/essays/subject/social-movements/',
            'Socialization': 'https://ivypanda.com/essays/subject/socialization/',
            "Society's Imperfections": 'https://ivypanda.com/essays/subject/societys-imperfections/',
            'Sociological Issues': 'https://ivypanda.com/essays/subject/sociological-issues/',
            'Sociological Theories': 'https://ivypanda.com/essays/subject/sociological-theories/',
            'Violence': 'https://ivypanda.com/essays/subject/violence/',
        },
    }
    # start_url is a fault-tolerance hook added while debugging: the first listing page holds only 120 entries,
    # so if the script failed within those 120 you could pass in the URL where it stopped and resume right after it.
    # After the filtering logic was improved the script almost never fails, so this hook should no longer be needed.
    # start_url = 'https://ivypanda.com/essays/torts-and-crimes-liability-for-traffic-accidents/'
    get_article_url(driver, categories_dict, start_url=None)
    driver.quit()
# save_excel.py -- Excel helper functions
from openpyxl import Workbook, load_workbook
from config import excel_save_path
import openpyxl


def create_excel():
    # Create a new workbook and worksheet
    wb = Workbook()
    ws = wb.active
    return wb, ws


def load_existing_excel(file_path, sheet_name=None):
    """
    Load an existing Excel file and get the requested worksheet.
    :param file_path: path of the existing Excel file.
    :param sheet_name: worksheet name; if None, the active worksheet is returned.
    :return: the workbook (wb) and worksheet (ws) objects.
    """
    try:
        # Load the existing Excel file
        wb = load_workbook(file_path)
        # Get the requested worksheet, or the active one if no name was given
        if sheet_name:
            ws = wb[sheet_name]
        else:
            ws = wb.active
        return wb, ws
    except Exception as e:
        print(f"Error while loading the Excel file: {e}")
        return None, None


def write_excel(ws, row, col, value):
    # Write the value into the given cell
    ws.cell(row=row, column=col).value = value


def save_excel(wb, file_name):
    # Save the workbook to disk
    wb.save(f'{excel_save_path}/{file_name}')


def get_max_row(categories_dict):
    # Walk through the category dictionary
    for main_category, sub_categories_dict in categories_dict.items():
        file_path = excel_save_path + rf"\{main_category}.xlsx"
        try:
            # Try to open the workbook
            workbook = openpyxl.load_workbook(file_path)
        except FileNotFoundError:
            print(f"File {file_path} not found.")
            continue
        # Go through the sub-categories and get the max row of the first matching sheet
        for sub_category_name, _ in sub_categories_dict.items():
            if sub_category_name in workbook.sheetnames:
                sheet = workbook[sub_category_name]
                return sheet.max_row - 2


# Get the maximum row and column counts of a worksheet
def get_max(ws):
    max_row = ws.max_row
    max_column = ws.max_column
    return max_row, max_column
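
One caveat for anyone trying to run this: the gist does not include UI_Base_Page.basepage, so BasePage and the methods the crawler calls (open_browser, get_url, get_outerHTML, get_page_source, quit) have to be supplied by the reader. A rough stand-in built on plain Selenium might look like the sketch below; the class body, the Chrome driver choice, and the 10-second wait are assumptions, not the author's actual implementation.

    # Hypothetical replacement for UI_Base_Page.basepage.BasePage
    from selenium import webdriver
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC


    class BasePage:
        def open_browser(self):
            # Launch a local Chrome instance
            self.driver = webdriver.Chrome()

        def get_url(self, url):
            self.driver.get(url)

        def get_outerHTML(self, locator, timeout=10):
            # Wait for the element located by the (By, value) tuple and return its outerHTML
            element = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located(locator))
            return element.get_attribute('outerHTML')

        def get_page_source(self):
            return self.driver.page_source

        def quit(self):
            self.driver.quit()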