Skip to content

Instantly share code, notes, and snippets.

@prnake
Created October 30, 2022 07:15
Show Gist options
  • Save prnake/91a2d5ab06cd1c8f685b2cfa1ae114a8 to your computer and use it in GitHub Desktop.
Save prnake/91a2d5ab06cd1c8f685b2cfa1ae114a8 to your computer and use it in GitHub Desktop.
from lxml import etree
import requests
import html2text
from bs4 import BeautifulSoup
import codecs
def request_get(url):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Args:
        url: Address to request with HTTP GET.

    Returns:
        The response object produced by ``requests.get``.

    Raises:
        requests.exceptions.RequestException: Propagated unchanged from
            ``requests.get``; the request is issued with ``timeout=3``.
    """
    headers = {
        # The value must be the bare UA string: repeating the "User-Agent:"
        # field name inside the value sends a malformed header.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36',
    }
    return requests.get(url, headers=headers, timeout=3)
# Path separators, characters rejected in file names on common platforms,
# and control characters; each one is replaced with "_" in file names.
_UNSAFE_FILE_NAME_CHARS = str.maketrans(
    {ch: '_' for ch in '\\/:*?"<>|\0\r\n\t'}
)


def _safe_file_name(title):
    """Return *title* trimmed and with unsafe file-name characters replaced."""
    return title.strip().translate(_UNSAFE_FILE_NAME_CHARS)


def CrawlingItemBlog(base_url, id):
    """Download one blog article and save its body as a Markdown file.

    The article page is requested from ``base_url + 'article/details/' + id``.
    The element with id ``content_views`` is converted to Markdown and
    written to ``./mds/<title>.md``, where ``<title>`` is the text of the
    element with class ``title-article`` after file-name sanitising.
    Only the Markdown body is written; no front matter is emitted.

    Args:
        base_url: Blog base URL; it is concatenated directly with
            ``'article/details/'``, so it must end with ``/``.
        id: Article identifier appended to the URL (converted with ``str``).

    Returns:
        ``False`` when the response status is not 200 or the page lacks the
        title or content element. ``True`` otherwise, including when
        writing the file fails (the file name is printed in that case).
    """
    import os  # local import: only needed to create the output directory

    url = base_url + 'article/details/' + str(id)
    item_html = request_get(url)
    if item_html.status_code != 200:
        return False

    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed (and no "guessed parser" warning is
    # raised); lxml is already imported by this file.
    soup = BeautifulSoup(item_html.text, 'lxml')
    content = soup.find(id="content_views")
    title_article = soup.find(attrs={'class': 'title-article'})
    if content is None or title_article is None:
        # Status 200 but not an article page: nothing to convert.
        return False

    # The title becomes the file name; it comes from the remote page, so
    # strip anything that could break or redirect the output path.
    file_name = _safe_file_name(title_article.get_text())

    # Convert the article HTML to Markdown.
    text_maker = html2text.HTML2Text()
    text_maker.bypass_tables = False
    text = text_maker.handle(content.prettify())

    # Writing is best-effort: on failure the file name is reported and the
    # crawl of this article still counts as done.
    try:
        os.makedirs('./mds', exist_ok=True)
        with codecs.open('./mds/' + file_name + '.md', 'w', encoding='utf-8') as f:
            f.write(text)
    except (OSError, ValueError):
        print(file_name)
    return True
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment