Skip to content

Instantly share code, notes, and snippets.

@ttttmr
Last active December 30, 2020 13:53
Show Gist options
  • Save ttttmr/7521bdf1cb5dda3b4895f576e2385525 to your computer and use it in GitHub Desktop.
Save ttttmr/7521bdf1cb5dda3b4895f576e2385525 to your computer and use it in GitHub Desktop.
获取文章标题
import requests
from bs4 import BeautifulSoup
from collections import Counter
import jieba
import jieba.analyse
def _first_texts(items, limit=2):
    """Return the first ``limit`` non-empty cleaned texts from ``items``.

    Each item's ``.text`` is passed through ``clear_text`` (defined elsewhere
    in this file); items whose cleaned text is falsy are dropped.
    """
    texts = []
    for item in items:
        text = clear_text(item.text)
        if text:
            texts.append(text)
    return texts[:limit]


def _pick_best_titles(titles):
    """Choose up to two titles that contain the most top keywords.

    The three highest-ranked keywords are extracted from all candidate titles
    with ``jieba.analyse.extract_tags``; every title is then scored by how
    many of those keywords it contains. Titles that contain none of the
    keywords are not scored and can never be returned.

    ``titles`` must already be de-duplicated. Ties are broken by the order of
    ``titles`` because ``Counter.most_common`` keeps first-seen order.
    """
    top_tag = jieba.analyse.extract_tags("\n".join(titles), topK=3)
    print("top_tag: ", top_tag)

    # Count, per title, how many of the top keywords it contains.
    counter = Counter()
    for title in titles:
        for t in top_tag:
            if t in title:
                counter[title] += 1
    print("counter: ", counter)

    top_title = counter.most_common(2)
    if len(top_title) < 2:
        # Zero or one scored title: return whatever there is.
        return [title for title, _ in top_title]

    (first, first_hits), (second, second_hits) = top_title
    # One title is the other plus a prefix/suffix (e.g. "Post" vs
    # "Post - Site name"). A bare substring test was deliberately not used.
    is_affix = (
        first.startswith(second)
        or first.endswith(second)
        or second.startswith(first)
        or second.endswith(first)
    )
    if first_hits == second_hits and is_affix:
        # Same score and affix-related: keep only the shorter one.
        return [min(first, second, key=len)]
    return [first, second]


def get_titles(url):
    """Fetch ``url`` and return the most likely article title(s).

    Candidate titles are collected from the ``title``/``h1``/``h2`` tags and
    from elements with a few well-known title CSS classes (at most two
    non-empty texts per tag or class). If more than one distinct candidate
    remains, the candidates are ranked by keyword hits and at most two are
    returned.

    Args:
        url: Address of the page to fetch.

    Returns:
        A list of zero, one or two title strings. An empty list is returned
        when the response is not OK, when no candidate scores any keyword
        hit, or when any exception occurs (the exception is printed, not
        raised).
    """
    # Browser-like headers sent with the request.
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'none',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-dest': 'document',
        'accept-language': 'zh-CN,zh;q=0.9'
    }
    result = []
    try:
        response = requests.request("GET", url, headers=headers, timeout=5)
        if not response.ok:
            return result

        # Decode using the encoding detected from the body rather than the
        # one declared in the HTTP headers.
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect candidate titles: first by tag name, then by CSS class.
        titles = []
        for tag in ("title", "h1", "h2"):
            titles.extend(_first_texts(soup.find_all(tag)))
        for c in ("title", "content-title",
                  "DailyHeader-title", "question-title"):
            titles.extend(_first_texts(soup.find_all(attrs={"class": c})))
        print("all titles: ", titles)

        # De-duplicate while keeping document order. Going through set()
        # would make tie-breaking below depend on string hash randomization
        # and therefore differ between runs.
        seen = set()
        unique_titles = []
        for title in titles:
            if title not in seen:
                seen.add(title)
                unique_titles.append(title)

        if len(unique_titles) <= 1:
            return unique_titles

        # Several distinct candidates: pick the best-matching one(s).
        result = _pick_best_titles(unique_titles)
    except Exception as e:
        # Best-effort: report the problem and fall through to return
        # whatever has been gathered so far (normally an empty list).
        print(e)
    return result
beautifulsoup4==4.9.3
bs4==0.0.1
requests==2.25.1
jieba==0.42.1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment