import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
from pymongo import MongoClient

host = 'localhost'
dbname = '<'  # database name placeholder; replace with your own
client = MongoClient(host, 27017)
print('Database connection established!')

db = client[dbname]
article_collection = db.article
response_collection = db.response

def crawl_list():
    """Crawl the article list pages."""
    # Fetch pages 1 to 10
    for page in range(1, 11):
        html_doc = requests.get(f'https://ithelp.ithome.com.tw/articles?tab=tech&page={page}').text
        soup = BeautifulSoup(html_doc, 'lxml')

        # Locate the article blocks first
        article_tags = soup.find_all('div', class_='qa-list')

        # No articles on this page
        if len(article_tags) == 0:
            # Break out of the paging loop (or exit the program)
            print('No more articles!')
            break

        for article_tag in article_tags:
            # Then find the article link inside each block
            title_tag = article_tag.find('a', class_='qa-list__title-link')
            article_url = title_tag['href']
            crawl_content(article_url)

def crawl_content(url):
    """Crawl an article page.

    :param url: article URL
    """
    html_doc = requests.get(url).text
    soup = BeautifulSoup(html_doc, 'lxml')

    leftside = soup.find('div', class_='leftside')
    original_post = leftside.find('div', class_='qa-panel')
    article_header = original_post.find('div', class_='qa-header')

    def is_info_tag(css_class):
        return css_class == 'ir-article-info__content' or css_class == 'qa-header__info'
    article_info = article_header.find('div', class_=is_info_tag)

    # Title
    title = article_header.find('h2', class_='qa-header__title').get_text(strip=True)

    # Author
    def is_author_tag(css_class):
        return css_class == 'ir-article-info__name' or css_class == 'qa-header__info-person'
    author = article_info.find('a', class_=is_author_tag).get_text(strip=True)

    # Publish time
    def is_time_tag(css_class):
        return css_class == 'ir-article-info__time' or css_class == 'qa-header__info-time'
    published_time_str = article_info.find('a', class_=is_time_tag).get_text(strip=True)
    published_time = datetime.strptime(published_time_str, '%Y-%m-%d %H:%M:%S')

    # Article tags
    tag_group = article_header.find('div', class_='qa-header__tagGroup')
    tags_element = tag_group.find_all('a', class_='tag')
    tags = [tag_element.get_text(strip=True) for tag_element in tags_element]

    # Body
    content = original_post.find('div', class_='markdown__style').get_text(strip=True)

    # View count
    def is_view_count_tag(css_class):
        return css_class == 'ir-article-info__view' or css_class == 'qa-header__info-view'
    view_count_str = article_info.find(class_=is_view_count_tag).get_text(strip=True)
    view_count = int(re.search(r'(\d+).*', view_count_str).group(1))

    article = {
        'url': url,
        'title': title,
        'author': author,
        'publish_time': published_time,
        'tags': ','.join(tags),
        'content': content,
        'view_count': view_count
    }

    article_id = insert_article(article)
    crawl_response(article_id, soup)

def crawl_response(article_id, soup):
    """Crawl the responses to an article.

    :param article_id: article ID, used as a foreign key
    :param soup: the original post and its responses are on the same page,
                 so the article's soup object is reused here for convenience
    """
    leftside = soup.find('div', class_='leftside')
    responses = leftside.find_all('div', class_='response')

    results = []
    for response in responses:
        panel = response.find('div', class_='qa-panel__content')
        header = panel.find('div', class_='response-header__info')

        result = {'article_id': article_id}

        # Response ID
        result['_id'] = int(response.find('a')['name'].replace('response-', ''))

        # Response author
        result['author'] = header.find('a', class_='response-header__person').get_text(strip=True)

        # Response time
        time_str = header.find('a', class_='ans-header__time').get_text(strip=True)
        result['publish_time'] = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')

        # Response content
        result['content'] = panel.find('div', class_='markdown__style').get_text(strip=True)

        results.append(result)

    insert_responses(results)

def insert_article(article):
    """Insert an article into the database.

    :param article: article data
    :return: article ID
    :rtype: ObjectId
    """
    # Check whether a document with the same URL already exists
    doc = article_collection.find_one({'url': article['url']})

    article['update_time'] = datetime.now()

    if not doc:
        # Insert if it does not exist yet
        article_id = article_collection.insert_one(article).inserted_id
    else:
        # Update if it already exists
        article_collection.update_one(
            {'_id': doc['_id']},
            {'$set': article}
        )
        article_id = doc['_id']

    print(f'[{article["title"]}] saved!')
    return article_id

def insert_responses(responses):
    """Insert responses into the database.

    :param responses: response data
    """
    for response in responses:
        # Upsert keyed by the site's response ID, so re-crawls update
        # existing responses instead of inserting duplicates
        response_collection.update_one(
            {'_id': response['_id']},
            {'$set': response},
            upsert=True
        )

if __name__ == '__main__':
    crawl_list()
    client.close()
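
After a run, a quick way to check what ended up in MongoDB is to query the two collections directly. The snippet below is a minimal sketch, assuming the same localhost instance and the article/response collection names used above; replace the placeholder database name with whatever you set in dbname.

# verify_crawl.py -- sanity check of the crawled data (sketch)
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['<']  # same placeholder as dbname above; replace with your database name

# How many articles and responses were stored
print('articles:', db.article.count_documents({}))
print('responses:', db.response.count_documents({}))

# Most recently updated article
latest = db.article.find_one(sort=[('update_time', -1)])
if latest:
    print(latest['title'], latest['view_count'], latest['publish_time'])

client.close()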