iT 邦幫忙 (ithelp.ithome.com.tw) tech article crawler, with results inserted into postgres
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import psycopg2

host = 'localhost'
user = 'postgres'
dbname = 'ithome2019'
password = '<server_admin_password>'
conn_string = f'host={host} user={user} dbname={dbname} password={password}'

conn = psycopg2.connect(conn_string)
print('Database connection established!')
cursor = conn.cursor()
def crawl_list():
    """Crawl the article list pages."""
    # Fetch pages 1 to 10
    for page in range(1, 11):
        html_doc = requests.get(f'https://ithelp.ithome.com.tw/articles?tab=tech&page={page}').text
        soup = BeautifulSoup(html_doc, 'lxml')

        # Find the article blocks first
        article_tags = soup.find_all('div', class_='qa-list')

        # No articles on this page
        if len(article_tags) == 0:
            # Break out of the paging loop (or exit the program)
            print('No more articles!')
            break

        for article_tag in article_tags:
            # Then find the article link inside each block
            title_tag = article_tag.find('a', class_='qa-list__title-link')
            article_url = title_tag['href']
            crawl_content(article_url)
def crawl_content(url):
    """Crawl an article's content.
    :param url: article URL
    """
    html_doc = requests.get(url).text
    soup = BeautifulSoup(html_doc, 'lxml')

    leftside = soup.find('div', class_='leftside')
    original_post = leftside.find('div', class_='qa-panel')

    article_header = original_post.find('div', class_='qa-header')
    article_info = article_header.find('div', class_='ir-article-info__content')

    # Title
    title = article_header.find('h2', class_='qa-header__title').get_text(strip=True)
    # Author
    author = article_info.find('a', class_='ir-article-info__name').get_text(strip=True)
    # Publish time
    published_time_str = article_info.find('a', class_='ir-article-info__time').get_text(strip=True)
    published_time = datetime.strptime(published_time_str, '%Y-%m-%d %H:%M:%S')
    # Article tags
    tag_group = article_header.find('div', class_='qa-header__tagGroup')
    tags_element = tag_group.find_all('a', class_='tag')
    tags = [tag_element.get_text(strip=True) for tag_element in tags_element]
    # Body content
    content = original_post.find('div', class_='markdown__style').get_text(strip=True)
    # View count
    view_count_str = article_info.find('div', class_='ir-article-info__view').get_text(strip=True)
    view_count = int(re.search(r'(\d+).*', view_count_str).group(1))

    article = {
        'url': url,
        'title': title,
        'author': author,
        'publish_time': published_time,
        'tags': ','.join(tags),
        'content': content,
        'view_count': view_count
    }

    insert_db(article)
def insert_db(article):
    """Insert an article into the database.
    :param article: article data
    """
    cursor.execute('''
        INSERT INTO public.ithome_article(title, url, author, publish_time, tags, content, view_count)
        VALUES (%(title)s, %(url)s, %(author)s, %(publish_time)s, %(tags)s, %(content)s, %(view_count)s);
        ''',
        article)
    print(f'[{article["title"]}] inserted successfully!')
    conn.commit()
if __name__ == '__main__':
    crawl_list()

    cursor.close()
    conn.close()
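
The INSERT above assumes a public.ithome_article table already exists in the ithome2019 database. Below is a minimal one-off setup sketch using psycopg2; the column types and the SERIAL primary key are assumptions inferred from the fields the crawler collects, not part of the original gist, so adjust them to your own schema.

import psycopg2

conn = psycopg2.connect('host=localhost user=postgres dbname=ithome2019 password=<server_admin_password>')
cursor = conn.cursor()

# Hypothetical schema inferred from the INSERT statement above; adjust types and constraints as needed.
cursor.execute('''
    CREATE TABLE IF NOT EXISTS public.ithome_article (
        id SERIAL PRIMARY KEY,
        title TEXT,
        url TEXT,
        author TEXT,
        publish_time TIMESTAMP,
        tags TEXT,
        content TEXT,
        view_count INTEGER
    );
''')
conn.commit()

cursor.close()
conn.close()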