@rex-chien
Created October 9, 2019 07:57
Crawling technical articles on iT 邦幫忙 (ithelp.ithome.com.tw) with Scrapy
import scrapy
from datetime import datetime
import re


class IthomeSpider(scrapy.Spider):
    name = 'ithome'
    allowed_domains = ['ithome.com.tw']

    def start_requests(self):
        # Crawl the first 10 pages of the "tech" article listing
        for page in range(1, 11):
            yield scrapy.Request(url=f'https://ithelp.ithome.com.tw/articles?tab=tech&page={page}', callback=self.parse)

    def parse(self, response):
        # Locate the article blocks on the listing page first
        article_tags = response.css('div.qa-list')

        # Continue only if the page actually contains articles
        if len(article_tags) > 0:
            for article_tag in article_tags:
                # Then pull the article link out of each block
                title_tag = article_tag.css('a.qa-list__title-link')
                article_url = title_tag.css('::attr(href)').get().strip()
                yield response.follow(article_url, callback=self.parse_article)

    def parse_article(self, response):
        leftside = response.css('div.leftside')
        original_post = leftside.css('div.qa-panel')
        article_header = original_post.css('div.qa-header')
        article_info = article_header.css('div.ir-article-info__content, div.qa-header__info')

        # Title
        title = article_header.css('h2.qa-header__title::text').get().strip()

        # Author
        author = article_info.css('a.ir-article-info__name, a.qa-header__info-person').css('::text').get().strip()

        # Published time
        published_time_str = article_info.css('a.ir-article-info__time, a.qa-header__info-time').css('::text').get().strip()
        published_time = datetime.strptime(published_time_str, '%Y-%m-%d %H:%M:%S')

        # Article tags
        tag_group = article_header.css('div.qa-header__tagGroup')
        tag_elements = tag_group.css('a.tag')
        tags = [tag_element.css('::text').get().strip() for tag_element in tag_elements]

        # Body content
        content = ' '.join(original_post.css('div.markdown__style').css('::text').getall())

        # View count
        view_count_str = article_info.css('.ir-article-info__view, .qa-header__info-view').css('::text').get().strip()
        view_count = int(re.search(r'(\d+)', view_count_str).group(1))

        article = {
            'url': response.url,
            'title': title,
            'author': author,
            'publish_time': published_time,
            'tags': ','.join(tags),
            'content': content,
            'view_count': view_count
        }

        yield article
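Since the spider only yields plain dicts, the built-in feed exports can persist the results with no extra code. Below is a minimal runner sketch for executing the spider as a standalone script; the output file name ithome_articles.json and the DOWNLOAD_DELAY value are illustrative assumptions, not part of the original gist, and the FEEDS setting requires Scrapy 2.1 or newer.

    # Hypothetical standalone runner: assumes the IthomeSpider class above
    # is defined in the same module.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Output file name is an assumption for illustration
        'FEEDS': {'ithome_articles.json': {'format': 'json', 'encoding': 'utf8'}},
        'DOWNLOAD_DELAY': 1,  # assumed delay to stay polite to ithelp.ithome.com.tw
    })
    process.crawl(IthomeSpider)
    process.start()  # blocks until the crawl finishes

Alternatively, saving the spider to a single file (for example, a hypothetical ithome_spider.py) and running scrapy runspider ithome_spider.py -o articles.json gives the same result without an explicit CrawlerProcess.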