Crawling technical articles and their replies from iT 邦幫忙 (ithelp.ithome.com.tw) with Scrapy
import scrapy


class IthomeArticleItem(scrapy.Item):
    _id = scrapy.Field()
    url = scrapy.Field()
    title = scrapy.Field()
    author = scrapy.Field()
    publish_time = scrapy.Field()
    tags = scrapy.Field()
    content = scrapy.Field()
    view_count = scrapy.Field()


class IthomeReplyItem(scrapy.Item):
    _id = scrapy.Field()
    article_id = scrapy.Field()
    author = scrapy.Field()
    publish_time = scrapy.Field()
    content = scrapy.Field()
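
Scrapy Items behave like dictionaries, which is how both the spider and the MongoDB pipelines below use them: fields are assigned by key, membership can be tested with "'_id' in article", and dict(item) produces a plain dict for database writes. A quick illustration with made-up values:

article = IthomeArticleItem()
article['title'] = 'Example title'
article['view_count'] = 42
print('_id' in article)  # False until a pipeline assigns it
print(dict(article))     # {'title': 'Example title', 'view_count': 42}
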
import scrapy
from datetime import datetime
import re
import ithome_crawlers.items as items
from scrapy_puppeteer import PuppeteerRequest  # not used in this version of the spider


class IthomeSpider(scrapy.Spider):
    name = 'ithome'
    allowed_domains = ['ithome.com.tw']

    def start_requests(self):
        # Crawl only the first page of the tech-article list for now
        for page in range(1, 2):
            yield scrapy.Request(url=f'https://ithelp.ithome.com.tw/articles?tab=tech&page={page}', callback=self.parse)

    def parse(self, response):
        # Locate the article blocks first
        article_tags = response.css('div.qa-list')
        # Continue only if the page contains articles
        if len(article_tags) > 0:
            for article_tag in article_tags:
                # Find the article link inside each block
                title_tag = article_tag.css('a.qa-list__title-link')
                article_url = title_tag.css('::attr(href)').get().strip()
                yield response.follow(article_url, callback=self.parse_article)

    def parse_article(self, response):
        leftside = response.css('div.leftside')
        original_post = leftside.css('div.qa-panel')
        article_header = original_post.css('div.qa-header')
        article_info = article_header.css('div.ir-article-info__content, div.qa-header__info')

        # Title
        title = article_header.css('h2.qa-header__title::text').get().strip()

        # Author
        author = article_info.css('a.ir-article-info__name, a.qa-header__info-person').css('::text').get().strip()

        # Publish time
        published_time_str = article_info.css('a.ir-article-info__time, a.qa-header__info-time').css('::text').get().strip()
        published_time = datetime.strptime(published_time_str, '%Y-%m-%d %H:%M:%S')

        # Article tags
        tag_group = article_header.css('div.qa-header__tagGroup')
        tag_elements = tag_group.css('a.tag')
        tags = [tag_element.css('::text').get().strip() for tag_element in tag_elements]

        # Body content
        content = ' '.join(original_post.css('div.markdown__style').css('::text').getall())

        # View count
        view_count_str = article_info.css('.ir-article-info__view, .qa-header__info-view').css('::text').get().strip()
        view_count = int(re.search(r'(\d+).*', view_count_str).group(1))

        article = items.IthomeArticleItem()
        article['url'] = response.url
        article['title'] = title
        article['author'] = author
        article['publish_time'] = published_time
        article['tags'] = ','.join(tags)  # store tags as a comma-separated string
        article['content'] = content
        article['view_count'] = view_count
        yield article
        '''
        Articles with fewer than 20 views are dropped by the pipeline,
        so they will not have an identifier assigned after insertion.
        '''
        if '_id' in article:
            '''
            After the yield above the item has already been written to the
            database; because this is the same object reference, the assigned
            identifier can be read back here.
            '''
            article_id = article['_id']
            '''
            On iThome the original post and its replies are on the same page,
            so the existing response can be parsed for replies directly.
            Otherwise a new Request would have to be returned here, e.g.
            yield scrapy.Request(url, callback=self.parse_reply)
            '''
            yield from self.parse_reply(response, article_id)

    def parse_reply(self, response, article_id):
        leftside = response.css('div.leftside')
        replies = leftside.css('div.response')
        for reply in replies:
            panel = reply.css('div.qa-panel__content')
            header = panel.css('div.response-header__info')

            reply_item = items.IthomeReplyItem()
            reply_item['article_id'] = article_id

            # Reply ID
            reply_item['_id'] = int(reply.css('a::attr(name)').get().replace('response-', ''))

            # Reply author
            reply_item['author'] = header.css('a.response-header__person').css('::text').get()

            # Reply time
            time_str = header.css('a.ans-header__time').css('::text').get().strip()
            reply_item['publish_time'] = datetime.strptime(time_str, '%Y-%m-%d %H:%M:%S')

            # Reply content
            reply_item['content'] = ' '.join(panel.css('div.markdown__style').css('::text').getall())

            yield reply_item
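
Within the Scrapy project this spider is started with the standard command "scrapy crawl ithome". As a sketch, an equivalent programmatic launch looks like the following; CrawlerProcess and get_project_settings are standard Scrapy utilities, and the file name run.py is only illustrative:

# run.py (illustrative)
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('ithome')  # the spider name defined above
process.start()          # blocks until the crawl finishes
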
from scrapy.exceptions import DropItem
import ithome_crawlers.items as items


class IthomeCrawlersPipeline(object):
    def process_item(self, item, spider):
        # Drop articles with fewer than 20 views; other item types pass through
        if type(item) is items.IthomeArticleItem:
            if item['view_count'] < 20:
                raise DropItem(f'[{item["title"]}] view count is below 20')
        return item


import pymongo


class AbstractMongoPipeline(object):
    collection_name = None

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        self.collection = self.db[self.collection_name]

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE')
        )

    def close_spider(self, spider):
        self.client.close()


class IthomeArticlePipeline(AbstractMongoPipeline):
    collection_name = 'article'

    def process_item(self, item, spider):
        if type(item) is items.IthomeArticleItem:
            document = self.collection.find_one({'url': item['url']})
            if not document:
                insert_result = self.collection.insert_one(dict(item))
                item['_id'] = insert_result.inserted_id
            else:
                self.collection.update_one(
                    {'_id': document['_id']},
                    {'$set': dict(item)},
                    upsert=True
                )
                item['_id'] = document['_id']
        return item


class IthomeReplyPipeline(AbstractMongoPipeline):
    collection_name = 'reply'

    def process_item(self, item, spider):
        if type(item) is items.IthomeReplyItem:
            document = self.collection.find_one(item['_id'])
            if not document:
                insert_result = self.collection.insert_one(dict(item))
            else:
                del item['_id']
                self.collection.update_one(
                    {'_id': document['_id']},
                    {'$set': dict(item)},
                    upsert=True
                )
        return item
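
For these pipelines to run, they have to be registered in the project's settings.py together with the MongoDB connection values that from_crawler() reads. The sketch below shows one plausible configuration; the module path ithome_crawlers.pipelines, the connection string, and the priority numbers are assumptions for illustration, while the setting names MONGO_URI, MONGO_DATABASE and ITEM_PIPELINES are the ones the code above and Scrapy actually use:

# settings.py (sketch)
MONGO_URI = 'mongodb://localhost:27017'
MONGO_DATABASE = 'ithome'

ITEM_PIPELINES = {
    # the filter runs first so low-view articles never reach the database pipelines
    'ithome_crawlers.pipelines.IthomeCrawlersPipeline': 100,
    'ithome_crawlers.pipelines.IthomeArticlePipeline': 300,
    'ithome_crawlers.pipelines.IthomeReplyPipeline': 400,
}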