@rex-chien
Created October 9, 2019 07:57
Crawling technical articles on iT 邦幫忙 (ithelp.ithome.com.tw) with Scrapy
import scrapy
from datetime import datetime
import re


class IthomeSpider(scrapy.Spider):
    name = 'ithome'
    allowed_domains = ['ithome.com.tw']

    def start_requests(self):
        # Crawl the first 10 pages of the "tech" article listing
        for page in range(1, 11):
            yield scrapy.Request(url=f'https://ithelp.ithome.com.tw/articles?tab=tech&page={page}', callback=self.parse)

    def parse(self, response):
        # Locate the article blocks on the listing page first
        article_tags = response.css('div.qa-list')

        # Continue only if the page actually contains articles
        if len(article_tags) > 0:
            for article_tag in article_tags:
                # Then pull the article link out of each block
                title_tag = article_tag.css('a.qa-list__title-link')
                article_url = title_tag.css('::attr(href)').get().strip()
                yield response.follow(article_url, callback=self.parse_article)

    def parse_article(self, response):
        leftside = response.css('div.leftside')
        original_post = leftside.css('div.qa-panel')
        article_header = original_post.css('div.qa-header')
        article_info = article_header.css('div.ir-article-info__content, div.qa-header__info')

        # Title
        title = article_header.css('h2.qa-header__title::text').get().strip()

        # Author
        author = article_info.css('a.ir-article-info__name, a.qa-header__info-person').css('::text').get().strip()

        # Published time
        published_time_str = article_info.css('a.ir-article-info__time, a.qa-header__info-time').css('::text').get().strip()
        published_time = datetime.strptime(published_time_str, '%Y-%m-%d %H:%M:%S')

        # Article tags
        tag_group = article_header.css('div.qa-header__tagGroup')
        tag_elements = tag_group.css('a.tag')
        tags = [tag_element.css('::text').get().strip() for tag_element in tag_elements]

        # Body content
        content = ' '.join(original_post.css('div.markdown__style').css('::text').getall())

        # View count
        view_count_str = article_info.css('.ir-article-info__view, .qa-header__info-view').css('::text').get().strip()
        view_count = int(re.search(r'(\d+)', view_count_str).group(1))

        article = {
            'url': response.url,
            'title': title,
            'author': author,
            'publish_time': published_time,
            'tags': ','.join(tags),
            'content': content,
            'view_count': view_count
        }

        yield article
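Since the spider only yields plain dicts, the built-in feed exports can persist the results with no extra code. Below is a minimal runner sketch for executing the spider as a standalone script; the output file name ithome_articles.json and the DOWNLOAD_DELAY value are illustrative assumptions, not part of the original gist, and the FEEDS setting requires Scrapy 2.1 or newer.

    # Hypothetical standalone runner: assumes the IthomeSpider class above
    # is defined in the same module.
    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        # Output file name is an assumption for illustration
        'FEEDS': {'ithome_articles.json': {'format': 'json', 'encoding': 'utf8'}},
        'DOWNLOAD_DELAY': 1,  # assumed delay to stay polite to ithelp.ithome.com.tw
    })
    process.crawl(IthomeSpider)
    process.start()  # blocks until the crawl finishes

Alternatively, saving the spider to a single file (for example, a hypothetical ithome_spider.py) and running scrapy runspider ithome_spider.py -o articles.json gives the same result without an explicit CrawlerProcess.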