LittleYenMin/parse_content_scrapy6.py

## parse_content_scrapy6.py
def parse_content(self, response):
    """Yield one item dict for every complete article block in *response*.

    Each ``div`` whose class contains ``articlebody`` is read for:

    * the text of its direct ``h1`` child (title),
    * the text of a descendant ``span`` whose class contains ``viewtime``,
    * all text nodes under ``p`` elements inside a descendant ``div`` whose
      class contains ``text``; these are joined with single spaces and cut
      to at most 300 characters.

    A block is skipped when the response URL, title, view time or content
    is empty.
    """
    max_chars = 300
    for article in response.xpath('//div[contains(@class, "articlebody")]'):
        headline = article.xpath('./h1/text()').get()
        timestamp = article.xpath(
            './/span[contains(@class, "viewtime")]/text()'
        ).get()
        fragments = article.xpath(
            './/div[contains(@class, "text")]//p//text()'
        ).extract()
        # Keep only the first 300 characters; slicing leaves shorter text
        # untouched.
        snippet = ' '.join(fragments)[:max_chars]

        # Make sure every field we need is non-empty; otherwise do not
        # store the record.
        if not (response.url and headline and timestamp and snippet):
            continue

        yield {
            'url': response.url,
            'title': headline,
            'date': timestamp,
            'content': snippet,
        }
	def parse_content(self, response):
	    """Extract article records from *response* and yield them as dicts.

	    For every ``div`` whose class contains ``articlebody`` the title
	    (direct ``h1`` text), the view time (``span`` whose class contains
	    ``viewtime``) and the paragraph text (text nodes under ``p`` inside a
	    ``div`` whose class contains ``text``) are collected. Paragraph text
	    is joined with spaces and limited to 300 characters. A record is
	    yielded only when the URL, title, view time and content are all
	    non-empty.
	    """
	    # NOTE(review): the same function is also defined earlier in this
	    # file; this copy had lost its indentation and could not be parsed.
	    for body in response.xpath('//div[contains(@class, "articlebody")]'):
	        title = body.xpath('./h1/text()').get()
	        view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get()
	        contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract()
	        content = ' '.join(contents)
	        if len(content) > 300:
	            # Longer than 300 characters: keep only the first 300.
	            content = content[:300]
	        # Confirm that none of the required fields is empty; skip the
	        # record if any of them is.
	        if response.url and title and view_time and content:
	            yield {
	                'url': response.url,
	                'title': title,
	                'date': view_time,
	                'content': content,
	            }