Last active
July 8, 2019 14:13
-
-
Save LittleYenMin/f48987ba3fc560566e356c587bd3df6f to your computer and use it in GitHub Desktop.
ltn-scrapy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class LtnSearchCrawler(scrapy.Spider):
    """Spider that walks an LTN news search listing and scrapes each article.

    ``parse`` handles the paginated search-result pages: it schedules one
    request per result link and then follows the "next page" link.
    ``parse_content`` handles the article pages and yields one dict per
    article body containing ``url``, ``title``, ``date`` and ``content``.
    """

    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    # Maximum number of characters of article text kept in each item.
    # Subclasses may override this to keep more or less text.
    max_content_length = 300

    def parse(self, response):
        """Schedule article requests for one search-result page.

        Yields a request (handled by ``parse_content``) for every result
        entry that has a title link, followed by a request for the next
        result page (handled by this method) when a next-page link exists.
        """
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            href = block.xpath('.//a[contains(@class, "tit")]/@href').get()
            if not href:
                # A list item without a title link yields no URL, and
                # response.follow() raises on a missing URL. Raising here
                # would abort this generator and drop the remaining results
                # as well as the next-page request, so skip the entry.
                continue
            # Fetch the article page to scrape its body text.
            yield response.follow(url=href, callback=self.parse_content)

        a_next = response.xpath('//a[contains(@class, "p_next")]/@href').get()
        if a_next:
            # Continue with the next page of search results.
            yield response.follow(a_next, callback=self.parse)

    def parse_content(self, response):
        """Extract title, view time and leading text from an article page.

        Yields one dict per ``articlebody`` element with the keys ``url``,
        ``title``, ``date`` and ``content``. The content is the text of the
        article's paragraphs joined with single spaces and truncated to
        ``max_content_length`` characters. Nothing is yielded for an element
        when any of the fields is empty.
        """
        for body in response.xpath('//div[contains(@class, "articlebody")]'):
            title = body.xpath('./h1/text()').get()
            view_time = body.xpath(
                './/span[contains(@class, "viewtime")]/text()'
            ).get()
            contents = body.xpath(
                './/div[contains(@class, "text")]//p//text()'
            ).getall()
            # Slicing is a no-op for shorter strings, so no length check
            # is needed before truncating.
            content = ' '.join(contents)[:self.max_content_length]

            # Only emit the item when every required field is non-empty.
            if response.url and title and view_time and content:
                yield {
                    'url': response.url,
                    'title': title,
                    'date': view_time,
                    'content': content,
                }
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment