ltn-scrapy
import scrapy


class LtnSearchCrawler(scrapy.Spider):
    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
            # Follow each search result and scrape the article body
            yield response.follow(url=href, callback=self.parse_content)
        a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
        if a_next:
            # Crawl the next page of search results
            yield response.follow(a_next, callback=self.parse)

    def parse_content(self, response):
        for body in response.xpath('//div[contains(@class, "articlebody")]'):
            title = body.xpath('./h1/text()').get()
            view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get()
            contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract()
            content = ' '.join(contents)
            if len(content) > 300:
                content = content[:300]  # Keep only the first 300 characters
            # Only yield the item if every required field is present
            if response.url and title and view_time and content:
                yield {
                    'url': response.url,
                    'title': title,
                    'date': view_time,
                    'content': content,
                }
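
To try the spider outside of a full Scrapy project, one option is a small runner script using CrawlerProcess. This is a minimal sketch, not part of the original gist: the output filename ltn_news.json is an assumption, and the FEEDS setting requires Scrapy 2.1 or later.

from scrapy.crawler import CrawlerProcess

# Hypothetical runner: write scraped items to a JSON feed.
# 'ltn_news.json' is an illustrative filename, not from the original gist.
process = CrawlerProcess(settings={
    'FEEDS': {'ltn_news.json': {'format': 'json', 'encoding': 'utf8'}},
})
process.crawl(LtnSearchCrawler)
process.start()  # Blocks until the crawl finishes

Alternatively, saving the spider as, say, ltn_spider.py and running `scrapy runspider ltn_spider.py -o ltn_news.json` from the command line gives the same result.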