This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse(self, response):
    """Follow every article link on a search-result page, then the next page.

    For each item of the ``newslistul`` list, a request for the linked
    article is yielded with ``self.parse_content`` as its callback. When the
    page has a "next page" anchor, one more request is yielded that re-enters
    this method for the following result page.

    Args:
        response: Response for a result-list page. It is queried through
            ``xpath`` and used to build follow-up requests through
            ``follow``.

    Yields:
        The request objects returned by ``response.follow``.
    """
    for block in response.xpath('//ul[@id="newslistul"]//li'):
        href = block.xpath('.//a[contains(@class, "tit")]/@href').get()
        if not href:
            # A list item without a title link gives None (or an empty
            # string). response.follow() rejects a None URL, and the
            # resulting exception would end this generator early, dropping
            # the remaining items and the pagination request. Skip instead.
            continue
        # Crawl the article body.
        yield response.follow(url=href, callback=self.parse_content)

    a_next = response.xpath('//a[contains(@class, "p_next")]/@href').get()
    if a_next:
        # Crawl the next page of results.
        yield response.follow(a_next, callback=self.parse)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pull the headline, the text of the "viewtime" span and the paragraph text
# out of every "articlebody" container on an article page.
# NOTE(review): this excerpt ends at the final `if`; what happens when the
# check passes is not visible here.
def parse_content(self, response): | |
for body in response.xpath('//div[contains(@class, "articlebody")]'): | |
title = body.xpath('./h1/text()').get() | |
view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get() | |
contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract() | |
# Join all extracted paragraph text nodes into one space-separated string.
content = ' '.join(contents) | |
if len(content) > 300: | |
content = content[:300] # if the text is longer than 300 characters, keep only the first 300
# Make sure none of the fields we need is empty; if any is empty, the item is not stored
if response.url and title and view_time and content: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def parse(self, response):
    """Print the title link of every result item, then request the next page.

    Args:
        response: Response for a result-list page; queried through ``xpath``
            and used to build the pagination request through ``follow``.

    Yields:
        At most one request: the one for the next result page, with this
        method as its callback.
    """
    title_href_query = './/a[contains(@class, "tit")]/@href'
    for item in response.xpath('//ul[@id="newslistul"]//li'):
        print(item.xpath(title_href_query).extract_first())

    # Pagination: stop quietly when there is no "next page" anchor.
    next_href = response.xpath(
        '//a[contains(@class, "p_next")]/@href'
    ).extract_first()
    if not next_href:
        return
    yield response.follow(next_href, callback=self.parse)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class LtnSearchCrawler(scrapy.Spider):
    """Spider that starts from one LTN news search-result URL."""

    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        """Walk the result list and extract each item's title link.

        The extracted link is only assigned to a local; nothing is yielded
        or returned by this version of the callback.

        Args:
            response: Response for a result-list page, queried via ``xpath``.
        """
        result_items = response.xpath('//ul[@id="newslistul"]//li')
        for item in result_items:
            title_links = item.xpath('.//a[contains(@class, "tit")]/@href')
            href = title_links.extract_first()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class LtnSearchCrawler(scrapy.Spider):
    """Spider that starts from one LTN news search-result URL."""

    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        """Walk the result list and extract each item's title link.

        The extracted link is only assigned to a local; nothing is yielded
        or returned by this version of the callback.

        Args:
            response: Response for a result-list page, queried via ``xpath``.
        """
        result_items = response.xpath('//ul[@id="newslistul"]//li')
        for item in result_items:
            title_links = item.xpath('.//a[contains(@class, "tit")]/@href')
            href = title_links.extract_first()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
# Spider whose `name` is 'quotes'; its only start URL is the "humor" tag page
# of quotes.toscrape.com.
class QuotesSpider(scrapy.Spider): | |
name = 'quotes' | |
start_urls = [ | |
'http://quotes.toscrape.com/tag/humor/', | |
] | |
# NOTE(review): the body of parse() is cut off in this excerpt.
def parse(self, response): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import requests | |
import lxml.html | |
# Root URL of the quotes.toscrape.com site.
# NOTE(review): how this constant is consumed is not visible in this excerpt.
start_url = 'http://quotes.toscrape.com' | |
def parse_quotes(response: requests.Response): | |
tree = lxml.html.fromstring(response.content.decode('utf-8')) |