邱彥銘 LittleYenMin

## parse_completed_scrapy6.py
def parse(self, response):
    """Follow every article link on a search-result page, then the next page.

    Args:
        response: Response for a search-result listing page; it must offer
            ``xpath`` and ``follow`` as used below.

    Yields:
        One request per article link (handled by ``self.parse_content``),
        followed by one request for the next listing page when a next-page
        link is present (handled by ``self.parse`` again).
    """
    for block in response.xpath('//ul[@id="newslistul"]//li'):
        href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
        # An <li> without a title link gives None here. Scrapy's
        # Response.follow raises ValueError for a None url, which would abort
        # this callback before the pagination step below is reached.
        if not href:
            continue
        # Crawl the article body.
        yield response.follow(url=href, callback=self.parse_content)

    a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
    if a_next:
        # Crawl the next page of results.
        yield response.follow(a_next, callback=self.parse)

## parse_content_scrapy6.py
def parse_content(self, response):
        # Extract title, view time and body text from each article container
        # (a <div> whose class contains "articlebody") on the page.
        for body in response.xpath('//div[contains(@class, "articlebody")]'):
            title = body.xpath('./h1/text()').get()
            view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get()
            # Collect every text node under the <p> elements of the text div
            # and join them with single spaces.
            contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract()
            content = ' '.join(contents)
            if len(content) > 300:
                content = content[:300] # Keep only the first 300 characters when the text is longer.
            # Make sure every field we need is non-empty; per the original
            # comment, empty records are not stored.
            if response.url and title and view_time and content:
            # NOTE(review): the body of this branch is not visible in this
            # excerpt; the snippet ends here.

## parse_scrapy5.py
def parse(self, response):
    """Print every article link on a search-result page, then follow the next page.

    Args:
        response: Response for a search-result listing page; it must offer
            ``xpath`` and ``follow`` as used below.

    Yields:
        At most one request, for the next listing page, handled by
        ``self.parse`` again.
    """
    for block in response.xpath('//ul[@id="newslistul"]//li'):
        href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
        print(href)

    # Pagination belongs to the page, not to each list item: look the link up
    # once, outside the loop, so the same request is not yielded per item and
    # a page with no items can still advance.
    a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
    if a_next:
        yield response.follow(a_next, callback=self.parse)

## ltn-scrapy-pagination.py
import scrapy


class LtnSearchCrawler(scrapy.Spider):
    """Spider named ``ltn_search_page`` seeded with a single search-result URL."""
    name = 'ltn_search_page'
    # The search keyword is embedded in the URL's query string.
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        """Walk the ``<li>`` entries under ``ul#newslistul`` and read each title link.

        NOTE(review): ``href`` is assigned but not used in the visible lines;
        this snippet appears to be cut off here.
        """
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            # First href of an <a> whose class contains "tit", or None.
            href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()

## ltn-scrapy.py
import scrapy


class LtnSearchCrawler(scrapy.Spider):
    """Spider named ``ltn_search_page`` seeded with a single search-result URL."""
    name = 'ltn_search_page'
    # The search keyword is embedded in the URL's query string.
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        """Walk the ``<li>`` entries under ``ul#newslistul`` and read each title link.

        NOTE(review): ``href`` is assigned but not used in the visible lines;
        this snippet appears to be cut off here.
        """
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            # First href of an <a> whose class contains "tit", or None.
            href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()

## scrapy.py
import scrapy


class QuotesSpider(scrapy.Spider):
    # Spider named 'quotes'; crawling starts from the single URL listed below.
    name = 'quotes'
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):
        # NOTE(review): the body of this method is not visible in this
        # excerpt; the snippet ends at the signature.

## common.py
import json

import requests
import lxml.html

# Root URL of the quotes site; not referenced within the visible lines.
start_url = 'http://quotes.toscrape.com'


def parse_quotes(response: requests.Response):
    """Build an lxml HTML tree from the response body decoded as UTF-8.

    NOTE(review): nothing is returned in the visible lines; this snippet
    appears to be cut off after the tree is built.
    """
    tree = lxml.html.fromstring(response.content.decode('utf-8'))
	def parse(self, response):
	    """Follow every article link on a search-result page, then the next page.

	    Args:
	        response: Response for a search-result listing page; it must offer
	            ``xpath`` and ``follow`` as used below.

	    Yields:
	        One request per article link (handled by ``self.parse_content``),
	        followed by one request for the next listing page when a next-page
	        link is present (handled by ``self.parse`` again).
	    """
	    for block in response.xpath('//ul[@id="newslistul"]//li'):
	        href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
	        # An <li> without a title link gives None here. Scrapy's
	        # Response.follow raises ValueError for a None url, which would
	        # abort this callback before the pagination step below is reached.
	        if not href:
	            continue
	        # Crawl the article body.
	        yield response.follow(url=href, callback=self.parse_content)

	    a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
	    if a_next:
	        # Crawl the next page of results.
	        yield response.follow(a_next, callback=self.parse)
	# NOTE(review): every line of this copy carries the same single-tab indent,
	# so the nesting of the statements below has been lost.
	def parse_content(self, response):
	for body in response.xpath('//div[contains(@class, "articlebody")]'):
	title = body.xpath('./h1/text()').get()
	view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get()
	contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract()
	content = ' '.join(contents)
	if len(content) > 300:
	content = content[:300] # Keep only the first 300 characters when the text is longer.
	# Make sure every field we need is non-empty; per the original comment,
	# empty records are not stored.
	if response.url and title and view_time and content:
	# NOTE(review): the body of this branch is not visible in this excerpt.
	import scrapy


	# NOTE(review): every line of this copy carries the same single-tab indent,
	# so the class/method nesting has been lost.
	class LtnSearchCrawler(scrapy.Spider):
	name = 'ltn_search_page'
	# The search keyword is embedded in the URL's query string.
	start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

	def parse(self, response):
	for block in response.xpath('//ul[@id="newslistul"]//li'):
	# First href of an <a> whose class contains "tit", or None.
	# NOTE(review): href is not used within the visible lines.
	href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
	import scrapy


	# NOTE(review): every line of this copy carries the same single-tab indent,
	# so the class/method nesting has been lost.
	class QuotesSpider(scrapy.Spider):
	name = 'quotes'
	# Crawling starts from the single URL listed below.
	start_urls = [
	'http://quotes.toscrape.com/tag/humor/',
	]

	def parse(self, response):
	# NOTE(review): the body of this method is not visible in this excerpt.
	import json

	import requests
	import lxml.html

	# Root URL of the quotes site; not referenced within the visible lines.
	start_url = 'http://quotes.toscrape.com'


	# NOTE(review): the function body below has lost its indentation, and the
	# snippet ends after the tree is built.
	def parse_quotes(response: requests.Response):
	# Build an lxml HTML tree from the response body decoded as UTF-8.
	tree = lxml.html.fromstring(response.content.decode('utf-8'))