邱彥銘 LittleYenMin

👻 I'm not a Python developer!! It was an accident!!!

@LittleYenMin
LittleYenMin / common.py
Last active June 29, 2019 07:46
Crawler demo written the conventional way, with plain requests and lxml.
import json
import requests
import lxml.html

start_url = 'http://quotes.toscrape.com'

def parse_quotes(response: requests.Response):
    tree = lxml.html.fromstring(response.content.decode('utf-8'))

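The preview cuts off inside parse_quotes. A minimal sketch of how the script might finish and be driven end to end, assuming the standard quotes.toscrape.com markup; the div.quote selectors and the __main__ driver are illustrative guesses, not part of the original gist:

import json
import requests
import lxml.html

start_url = 'http://quotes.toscrape.com'

def parse_quotes(response: requests.Response):
    tree = lxml.html.fromstring(response.content.decode('utf-8'))
    # Assumed completion: each quote sits in a div.quote element
    return [{
        'text': div.xpath('./span[@class="text"]/text()')[0],
        'author': div.xpath('.//small[@class="author"]/text()')[0],
    } for div in tree.xpath('//div[@class="quote"]')]

if __name__ == '__main__':
    print(json.dumps(parse_quotes(requests.get(start_url)), ensure_ascii=False, indent=2))
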
@LittleYenMin
LittleYenMin / scrapy.py
Created June 29, 2019 07:47
Crawler demo in Scrapy.
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = [
        'http://quotes.toscrape.com/tag/humor/',
    ]

    def parse(self, response):

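The preview ends at the parse signature. A body in the spirit of the official Scrapy tutorial, assuming the quotes.toscrape.com markup (the CSS selectors below are not from the original gist):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'
    start_urls = ['http://quotes.toscrape.com/tag/humor/']

    def parse(self, response):
        for quote in response.css('div.quote'):
            # One item per quote on the page
            yield {
                'text': quote.css('span.text::text').get(),
                'author': quote.xpath('span/small/text()').get(),
            }
        # Follow the "Next" link until there are no pages left
        next_page = response.css('li.next a::attr(href)').get()
        if next_page is not None:
            yield response.follow(next_page, self.parse)

This runs without a project scaffold via scrapy runspider; saving it under a name other than scrapy.py avoids shadowing the scrapy package on import.
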
@LittleYenMin
LittleYenMin / ltn-scrapy.py
Last active July 8, 2019 14:13
ltn-scrapy
import scrapy

class LtnSearchCrawler(scrapy.Spider):
    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()

@LittleYenMin
LittleYenMin / ltn-scrapy-pagination.py
Created July 8, 2019 14:15
Liberty Times (自由時報) news crawler, pagination only.
import scrapy

class LtnSearchCrawler(scrapy.Spider):
    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()

@LittleYenMin
LittleYenMin / parse_scrapy5.py
Created July 13, 2019 14:56
The parse code from chapter 5 of the Scrapy crawler tutorial.
def parse(self, response):
    for block in response.xpath('//ul[@id="newslistul"]//li'):
        href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
        print(href)
    # Jump to the next page
    a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
    if a_next:
        yield response.follow(a_next, callback=self.parse)

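Dropped into the LtnSearchCrawler class from the previous gists, this parse can be tried without a full Scrapy project (the file name here is illustrative):

scrapy runspider ltn-scrapy-pagination.py
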
@LittleYenMin
LittleYenMin / parse_content_scrapy6.py
Last active July 13, 2019 15:12
The news-article-body function from chapter 6 of the Scrapy crawler tutorial.
def parse_content(self, response):
    for body in response.xpath('//div[contains(@class, "articlebody")]'):
        title = body.xpath('./h1/text()').get()
        view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get()
        contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract()
        content = ' '.join(contents)
        if len(content) > 300:
            content = content[:300]  # If the text is longer than 300 characters, keep only the first 300
        # Make sure none of the required fields is empty; skip the item if any is
        if response.url and title and view_time and content:
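            # The preview ends at the emptiness check above; an assumed
            # completion that emits the scraped fields as a plain item dict
            # (these field names are illustrative guesses, not from the gist)
            yield {
                'url': response.url,
                'title': title,
                'view_time': view_time,
                'content': content,
            }
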
@LittleYenMin
LittleYenMin / parse_completed_scrapy6.py
Last active July 13, 2019 15:16
The rewritten parse routine from chapter 6 of the Scrapy tutorial.
def parse(self, response):
    for block in response.xpath('//ul[@id="newslistul"]//li'):
        href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
        # Crawl the article body content
        yield response.follow(url=href, callback=self.parse_content)
    a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
    if a_next:
        # Crawl the next page
        yield response.follow(a_next, callback=self.parse)

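Putting the pieces together: a sketch of the finished spider with both callbacks wired up, assembled from the gists above (only the yielded field names are assumptions), plus one way to run it and export the items:

import scrapy

class LtnSearchCrawler(scrapy.Spider):
    name = 'ltn_search_page'
    start_urls = ['https://news.ltn.com.tw/search/?keyword=反紅媒']

    def parse(self, response):
        for block in response.xpath('//ul[@id="newslistul"]//li'):
            href = block.xpath('.//a[contains(@class, "tit")]/@href').extract_first()
            # Crawl the article body content
            yield response.follow(url=href, callback=self.parse_content)
        a_next = response.xpath('//a[contains(@class, "p_next")]/@href').extract_first()
        if a_next:
            # Crawl the next page
            yield response.follow(a_next, callback=self.parse)

    def parse_content(self, response):
        for body in response.xpath('//div[contains(@class, "articlebody")]'):
            title = body.xpath('./h1/text()').get()
            view_time = body.xpath('.//span[contains(@class, "viewtime")]/text()').get()
            contents = body.xpath('.//div[contains(@class, "text")]//p//text()').extract()
            content = ' '.join(contents)[:300]  # keep at most 300 characters
            if response.url and title and view_time and content:
                # Field names match the assumed completion shown earlier
                yield {'url': response.url, 'title': title,
                       'view_time': view_time, 'content': content}

scrapy runspider ltn_spider.py -o news.json   # file and output names are illustrative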