Skip to content

Instantly share code, notes, and snippets.

@recall704
Created July 3, 2016 11:05
Show Gist options
  • Save recall704/a1bce5df79650c995d2eac7fe1170c62 to your computer and use it in GitHub Desktop.
Save recall704/a1bce5df79650c995d2eac7fe1170c62 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import re
import scrapy
from scrapy.utils.response import get_base_url
from pyquery import PyQuery
from biquge.items import BiqugeItem
class BiqugeSpider(scrapy.Spider):
    """Spider that crawls one book's chapter list and yields one item per chapter.

    The crawl starts at ``base_url + book_id + '/'``, follows every anchor
    matched by ``#list dl dd a`` on that page, and emits a ``BiqugeItem``
    for each chapter page fetched.
    """

    name = 'biquge'
    allowed_domains = ["biquge.com.tw"]
    base_url = 'http://www.biquge.com.tw/'
    # Path segment identifying the book; surrounding slashes are tolerated.
    book_id = '0_681'

    def start_requests(self):
        """Yield the single request for the book's chapter-list page."""
        url = self.base_url + self.book_id.strip('/') + '/'
        yield scrapy.Request(url, callback=self.parse)

    def parse(self, response):
        """Yield one request per chapter link found on the chapter-list page.

        Each request carries the link's position among the matched anchors
        in ``meta['index']`` so that chapter order can be restored later
        (responses may arrive in any order).
        """
        anchors = PyQuery(response.text)('#list dl dd a')
        for index, anchor in enumerate(anchors):
            href = anchor.get('href')
            if not href:
                # An anchor without a target cannot be followed; the index
                # is still consumed so positions stay aligned with the page.
                continue
            # Resolve against the page URL rather than concatenating with
            # base_url: concatenation yields "//" for root-relative hrefs
            # and a wrong path for directory-relative ones.
            url = response.urljoin(href)
            yield scrapy.Request(
                url,
                callback=self.parse_detail,
                meta={'index': index},
            )

    def parse_detail(self, response):
        """Build and yield the ``BiqugeItem`` for one chapter page.

        ``title`` is the first text node of ``.bookname > h1`` (``None`` if
        absent); ``content`` is the concatenation of the direct text nodes
        of the element with id ``content``.
        """
        item = BiqugeItem()
        item['book_id'] = self.book_id
        item['index'] = response.meta['index']
        item['title'] = (
            response.css('.bookname').xpath('./h1/text()').extract_first()
        )
        text_nodes = response.xpath('//*[@id="content"]/text()').extract()
        item['content'] = ''.join(text_nodes)
        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment