Skip to content

Instantly share code, notes, and snippets.

@lxneng
Forked from qingfeng/sp.py
Created February 21, 2010 15:20
Show Gist options
  • Save lxneng/310366 to your computer and use it in GitHub Desktop.
Save lxneng/310366 to your computer and use it in GitHub Desktop.
import re

from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.xpath.selector import HtmlXPathSelector
from scrapy.item import ScrapedItem
def safecn(i):
    """Convert a decimal code point given as text into its character.

    Args:
        i: Text expected to hold a decimal code point, e.g. ``"20013"``.

    Returns:
        The one-character string for that code point, or ``i`` itself,
        unchanged, when it cannot be read as a valid code point.
    """
    try:
        to_char = unichr  # Python 2 spelling of "code point -> text"
    except NameError:
        to_char = chr  # Python 3: chr() already returns text
    try:
        return to_char(int(i))
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures are swallowed: non-numeric text, a value
        # outside the code point range, or a non-string argument.
        return i
# Decimal numeric character reference such as "&#20013;"; group 1 is the number.
_NUMERIC_CHAR_REF = re.compile(r"&#([0-9]+);")


def cn(s):
    """Replace decimal character references (``&#NNN;``) in ``s`` with text.

    Only complete ``&#NNN;`` references are rewritten; all other text,
    including literal semicolons and markup, is passed through untouched.

    Args:
        s: The text to decode. Leading and trailing whitespace is stripped.

    Returns:
        The stripped text with each decodable reference replaced by its
        character.
    """
    def _decode(match):
        digits = match.group(1)
        char = safecn(digits)
        # safecn hands its argument back when conversion fails; in that case
        # keep the reference verbatim rather than leaving bare digits behind.
        return match.group(0) if char == digits else char

    return _NUMERIC_CHAR_REF.sub(_decode, s.strip())
class ZaojiaoSpider(CrawlSpider):
    """Spider for zaojiao.com that scrapes pages reached through ``.html`` links.

    Crawling starts at the pregnancy section listed in ``start_urls``; every
    extracted link whose URL matches ``\\.html`` is handed to ``parse_item``.
    """

    domain_name = "zaojiao.com"
    start_urls = [
        "http://www.zaojiao.com/pregnancy/",
    ]
    rules = (
        # Raw string so the backslash reaches the regex engine unchanged.
        Rule(SgmlLinkExtractor(allow=(r'\.html',)), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build one item from a page's first ``<h1>`` and its ``#content`` div.

        Args:
            response: The downloaded page passed in by the crawl rule.

        Returns:
            A one-element list holding the scraped item, or an empty list
            when the page has no ``<h1>`` text or no ``div#content``.
        """
        # The URL is formatted into the message itself: log.msg's second
        # positional argument is the log level, not extra text to print.
        log.msg("response.url: %s" % response.url)
        hxs = HtmlXPathSelector(response)
        titles = hxs.x('//h1/text()').extract()
        bodies = hxs.x('//div[@id="content"]').extract()
        if not titles or not bodies:
            # Pages that do not follow the article layout yield nothing
            # instead of raising IndexError from the callback.
            log.msg("skipping %s: no <h1> or div#content found" % response.url)
            return []
        item = ScrapedItem()
        item.title = cn(titles[0])
        item.body = cn(bodies[0])
        item.url = response.url
        # NOTE(review): hash() of a string is not guaranteed to be the same
        # across interpreters/builds -- confirm before relying on it as a
        # persistent identifier.
        item.uuid = hash(response.url)
        return [item]
# Module-level instance of ZaojiaoSpider, created at import time.
# NOTE(review): presumably the Scrapy release this file targets locates
# spiders through this SPIDER name -- confirm before renaming or removing.
SPIDER = ZaojiaoSpider()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment