Skip to content

Instantly share code, notes, and snippets.

@qingfeng
Created July 9, 2009 17:33
Show Gist options
  • Save qingfeng/143824 to your computer and use it in GitHub Desktop.
import re

from scrapy import log
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.xpath.selector import HtmlXPathSelector
from scrapy.item import ScrapedItem
try:
    _unichr = unichr  # Python 2: unichr() builds a unicode character
except NameError:
    _unichr = chr  # Python 3: chr() already returns a unicode character


def safecn(i):
    """Convert a decimal code-point string to its character, best-effort.

    Args:
        i: Text expected to hold a decimal Unicode code point, e.g. "20013".

    Returns:
        The one-character string for that code point, or ``i`` unchanged
        when it cannot be converted.
    """
    try:
        return _unichr(int(i))
    except (TypeError, ValueError, OverflowError):
        # int() rejects non-numeric text (ValueError) and non-string input
        # (TypeError); the character constructor rejects out-of-range values
        # (ValueError / OverflowError). Anything else is a genuine bug and
        # is allowed to propagate instead of being hidden by a bare except.
        return i
# Matches one complete decimal numeric character reference, e.g. "&#20013;".
_NUMERIC_ENTITY = re.compile(r"&#(\d+);")


def cn(s):
    """Decode decimal numeric character references (``&#NNNN;``) in ``s``.

    Leading and trailing whitespace is stripped first. Only complete
    ``&#NNNN;`` references are replaced; all other text, including ordinary
    semicolons and named entities such as ``&nbsp;``, is left untouched.
    References that ``safecn`` cannot convert are kept verbatim.

    Args:
        s: Text that may contain decimal numeric character references.

    Returns:
        The stripped text with every decodable reference replaced by the
        character it denotes.
    """
    def _decode(match):
        digits = match.group(1)
        char = safecn(digits)
        # safecn hands back its argument unchanged on failure; in that case
        # keep the whole original reference rather than emitting bare digits.
        return match.group(0) if char == digits else char

    return _NUMERIC_ENTITY.sub(_decode, s.strip())
class ZaojiaoSpider(CrawlSpider):
    """Crawl zaojiao.com pregnancy pages and scrape each page's title and body.

    Crawling starts at the pregnancy section index; every extracted link
    whose URL matches the ``.html`` pattern is passed to ``parse_item``.
    """

    domain_name = "zaojiao.com"
    start_urls = [
        "http://www.zaojiao.com/pregnancy/",
    ]
    rules = (
        # Raw string so the backslash reaches the regex engine untouched.
        Rule(SgmlLinkExtractor(allow=(r'\.html',)), callback='parse_item'),
    )

    def parse_item(self, response):
        """Build a ScrapedItem from one fetched page.

        Args:
            response: The fetched page. ``response.url`` is read and the
                response is wrapped in an ``HtmlXPathSelector``.

        Returns:
            A one-element list holding the scraped item, or an empty list
            when the page has no ``<h1>`` text or no ``div`` with
            ``id="content"``.
        """
        # The second positional argument of log.msg is the log level, so the
        # URL must be formatted into the message itself.
        log.msg("response.url %s" % response.url)
        hxs = HtmlXPathSelector(response)
        titles = hxs.x('//h1/text()').extract()
        bodies = hxs.x('//div[@id="content"]').extract()
        if not titles or not bodies:
            # The link rule matches every .html URL, so some pages will not
            # have the expected layout; skip them instead of raising
            # IndexError inside the callback.
            log.msg("skipping %s: missing <h1> or div#content" % response.url)
            return []
        item = ScrapedItem()
        item.title = cn(titles[0])
        item.body = cn(bodies[0])
        item.url = response.url
        # NOTE(review): the identifier is hash() of the URL, not a real UUID;
        # confirm that collisions and platform-dependent values are acceptable.
        item.uuid = hash(response.url)
        return [item]
# Module-level spider instance, created at import time.
# NOTE(review): presumably the targeted Scrapy version discovers spiders
# through this SPIDER name — confirm against the installed release.
SPIDER = ZaojiaoSpider()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment