Created
October 15, 2013 03:49
-
-
Save jokaye/6986241 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from scrapy.spider import BaseSpider
from scrapy.utils.url import urljoin_rfc
class MySpider(BaseSpider):
    """Crawl a site link by link and print a value from each product page.

    ``parse`` handles listing pages: it schedules every link that looks like
    a product page for ``parse_detail`` and every other anchor on the page
    for another round of ``parse``.  ``parse_detail`` prints one extracted
    text node per product page.

    The domain, start URL and the detail XPath are placeholders and must be
    replaced before the spider produces anything useful.
    """

    name = 'test'
    allowed_domains = ['xxxx.com']
    start_urls = [
        'http://www.xxxx.com',
    ]
    # Throttle setting read by Scrapy; per the Scrapy docs this is the wait
    # between consecutive requests, in seconds.
    download_delay = 10

    def parse(self, response):
        """Yield follow-up requests for every link found in ``response``.

        Product links (``product.htm?id=<digits>``) are sent to
        ``parse_detail``; every ``<a href>`` on the page is additionally
        re-queued with this method as callback so the crawl keeps spreading.

        :param response: the downloaded page to scan for links.
        :returns: a generator of ``Request`` objects.
        """
        # Raw string with the dot escaped and without the leading space the
        # original pattern carried: extracted link URLs are expected to have
        # spaces percent-encoded, so a pattern starting with a literal space
        # would presumably never match.
        product_links = SgmlLinkExtractor(allow=r"product\.htm\?id=\d+")
        for link in product_links.extract_links(response):
            yield Request(link.url, callback=self.parse_detail)

        hxs = HtmlXPathSelector(response)
        for url in hxs.select('//a/@href').extract():
            # hrefs may be relative; resolve them against the page URL.
            url = self._urljoin(response, url)
            yield Request(url, callback=self.parse)

    def parse_detail(self, response):
        """Print the page URL and the first text node matched on a product page.

        :param response: the downloaded product page.
        :returns: ``None``; the result is only printed.
        """
        hxs = HtmlXPathSelector(response)
        matches = hxs.select("/xpath/text()").extract()
        if not matches:
            # Indexing [0] on an empty result would raise IndexError and
            # abort this callback; report the miss and move on instead.
            self.log('no text matched on %s' % response.url)
            return
        what_u_want = matches[0]
        # Single pre-formatted argument keeps the output identical to the
        # old ``print a, b, c`` statement while also parsing on Python 3.
        print('url= %s %s' % (response.url, what_u_want.strip()))
        return

    def _urljoin(self, response, url):
        """Helper to convert relative urls to absolute.

        :param response: the page the link was found on; supplies the base
            URL and the encoding passed to ``urljoin_rfc``.
        :param url: the (possibly relative) href value.
        :returns: the value produced by ``urljoin_rfc``.
        """
        return urljoin_rfc(response.url, url, response.encoding)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment