Skip to content

Instantly share code, notes, and snippets.

@razaulhaq
Created September 4, 2015 03:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save razaulhaq/d39ec325d1f075b6572b to your computer and use it in GitHub Desktop.
Save razaulhaq/d39ec325d1f075b6572b to your computer and use it in GitHub Desktop.
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule
class WikispiderSpider(CrawlSpider):
    """Crawl Wikipedia starting at the Mathematics article and print headings.

    Link following is driven by ``rules``. Pages reached through a rule that
    names ``parse_item`` as its callback have their first-heading text
    printed to stdout.

    The response handler is deliberately NOT called ``parse``: ``CrawlSpider``
    uses its own ``parse`` method to apply ``rules``, so overriding it
    disables the crawl logic (see the Scrapy CrawlSpider documentation).
    """

    name = "wikispider"
    allowed_domains = ["wikipedia.org"]
    start_urls = (
        'https://en.wikipedia.org/wiki/Mathematics',
    )
    rules = (
        # restrict_xpaths must select the *regions* (elements) to search for
        # links in; the extractor locates the <a href> values inside them
        # itself, so the expression must not end in ``//a/@href``.
        # With no callback, this rule only follows the links it extracts.
        Rule(LinkExtractor(restrict_xpaths='//div[@class="mw-body-content"]')),
        # NOTE(review): per the Scrapy docs, when several rules match the same
        # link the first rule wins, so article-body links are handled by the
        # rule above and never reach parse_item -- confirm this is intended.
        Rule(
            LinkExtractor(allow=("https://en.wikipedia.org/wiki/",)),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Print the list of first-heading text nodes found in ``response``.

        Args:
            response: the Scrapy response for a crawled page.

        Returns:
            None. The extracted list is only written to stdout.
        """
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print(response.xpath('//h1[@class= "firstHeading"]/span/text()').extract())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment