Skip to content

Instantly share code, notes, and snippets.

@timtan
Last active December 14, 2015 05:49
Show Gist options
  • Save timtan/5038216 to your computer and use it in GitHub Desktop.
Save timtan/5038216 to your computer and use it in GitHub Desktop.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from ..items import WikiItem
from scrapy.http import Request
import urlparse
class DmozSpider(BaseSpider):
    """Spider that follows one link from Wikipedia's main page and scrapes it.

    ``parse`` handles the start URL and schedules a request for the linked
    article; ``parse_article`` collects the hrefs found inside that
    article's ``reference-text`` spans into a ``WikiItem``.
    """

    name = "wiki"
    allowed_domains = ["wikipedia.org"]
    start_urls = [
        'http://en.wikipedia.org/wiki/Main_Page'
    ]
    # Hrefs scraped from the page are joined against this to form full URLs.
    base_url = 'http://en.wikipedia.org'

    def parse(self, response):
        """Request the article linked from the ``mp-tfa`` block of the page.

        The last ``<a>`` in the first paragraph of the ``mp-tfa`` div is
        followed (presumably the "full article" link of the featured
        article box -- confirm against the live page markup).

        Returns a ``Request`` whose callback is ``parse_article``, or an
        empty list when no matching link is present in the response.
        """
        hxs = HtmlXPathSelector(response)
        links = hxs.select("//tr[2]/td/div[@id='mp-tfa']/p[1]/a/@href")
        if not links:
            # The page layout may change; skip quietly instead of failing
            # the callback with an IndexError on ``links[-1]``.
            self.log('no article link found on %s' % response.url)
            return []
        link = urlparse.urljoin(self.base_url, links[-1].extract())
        self.log('next link is %s' % link)
        return Request(url=link, callback=self.parse_article)

    def parse_article(self, response):
        """Build a ``WikiItem`` holding the article's reference hrefs.

        ``item['references']`` is the list of every ``href`` attribute
        found under ``<span class="reference-text">`` elements; it is
        empty when the page has none.
        """
        hxs = HtmlXPathSelector(response)
        references = hxs.select("//span[@class='reference-text']//@href").extract()
        item = WikiItem()
        item['references'] = references
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment