Skip to content

Instantly share code, notes, and snippets.

@const-g
Created April 23, 2016 15:11
Show Gist options
  • Save const-g/db25423e8f2f0a4f4c464966cb2f4d07 to your computer and use it in GitHub Desktop.
Save const-g/db25423e8f2f0a4f4c464966cb2f4d07 to your computer and use it in GitHub Desktop.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from deloitte_listing.items import DeloitteListingItem
class DeloitteListingSpider(BaseSpider):
    """Spider that turns each row of the page's ``custom_table`` into an item.

    The single start URL points at the Deloitte "technology-fast-50"
    national ranking page; every ``<tr>`` directly under the table with
    class ``custom_table`` produces one ``DeloitteListingItem``.
    """

    name = "deloitte_listing"
    allowed_domains = ["deloitte.com"]
    start_urls = [
        "http://www.deloitte.com/view/fr_FR/fr/technology-fast-50/palmares/palmares-national/index.htm",
    ]

    def parse(self, response):
        """Build one item per table row found in *response*.

        For each row the following fields are filled:

        * ``name`` -- concatenated text of the links inside the row's cells.
        * ``url``  -- concatenated ``href`` values of those same links.
        * ``ca``   -- concatenated text of the row's fourth cell
          (presumably the company's revenue, "chiffre d'affaires" --
          TODO confirm against the page).

        A row with no matching nodes (for example a header row without
        ``<td>`` cells) still yields an item whose fields are empty strings.

        Returns:
            A list of ``DeloitteListingItem`` objects, in document order.
        """
        hxs = HtmlXPathSelector(response)
        rows = hxs.select('//table[@class="custom_table"]/tr')

        items = []
        for row in rows:
            item = DeloitteListingItem()
            # extract() gives a list of strings; join collapses it into a
            # single value and maps "no match" to the empty string.
            item['name'] = ''.join(row.select('./td/a/text()').extract())
            item['url'] = ''.join(row.select('./td/a/@href').extract())
            item['ca'] = ''.join(row.select('./td[4]/text()').extract())
            items.append(item)
        return items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment