Skip to content

Instantly share code, notes, and snippets.

@herrbuerger
Created August 30, 2012 10:01
Show Gist options
  • Save herrbuerger/150b30eaa97e0518673a to your computer and use it in GitHub Desktop.
Save herrbuerger/150b30eaa97e0518673a to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.http import Request
from scrapy.selector import HtmlXPathSelector
from example.items import ExampleItem
from scrapy.contrib.loader.processor import TakeFirst
import re
import urllib
# Module-level processor instance; the spider calls it on the lists returned by
# ``.extract()`` to collapse each one to a single value.
# NOTE(review): presumably yields the first non-empty entry (or None when the
# list is empty), per Scrapy's TakeFirst processor -- confirm against the
# installed Scrapy version.
take_first = TakeFirst()
class ExampleSpider(CrawlSpider):
    """Crawl the paginated search results on example.de and scrape company entries.

    Before the regular start URLs are fetched, ``login_page`` is requested
    once. NOTE(review): presumably this sets up the server-side search
    session -- confirm against the site. Pagination links matched by
    ``rules`` are then followed and every result page is handed to
    ``parse_item``.
    """

    name = "test"
    allowed_domains = ["example.de"]
    city = ""
    category = "Versicherungsvermittlung"
    login_page = "http://www.example.de/index/search?method=simple"
    start_url = "http://www.example.de/index/search?filter=homepage"
    start_urls = [start_url]

    rules = (
        # Follow pagination links such as
        # http://www.example.de/index/search?page=2
        # (raw string: same pattern text, no reliance on unknown string escapes)
        Rule(
            SgmlLinkExtractor(allow=(r'\/index\/search\?page=\d*$', )),
            callback='parse_item',
            follow=True,
        ),
    )

    def start_requests(self):
        """Begin the crawl with the initialisation request.

        CrawlSpider does not call ``init_request`` on its own (that hook
        belongs to InitSpider), so without this override the request to
        ``login_page`` would never be sent.
        """
        return [self.init_request()]

    def init_request(self):
        """Return the request that is issued before crawling starts."""
        return Request(url=self.login_page, callback=self.check_response)

    def check_response(self, response):
        """Callback for the initialisation request; hands over to the crawl."""
        return self.initialized()

    def initialized(self, response=None):
        """Return the regular start requests once initialisation is done.

        CrawlSpider provides no ``initialized`` method, so it is defined here.
        The requests carry no explicit callback and are therefore handled by
        the spider's default ``parse`` callback, which applies ``rules``.
        """
        return [Request(url=url, dont_filter=True) for url in self.start_urls]

    def parse_item(self, response):
        """Extract one ``ExampleItem`` per company entry on a result page.

        Returns a list of items with the fields ``name``, ``address``,
        ``website`` and ``telephone`` filled in.

        NOTE(review): rule callbacks run only for followed links, so entries
        on the start page itself are not passed through this method --
        confirm whether that is intended.
        """
        self.log("Actually we should start now!")
        hxs = HtmlXPathSelector(response)

        # fetch all company entries
        companies = hxs.select(
            "//ul[contains(@class, 'directresults')]/li[contains(@id, 'entry')]"
        )

        items = []
        for company in companies:
            item = ExampleItem()
            item['name'] = take_first(
                company.select(".//span[@class='fn']/text()").extract()
            )
            # the address keeps every text node, not just the first one
            item['address'] = company.select(
                ".//p[@class='data track']/text()"
            ).extract()
            item['website'] = take_first(
                company.select(".//p[@class='customurl track']/a/@href").extract()
            )

            # we try to fetch the number directly from the page
            # (only works for premium entries)
            telephone = take_first(
                company.select(".//p[@class='numericdata track']/a/text()").extract()
            )
            if not telephone:
                # if we cannot fetch the number it has been encoded on the
                # client and hidden in the rel=""
                telephone = take_first(
                    company.select(".//p[@class='numericdata track']/a/@rel").extract()
                )
            item['telephone'] = telephone

            items.append(item)
        return items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment