-
-
Save herrbuerger/150b30eaa97e0518673a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
from scrapy.contrib.spiders import CrawlSpider, Rule | |
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor | |
from scrapy.http import Request | |
from scrapy.selector import HtmlXPathSelector | |
from example.items import ExampleItem | |
from scrapy.contrib.loader.processor import TakeFirst | |
import re | |
import urllib | |
# Shared output processor: returns the first non-null value from an
# extracted list (or None when the list is empty); used by parse_item
# below to collapse single-valued XPath extractions.
take_first = TakeFirst()
class ExampleSpider(CrawlSpider):
    """Crawl paginated search results on example.de and scrape one
    :class:`ExampleItem` per company listing.

    Starts from the homepage-filtered search URL and follows
    ``?page=N`` pagination links, handing each results page to
    :meth:`parse_item`.
    """

    name = "test"
    allowed_domains = ["example.de"]
    city = ""
    category = "Versicherungsvermittlung"
    login_page = "http://www.example.de/index/search?method=simple"
    start_url = "http://www.example.de/index/search?filter=homepage"
    start_urls = [start_url]

    rules = (
        # Follow pagination links such as /index/search?page=2.
        # FIX: the pattern must be a raw string -- '\d' (and '\/', '\?')
        # inside a plain string literal are invalid escape sequences on
        # modern Python; the needless '\/' escapes are dropped (regex
        # meaning is unchanged).
        Rule(SgmlLinkExtractor(allow=(r'/index/search\?page=\d*$', )),
             callback='parse_item', follow=True),
    )

    def init_request(self):
        """Issue a request to the login page before crawling starts.

        NOTE(review): ``init_request``/``initialized`` belong to the
        InitSpider protocol, not CrawlSpider -- confirm this spider is
        actually run with InitSpider behavior mixed in, otherwise this
        hook is never called.
        """
        return Request(url=self.login_page, callback=self.check_response)

    def check_response(self, response):
        """Callback for the login-page request; resumes the normal
        crawl by signalling that initialization is complete."""
        return self.initialized()

    def parse_item(self, response):
        """Extract company data from one search-results page.

        :param response: HTML response for a paginated results page.
        :returns: list of populated ``ExampleItem`` instances.
        """
        self.log("Actually we should start now!")
        hxs = HtmlXPathSelector(response)
        # Each company entry is an <li id="entry..."> inside the
        # 'directresults' list.
        companies = hxs.select(
            "//ul[contains(@class, 'directresults')]"
            "/li[contains(@id, 'entry')]")
        items = []
        for company in companies:
            item = ExampleItem()
            item['name'] = take_first(
                company.select(".//span[@class='fn']/text()").extract())
            item['address'] = company.select(
                ".//p[@class='data track']/text()").extract()
            item['website'] = take_first(
                company.select(
                    ".//p[@class='customurl track']/a/@href").extract())
            # Premium entries expose the number directly as link text...
            item['telephone'] = take_first(
                company.select(
                    ".//p[@class='numericdata track']/a/text()").extract())
            if not item['telephone']:
                # ...otherwise it is client-side encoded and hidden in
                # the rel="" attribute of the same anchor.
                item['telephone'] = take_first(
                    company.select(
                        ".//p[@class='numericdata track']/a/@rel").extract())
            items.append(item)
        return items
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.