Skip to content

Instantly share code, notes, and snippets.

@rikva
Created May 21, 2012 09:08
Show Gist options
  • Save rikva/2761356 to your computer and use it in GitHub Desktop.
Save rikva/2761356 to your computer and use it in GitHub Desktop.
Don't Judge This Code
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from kamertje.items import KamertjeItem
import sqlite3 as sqlite
from scrapy import log
class KamertjeSpider(CrawlSpider):
    """Crawl kamertje.nl and build a KamertjeItem for every room ("kamer") page.

    Before a page is parsed, its lower-cased URL is looked up in the
    ``kamertje`` table of a local SQLite database; pages that are already
    stored there are skipped.  The spider also creates the ``kamertje`` and
    ``kamertje_img`` tables if they do not exist yet.
    """

    name = "kamertje"
    allowed_domains = ["www.kamertje.nl", "kamertje.nl"]
    start_urls = ["http://www.kamertje.nl/"]

    rules = (
        # Listing / index pages: no callback, links are only extracted.
        Rule(SgmlLinkExtractor(allow=('kamers_in', 'index.php'), unique=True)),
        # Room detail pages are handed to parse_item.  Raw string keeps the
        # regex byte-identical without relying on an invalid "\/" escape, and
        # the trailing comma makes ``allow`` a real tuple.
        Rule(SgmlLinkExtractor(allow=(r'kamers\/[0-9]',), unique=True), callback='parse_item'),
    )

    # Location of the SQLite file, relative to the working directory.
    _DATABASE_PATH = '../scrapedata.db'

    # Opened lazily by _ensure_database() and then reused for every page.
    connection = None
    cursor = None

    def _ensure_database(self):
        """Open the SQLite database and create both tables on first use.

        Later calls are no-ops, so the connection is opened once per spider
        instance instead of once per scraped page.

        Raises:
            sqlite3.Error: if the table creation fails; the half-initialised
                connection is closed before the error is re-raised.
        """
        if self.connection is not None:
            return

        connection = sqlite.connect(self._DATABASE_PATH)
        cursor = connection.cursor()
        try:
            # Tables are created here instead of in the pipeline because
            # otherwise they would never be created (original author's note).
            cursor.execute('CREATE TABLE IF NOT EXISTS kamertje ('
                           'url text PRIMARY KEY, '
                           'title text, '
                           'street text, '
                           'city text, '
                           'description text,'
                           'price text,'
                           'priceType text,'
                           'size text,'
                           'type text,'
                           'date_added datetime)')
            cursor.execute('CREATE TABLE IF NOT EXISTS kamertje_img ('
                           'kamertje_url text,'
                           'image_url text,'
                           'image_file text)')
        except sqlite.Error:
            connection.close()
            raise

        # Publish the handles only once the schema is known to exist.
        self.connection = connection
        self.cursor = cursor

    @staticmethod
    def _required_text(hxs, xpath):
        """Return the extracted first match of *xpath*; IndexError if none."""
        return hxs.select(xpath)[0].extract()

    @staticmethod
    def _optional_text(hxs, xpath):
        """Return the extracted first match of *xpath*, or '' if none."""
        try:
            return hxs.select(xpath)[0].extract()
        except IndexError:
            # The XPath matched nothing: the field is simply absent.
            return ''

    def parse_item(self, response):
        """Parse one room page into a KamertjeItem.

        Args:
            response: the Scrapy response for a room detail page.

        Returns:
            A populated KamertjeItem, or None when the page URL is already
            present in the ``kamertje`` table.

        Raises:
            IndexError: if a mandatory field (street, city, price, price
                type, size or type) cannot be found on the page.
        """
        self.log('Found kamer page: %s' % response.url)

        # Lower-cased so differently-cased URLs of the same page count as
        # duplicates (per the original author's note).
        url = response.url.lower()

        # Checking the database here costs some disk/CPU per page but avoids
        # re-processing pages that are already stored.
        self._ensure_database()
        self.cursor.execute('select * from kamertje where url=?', (url,))
        # url is the primary key, so at most one row comes back; fetchall()
        # drains the result set so the SELECT is finished before we return.
        if self.cursor.fetchall():
            log.msg("Item is already found in database : %s" % (url,))
            return None

        hxs = HtmlXPathSelector(response)
        item = KamertjeItem()
        item['url'] = url

        header = "//*/tr[@class='tableheader']/td/table/tr/td/h1/b"
        item['street'] = self._required_text(hxs, header + "/font/text()").lower()
        # The link text is split on ' in ' and the part after it is the city.
        city_text = self._required_text(hxs, header + "/a/text()")
        item['city'] = city_text.split(' in ')[1].lower().strip()

        # Title may be missing on the page.
        item['title'] = self._optional_text(
            hxs, "//*/td[contains(.,'beschrijving')]/../td[2]/b/text()")

        # Second whitespace-separated token, cut at the first comma
        # (presumably a currency sign followed by an amount - confirm).
        price_text = self._required_text(
            hxs, "//*/td[contains(.,'kamerhuur')]/../td[2]/b/text()")
        item['price'] = price_text.split()[1].split(',')[0]
        item['priceType'] = self._required_text(
            hxs, "//*/td[contains(.,'kamerhuur')]/../td[2]/text()").strip()
        item['size'] = self._required_text(
            hxs, "//*/td[contains(.,'Oppervlakte')]/../td[2]/b/text()").split()[0]
        item['type'] = self._required_text(
            hxs, "//*/td[contains(.,'Soort')]/../td[2]/b/text()")

        # Description may be missing on the page.
        item['description'] = self._optional_text(
            hxs, "//*/td[contains(.,'Omschrijving')]/../td[2]/text()")

        # We want the full-size images, not the thumbnails: drop the 't'
        # after 'room/' and make the URL absolute.  Building a new list avoids
        # rewriting the list in place via index() while iterating over it.
        thumbnail_urls = hxs.select("//*/img/@src[contains(.,'imagesroom')]").extract()
        item['image_urls'] = [
            "http://kamertje.nl/" + src.replace('room/t', 'room/')
            for src in thumbnail_urls
        ]
        # Marked "necessary" by the original author; presumably a downstream
        # images pipeline expects the field to exist - confirm.
        item['images'] = ''
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment