Skip to content

Instantly share code, notes, and snippets.

@carlessanagustin
Forked from motanelu/house.py
Last active May 13, 2016 07:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save carlessanagustin/84a022df42ad4fbf71ccec577b7e5850 to your computer and use it in GitHub Desktop.
Save carlessanagustin/84a022df42ad4fbf71ccec577b7e5850 to your computer and use it in GitHub Desktop.
Finding a house with python - from: https://tudorbarbu.ninja/hacking-the-bcn-house-market/
import scrapy
import peewee
import re
import urllib
import cStringIO
from PIL import Image
from playhouse.db_url import connect
db = connect('mysql://root:@127.0.0.1/house')
def dhash(image, hash_size = 8):
image = image.convert('L').resize(
(hash_size + 1, hash_size),
Image.ANTIALIAS,
)
pixels = list(image.getdata())
difference = []
for row in xrange(hash_size):
for col in xrange(hash_size):
pixel_left = image.getpixel((col, row))
pixel_right = image.getpixel((col + 1, row))
difference.append(pixel_left > pixel_right)
decimal_value = 0
hex_string = []
for index, value in enumerate(difference):
if value:
decimal_value += 2**(index % 8)
if (index % 8) == 7:
hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
decimal_value = 0
return ''.join(hex_string)
def hamming_distance(s1, s2):
if len(s1) != len(s2):
raise ValueError('Undefined for sequences of unequal length')
return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))
class House(peewee.Model):
bathrooms = peewee.IntegerField()
floor = peewee.IntegerField()
price = peewee.IntegerField()
rooms = peewee.IntegerField()
external = peewee.IntegerField(db_column='external_id', unique=True)
rejected = peewee.IntegerField(index=True, null=True)
street = peewee.TextField()
url = peewee.CharField()
class Meta:
database = db
class Picture(peewee.Model):
hash = peewee.CharField()
house = peewee.ForeignKeyField(House, related_name='pictures')
class Meta:
database = db
class HouseSpider(scrapy.Spider):
name = 'HouseSpider'
start_urls = [
'http://fotocasa.es/search/results.aspx?opi=36&bti=2&bsm=&tti=3&prchti=2&cu=es-ES&mode=1&maxp=600&minr=2&fs=true&ts=Barcelona%2520Capital&l=724,9,8,232,376,8019,0,0,0'
]
def parse(self, response):
for li in response.css('div#photo-content ul.listPhotos li'):
url, external_id = self.extract_url_info(
li.css(
'div.property-information a.property-location'
)
.xpath('@href')
.extract()[0]
)
try:
House.get(House.external == external_id)
except peewee.DoesNotExist:
yield scrapy.Request(
url=url,
callback=self.parse_secondary_request,
meta={
'url' : url,
'external_id' : external_id
}
)
def parse_secondary_request(self, response):
rooms = response.css('#litRooms b::text').extract()[0]
try:
bathrooms = response.css('#litBaths b::text').extract()[0]
except:
bathrooms = 0
street = response.css('h1.property-title::text').extract()[0].strip()
comma = street.find(',')
street = street[9:comma if comma != -1 else len(street)]
price = int(re.search(r'\d+', response.css('#priceContainer::text').extract()[0]).group())
try:
floor = int(re.search(r'\d+', response.css('#litFloor::text').extract()[0]).group())
except:
floor = 0
duplicate = None
insert = False
hashes = self.get_image_hashes(response)
houses = House.select(House, Picture).join(Picture).where(
House.rooms == rooms,
House.bathrooms == bathrooms,
House.floor == floor
)
def find(houses, hashes):
for house in houses:
for picture in house.pictures:
for hash in hashes:
if hamming_distance(picture.hash, hash) <= 10:
return house
return None
if houses.count() == 0:
duplicate = None
else:
duplicate = find(houses, hashes)
if duplicate == None:
insert = True
else:
if duplicate.price > price:
duplicate.delete_instance(True)
insert = True
if insert == True:
house = House(
bathrooms = bathrooms,
floor = floor,
price = price,
rooms = rooms,
external = response.meta['external_id'],
street = street,
url = response.meta['url']
)
house.save()
for hash in hashes:
picture = Picture(
house = house.id,
hash = hash
)
picture.save()
def get_image_hashes(self, response):
hashes = []
for img in response.css('.carousel_slide img'):
path = img.xpath('@data-src').extract()
if len(path) == 0:
path = img.xpath('@src').extract()
path = path[0]
path = cStringIO.StringIO(urllib.urlopen(path).read())
hashes.append(dhash(Image.open(path)))
return hashes
def extract_url_info(self, long_url):
url = long_url[0:long_url.find('?')]
external_id = url[url.rfind('-') + 1:]
return url, external_id
-- --------------------------------------------------------
--
-- Table structure for table `house`
--
CREATE TABLE IF NOT EXISTS `house` (
`id` int(10) unsigned NOT NULL,
`external_id` int(10) unsigned NOT NULL,
`rejected` tinyint(1) DEFAULT '0',
`url` varchar(256) COLLATE utf16_bin NOT NULL,
`rooms` int(11) NOT NULL,
`bathrooms` int(11) NOT NULL,
`floor` int(11) NOT NULL,
`street` text COLLATE utf16_bin NOT NULL,
`price` int(11) NOT NULL
) ENGINE=InnoDB AUTO_INCREMENT=22 DEFAULT CHARSET=utf16 COLLATE=utf16_bin;
-- --------------------------------------------------------
--
-- Table structure for table `picture`
--
CREATE TABLE IF NOT EXISTS `picture` (
`id` int(10) unsigned NOT NULL,
`house_id` int(10) unsigned NOT NULL,
`hash` varchar(255) COLLATE utf16_bin NOT NULL
) ENGINE=InnoDB AUTO_INCREMENT=204 DEFAULT CHARSET=utf16 COLLATE=utf16_bin;
--
-- Indexes for dumped tables
--
--
-- Indexes for table `house`
--
ALTER TABLE `house`
ADD PRIMARY KEY (`id`), ADD UNIQUE KEY `external_id` (`external_id`), ADD KEY `rejected` (`rejected`);
--
-- Indexes for table `picture`
--
ALTER TABLE `picture`
ADD PRIMARY KEY (`id`), ADD KEY `house_id` (`house_id`);
--
-- AUTO_INCREMENT for dumped tables
--
--
-- AUTO_INCREMENT for table `house`
--
ALTER TABLE `house`
MODIFY `id` int(10) unsigned NOT NULL AUTO_INCREMENT,AUTO_INCREMENT=22;
--
-- AUTO_INCREMENT for table `picture`
--
ALTER TABLE `picture`
MODIFY `id` int(10) unsigned NOT NULL AUTO_INCREMENT,AUTO_INCREMENT=204;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment