Skip to content

Instantly share code, notes, and snippets.

@nuklea
Created July 4, 2017 03:27
Show Gist options
  • Save nuklea/9bf765ae0b8ef95e84fed56900fa5b89 to your computer and use it in GitHub Desktop.
Save nuklea/9bf765ae0b8ef95e84fed56900fa5b89 to your computer and use it in GitHub Desktop.
class GeocoderPipeline(object):
    """Item pipeline that fills ``address`` and ``coordinates`` on an item.

    When an ``Advert`` row with the same ``source``/``remote_id`` already has
    coordinates and its ``remote_address`` equals the item's, the stored
    values are copied onto the item. Otherwise a request is sent to the
    Yandex geocoder endpoint and the answer is handled by ``parse_response``.
    """

    # XML namespace prefixes registered on the selector before running XPath.
    namespaces = {
        'gml': 'http://www.opengis.net/gml',
        'ymaps': 'http://maps.yandex.ru/ymaps/1.x',
        'geocoder': 'http://maps.yandex.ru/geocoder/1.x',
    }

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline and attach ``crawler`` to it.

        ``cls.from_settings(crawler.settings)`` is used when the class
        provides ``from_settings``; otherwise the class is instantiated
        without arguments. The presence of ``from_settings`` is checked
        explicitly (rather than by catching ``AttributeError`` around the
        call) so that an ``AttributeError`` raised inside ``from_settings``
        propagates instead of silently yielding an unconfigured pipeline.
        """
        factory = getattr(cls, 'from_settings', None)
        pipe = factory(crawler.settings) if factory is not None else cls()
        pipe.crawler = crawler
        return pipe

    def process_item(self, item, spider):
        """Return ``item`` with address data, or a deferred producing it.

        The item is returned directly when a previously stored advert with
        coordinates has the same ``remote_address``. Otherwise the result of
        ``self.crawler.engine.download(...)`` is returned with
        ``parse_response`` attached as its callback.
        """
        previous = (
            Advert.objects
            .filter(source=item['source'], remote_id=item['remote_id'])
            .exclude(coordinates__isnull=True)
            .values('coordinates', 'address', 'remote_address')
            .first()
        )
        if previous and item['remote_address'] == previous['remote_address']:
            # Address unchanged: reuse the stored result, skip the geocoder.
            item['address'] = previous['address']
            item['coordinates'] = previous['coordinates']
            return item

        dfd = self.crawler.engine.download(self.get_request(item), spider)
        return dfd.addCallback(self.parse_response, item)

    def get_request(self, item):
        """Build the GET request to the geocoder for ``item['remote_address']``."""
        qs = {
            'geocode': 'Россия, Красноярский край, Красноярск, {remote_address}'.format(**item),
            'lang': 'ru-RU',
        }
        return FormRequest('https://geocode-maps.yandex.ru/1.x/', method='GET', formdata=qs)

    @staticmethod
    def _first_text(selector, query):
        """Return the first string extracted by ``query``, or ``None`` if none."""
        values = selector.xpath(query).extract()
        return values[0] if values else None

    def parse_response(self, response, item):
        """Copy the geocoder's answer from ``response`` onto ``item``.

        Sets ``item['address']`` from the matched GeoObject's ``gml:name``.
        ``item['coordinates']`` is built from the item's own
        ``longitude``/``latitude`` when both are truthy, otherwise from the
        GeoObject's ``gml:Point/gml:pos`` text.

        Raises:
            DropItem: when no GeoObject of kind "house" is present, or the
                matched GeoObject lacks the name or position node that is
                needed.
        """
        selector = response.selector
        for prefix, uri in self.namespaces.items():
            selector.register_namespace(prefix, uri)

        # We need the coordinates of a house, not of a street and so on.
        geo_object = selector.xpath(
            '(//geocoder:kind[text()="house"]//ancestor::ymaps:GeoObject)[1]')
        if not geo_object:
            # Skip adverts with a malformed address.
            raise DropItem('Location "{remote_address}" does not exist'.format(**item))

        # Always store the normalized address.
        address = self._first_text(geo_object, './gml:name/text()')
        if address is None:
            # Drop cleanly instead of failing with IndexError on an empty result.
            raise DropItem(
                'Location "{remote_address}" has no name in geocoder response'.format(**item))
        item['address'] = address

        if all(item.get(key) for key in ('longitude', 'latitude')):
            item['coordinates'] = Point(item['longitude'], item['latitude'])
        else:
            pos = self._first_text(geo_object, './gml:Point/gml:pos/text()')
            if pos is None:
                raise DropItem(
                    'Location "{remote_address}" has no position in geocoder response'.format(**item))
            item['coordinates'] = Point(*map(float, pos.split()))
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment