Skip to content

Instantly share code, notes, and snippets.

@ldgarcia
Last active June 2, 2020 18:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save ldgarcia/55c919b34ace1cae09b4ed25ae2ba594 to your computer and use it in GitHub Desktop.
Save ldgarcia/55c919b34ace1cae09b4ed25ae2ba594 to your computer and use it in GitHub Desktop.
GeoJsonItemExporter for Scrapy

About the GeoJsonItemExporter for Scrapy

The GeoJsonItemExporter creates a collection of features where each feature's geometry is a point and its properties member is the item serialized into JSON format.

Every item must have a value for latitude and longitude in order to be exported. No validation is made to ensure the data is correct.

# Based on Scrapy's JsonItemExporter
# See: https://github.com/scrapy/scrapy/blob/master/scrapy/exporters.py
from scrapy.exporters import BaseItemExporter
from scrapy.utils.serialize import ScrapyJSONEncoder
from scrapy.utils.python import to_bytes
class GeoJsonItemExporter(BaseItemExporter):
    """Export items as a GeoJSON ``FeatureCollection`` of ``Point`` features.

    Every exported item becomes one feature whose ``geometry`` is a point
    built from the item's ``longitude`` and ``latitude`` fields and whose
    ``properties`` member is the complete serialized item.  Coordinate
    values are written as-is; they are neither validated nor converted.
    """

    def __init__(self, file, **kwargs):
        """Prepare the exporter.

        Args:
            file: Object with a ``write`` method accepting bytes; the
                GeoJSON document is written to it.
            **kwargs: Options handed first to ``_configure`` and then
                forwarded to ``ScrapyJSONEncoder``.
        """
        # NOTE(review): ``self.encoding`` is presumably set by ``_configure``
        # (inherited from BaseItemExporter) - confirm against the installed
        # Scrapy version.
        self._configure(kwargs, dont_fail=True)
        self.file = file
        # Default to ASCII-escaped output only when no encoding was chosen.
        kwargs.setdefault('ensure_ascii', not self.encoding)
        self.encoder = ScrapyJSONEncoder(**kwargs)
        # Tracks whether a ",\n" separator is needed before the next feature.
        self.first_item = True

    def start_exporting(self):
        """Write the opening of the feature collection."""
        self.file.write(b'{ "type": "FeatureCollection","features":[\n')

    def finish_exporting(self):
        """Write the closing of the feature collection."""
        self.file.write(b"\n]}")

    def export_item(self, item):
        """Append *item* to the collection as a point feature.

        The feature is fully serialized before anything is written, so an
        item that cannot be exported leaves the output and the separator
        state untouched.

        Raises:
            KeyError: If the serialized item has no ``longitude`` or
                ``latitude`` field.
        """
        fields = dict(self._get_serialized_fields(item))
        feature = {
            'type': 'Feature',
            'geometry': {
                'type': 'Point',
                # GeoJSON positions are ordered longitude first, then latitude.
                'coordinates': [fields['longitude'], fields['latitude']],
            },
            'properties': fields,
        }
        payload = to_bytes(self.encoder.encode(feature), self.encoding)

        # Only touch the file once serialization has succeeded; otherwise a
        # failing item would leave a dangling separator in the output.
        if self.first_item:
            self.first_item = False
        else:
            self.file.write(b',\n')
        self.file.write(payload)
from crawler.exporters import GeoJsonItemExporter
class GeoJSONWriterPipeline(object):
    """Item pipeline that writes each spider's located items to a GeoJSON file.

    One output file and one ``GeoJsonItemExporter`` are kept per spider,
    keyed by ``spider.name``.  Items lacking a latitude or a longitude are
    passed through without being exported.
    """

    def __init__(self):
        # Both mappings are keyed by spider name.
        self.files = dict()
        self.exporters = dict()

    @staticmethod
    def _has_coordinate(value):
        """Return True when *value* is a usable coordinate.

        ``None`` and the empty string count as missing.  Zero is a valid
        coordinate (equator / prime meridian) and must not be rejected.
        """
        return value is not None and value != ''

    def open_spider(self, spider):
        """Open ``/data/<spider.name>.geojson`` and start the export.

        The file mode is read from the ``'mode'`` entry of the spider's
        ``_meta`` mapping and defaults to ``'wb'`` when the spider has no
        ``_meta`` attribute or no such entry.
        """
        file_path = '/data/{}.geojson'.format(spider.name)
        meta = getattr(spider, '_meta', None) or {}
        output = open(file_path, meta.get('mode', 'wb'))
        try:
            exporter = GeoJsonItemExporter(output)
            exporter.start_exporting()
        except Exception:
            # Do not leak the handle when the exporter cannot be set up.
            output.close()
            raise
        self.files[spider.name] = output
        self.exporters[spider.name] = exporter

    def process_item(self, item, spider):
        """Export *item* if it carries both coordinates; always return it."""
        if (self._has_coordinate(item.get('latitude'))
                and self._has_coordinate(item.get('longitude'))):
            self.exporters[spider.name].export_item(item)
        return item

    def close_spider(self, spider):
        """Finish the spider's export and close its output file."""
        exporter = self.exporters.pop(spider.name)
        output = self.files.pop(spider.name)
        try:
            exporter.finish_exporting()
        finally:
            # Close the file even if writing the closing bytes fails.
            output.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment