A Scrapy-Splash spider for extracting semantic annotations from geographic web maps authored with Leaflet.annotate
# -*- coding: utf-8 -*-
import json
import datetime

import scrapy
from scrapy import Selector
from scrapy_splash import SplashRequest


class LeafletAnnotateSpider(scrapy.Spider):

    name = "leafletAnnotate"
    allowed_domains = ["github.io"]
    ## Scrape the only two geographic web maps authored using Leaflet.annotate (to my knowledge)
    start_urls = [
        "http://mukil.github.io/Leaflet.annotate/example/paris/index.html",
        "http://mukil.github.io/Leaflet.annotate/example/usna/index.html"
    ]

    def __init__(self, typeName=None, url=None, *args, **kwargs):
        super(LeafletAnnotateSpider, self).__init__(*args, **kwargs)
        if typeName is not None:
            self.log("LeafletAnnotateSpider to search for annotations of typeUri=https://schema.org/%s" % typeName)
            self.log("Type specific search for annotations in webmap urls NOT YET IMPLEMENTED")
        if url is not None:
            self.log("LeafletAnnotateSpider to search for web map annotations at url=%s" % url)
            self.start_urls.append(url)

    def start_requests(self):
        # Render every page through Splash so that the dynamically generated
        # annotation elements are present in the DOM before parsing.
        for url in self.start_urls:
            yield SplashRequest(url, self.parse,
                                endpoint='render.html',
                                args={'wait': 0.5},
                                dont_process_response=False)

    def parse(self, response):
        # htmlDoc = response.body
        # jsonDocument = response.data
        result = {}
        mapContainer = response.css('div.leaflet-container').extract_first()
        if mapContainer is not None:
            mapId = response.css('div.leaflet-container::attr(id)').extract_first()
            result['source'] = response.url
            result['domId'] = mapId
            # HTML-based map elements are annotated as <article> elements
            articles = response.xpath('//article[@itemtype]')
            ## Debug: self.print_annotations(articles)
            result['articleElements'] = articles.extract()
            # SVG-based map elements carry their annotations in <metadata> elements
            metadataElements = response.xpath('//metadata[@itemtype]')
            result['svgElements'] = metadataElements.extract()
            ## Debug: self.print_annotations(metadataElements)
            title = response.xpath('//title/text()').extract_first()
            result['pageTitle'] = title
            # Leaflet JS Script Href (Attempt to parse a Version Number)
            # result['leafletScriptUrl'] = response.xpath('//script/text()')[0].extract()
            filename = self.convert_to_filename(response.url)
            with open(filename, 'wt') as f:
                json.dump(result, f)
            self.log('Success: Saved semantic webmap representation (domId=%(domId)s)'
                     ' "%(title)s" in file "%(filename)s"'
                     % {'domId': mapId, 'title': title, 'filename': filename})
        else:
            self.log('WARNING: No leaflet container found on webpage "%(url)s"' % {'url': response.url})

    def convert_to_filename(self, url):
        # Derive a filesystem-friendly name from the page URL plus the parse timestamp
        parsedAt = datetime.datetime.utcnow().strftime("%I:%M:%S%p-%d-%m-%Y")
        urlendname = url.replace("http://", "")
        urlendname = urlendname.replace("https://", "")
        urlendname = urlendname.replace("/", "-")
        urlendname = urlendname.replace(".", "-")
        return 'webmap-%(filename)s-%(timestamp)s.json' % {'filename': urlendname, 'timestamp': parsedAt}

    def print_annotations(self, elements):
        # Debug helper: log the schema.org type and name of each annotated web map element
        self.log("Digging into %s web map elements for reading semantic web map annotations" % len(elements))
        for article in elements:
            fragmentSelector = Selector(text=article.extract(), type="html")
            itemTypeValue = fragmentSelector.xpath('//*[@itemtype]/@itemtype').extract_first()
            self.log('=> Web Map Element Type "%s"' % itemTypeValue)
            for fragmentProp in fragmentSelector.xpath('//meta[@itemprop]'):
                # properties = fragmentSelector.xpath('''
                #     set:difference(
                #         ./descendant::*/@itemprop,
                #         .//*[@itemscope]/*/@itemprop)''')
                if fragmentProp.xpath('@itemprop').extract_first() == "name":
                    self.log("\t=> Name %s" % fragmentProp.xpath('@content').extract_first())
Proof of System
Demonstrates the Laptop-Laboratory style of crawling geographic web maps whose DOM contains dynamically generated microdata statements. The spider extracts the statements made on all web map elements contained in the web mapping document and thereby facilitates reasoning about the dynamics of certain geographic web maps over time.
Output
Turns a Leaflet-based web map authored with Leaflet.annotate into a JSON text file which (a) carries the URL and the timestamp of parsing in its filename and (b) contains the following data structure, including all annotations on web map elements:
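The exact values depend on the crawled map; reconstructed from the keys assembled in parse() above, the saved file looks roughly like this (markup shortened to placeholders):

{
    "source": "http://mukil.github.io/Leaflet.annotate/example/paris/index.html",
    "domId": "...",
    "articleElements": ["<article itemtype=\"https://schema.org/...\"> ... </article>", "..."],
    "svgElements": ["<metadata itemtype=\"https://schema.org/...\"> ... </metadata>", "..."],
    "pageTitle": "..."
}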
Attributions
This gist is based on, and takes parts from, Handling JS in Scrapy with Splash by Richard Dowinton. Work through that article and set up your Docker container running Splash if you want to see this spider in action. And of course, the Scrapy-Splash README and its source code repository on GitHub. Additionally, the Scrapy project's docs on our first Spider and on Spiders helped a lot. And, last but not least :), this answer from SO on XPath syntax, this answer on writing a Python dictionary to a JSON text file, and this answer on how to append an element to a Python list (hint: calling .append will do the trick).
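Once Splash is running and the settings sketched above are in place, the spider can be started from the Scrapy project directory; the optional url argument is the one handled in __init__ above (the second URL below is only a placeholder for any other annotated map):

scrapy crawl leafletAnnotate
scrapy crawl leafletAnnotate -a url=http://example.org/my-annotated-map/index.html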