@mukil
Last active March 30, 2017 15:12
A Scrapy-Splash spider for extracting semantic annotations from geographic web maps authored with Leaflet.annotate
# -*- coding: utf-8 -*-
import json
import datetime

import scrapy
from scrapy import Selector
from scrapy_splash import SplashRequest


class LeafletAnnotateSpider(scrapy.Spider):

    name = "leafletAnnotate"
    allowed_domains = ["github.io"]
    ## Scrape the only two geographic web maps authored using Leaflet.annotate
    ## (to my knowledge)
    start_urls = [
        "http://mukil.github.io/Leaflet.annotate/example/paris/index.html",
        "http://mukil.github.io/Leaflet.annotate/example/usna/index.html"
    ]

    def __init__(self, typeName=None, url=None, *args, **kwargs):
        super(LeafletAnnotateSpider, self).__init__(*args, **kwargs)
        if typeName is not None:
            self.log("LeafletAnnotateSpider to search for annotations of "
                     "typeUri=https://schema.org/%s" % typeName)
            self.log("Type-specific search for annotations in web map urls "
                     "NOT YET IMPLEMENTED")
        if url is not None:
            self.log("LeafletAnnotateSpider to search for web map annotations "
                     "at url=%s" % url)
            self.start_urls.append(url)

    def start_requests(self):
        for url in self.start_urls:
            # Render each page in Splash so the Leaflet map (and the microdata
            # Leaflet.annotate writes into the DOM) exists before parsing.
            yield SplashRequest(url, self.parse,
                                endpoint='render.html',
                                args={'wait': 0.5},
                                dont_process_response=False)

    def parse(self, response):
        result = {}
        map_container = response.css('div.leaflet-container').extract_first()
        if map_container is not None:
            mapId = response.css('div.leaflet-container::attr(id)').extract()
            result['source'] = response.url
            result['domId'] = mapId
            # HTML overlay elements (e.g. popups) annotated with microdata
            articles = response.xpath('//article[@itemtype]')
            ## Debug: self.print_annotations(articles)
            result['articleElements'] = articles.extract()
            # SVG map elements (markers, paths) annotated via <metadata> children
            metadataelements = response.xpath('//metadata[@itemtype]')
            result['svgElements'] = metadataelements.extract()
            ## Debug: self.print_annotations(metadataelements)
            title = response.xpath('//title/text()')[0].extract()
            result['pageTitle'] = title
            # Leaflet JS script href (attempt to parse a version number)
            # result['leafletScriptUrl'] = response.xpath('//script/text()')[0].extract()
            filename = self.convert_to_filename(response.url)
            with open(filename, 'wt') as f:
                json.dump(result, f)
            self.log('Success: Saved semantic webmap representation (domId=%(domId)s)'
                     ' "%(title)s" in file "%(filename)s"'
                     % {'domId': mapId, 'title': title, 'filename': filename})
        else:
            self.log('WARNING: No leaflet container found on webpage "%(url)s"'
                     % {'url': response.url})

    def convert_to_filename(self, url):
        # Note: "%I:%M:%S%p" puts ":" characters into the file name, which is
        # not allowed on every platform (e.g. Windows).
        parsedAt = datetime.datetime.utcnow().strftime("%I:%M:%S%p-%d-%m-%Y")
        urlendname = url.replace("http://", "")
        urlendname = urlendname.replace("https://", "")
        urlendname = urlendname.replace("/", "-")
        urlendname = urlendname.replace(".", "-")
        return 'webmap-%(filename)s-%(timestamp)s.json' % {
            'filename': urlendname, 'timestamp': parsedAt}

    def print_annotations(self, elements):
        self.log("Digging into %s web map elements for reading semantic "
                 "web map annotations" % len(elements))
        for article in elements:
            fragmentSelector = Selector(text=article.extract(), type="html")
            itemTypeValue = fragmentSelector.xpath(
                '(//*[@itemtype])[1]/@itemtype').extract_first()
            self.log("=> Web Map Element Type \"%s\"" % itemTypeValue)
            for fragmentProp in fragmentSelector.xpath('//meta[@itemprop]'):
                # Compare the @itemprop attribute value; the original compared
                # the serialized tag with "is", which never matches.
                if fragmentProp.xpath('@itemprop').extract_first() == "name":
                    self.log("\t=> Name %s"
                             % fragmentProp.xpath('@content').extract_first())
mukil commented Mar 30, 2017

Proof of Concept

Demonstrates a laptop-laboratory style of crawling geographic web maps that contain dynamically generated microdata statements in the DOM. The spider extracts the statements made on all web map elements contained in the web mapping document and thereby facilitates reasoning about how certain geographic web maps change over time.

Output

Turns a Leaflet-based web map authored with Leaflet.annotate into a JSON text file which (a) carries the URL and the timestamp of parsing in its filename and (b) contains the following data structure, including all annotations on web map elements:

{
	"svgElements": [
             "<metadata itemscope=\"\" itemtype=\"http://schema.org/CreativeWork\" data-internal-leaflet-id=\"26\"><meta itemprop=\"name\" content=\"The circle marker indicating geographically where this meta poem was conceived.\"><g itemscope=\"\" itemtype=\"http://schema.org/Place\" itemprop=\"locationCreated\"><g itemprop=\"geo\" itemtype=\"http://schema.org/GeoCoordinates\" itemscope=\"\"><meta itemprop=\"latitude\" content=\"40.573112\"><meta itemprop=\"longitude\" content=\"-73.98074\"></g></g></metadata>",
             "<metadata itemscope=\"\" itemtype=\"http://schema.org/Person\" data-internal-leaflet-id=\"27\"><meta itemprop=\"name\" content=\"Bernie Sanders.\"><meta itemprop=\"description\" content=\"Running for the White House, USA - Democratic Presidential Candidate 2016\"><meta itemprop=\"sameAs\" content=\"https://www.wikidata.org/wiki/Q359442\"><g itemscope=\"\" itemtype=\"http://schema.org/Place\" itemprop=\"workLocation\"><g itemprop=\"geo\" itemtype=\"http://schema.org/GeoCoordinates\" itemscope=\"\"><meta itemprop=\"latitude\" content=\"44.478344\"><meta itemprop=\"longitude\" content=\"-73.213295\"></g></g></metadata>",
             "<metadata itemscope=\"\" itemtype=\"http://schema.org/Organization\" data-internal-leaflet-id=\"28\"><meta itemprop=\"name\" content=\"Google Inc\"><g itemscope=\"\" itemtype=\"http://schema.org/Place\" itemprop=\"location\"><g itemprop=\"geo\" itemtype=\"http://schema.org/GeoCoordinates\" itemscope=\"\"><meta itemprop=\"latitude\" content=\"37.422436\"><meta itemprop=\"longitude\" content=\"-122.084057\"></g></g></metadata>"],
	"articleElements": [],
	"source": "http://mukil.github.io/Leaflet.annotate/example/usna/index.html",
	"pageTitle": "Usage Example - Semantic Markup for LeafletJS Marker, Popups and Overlay",
	"domId": ["map"]
}
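
As a rough sketch of how such a snapshot could be consumed afterwards (the file name below is a hypothetical placeholder; real names follow the pattern produced by convert_to_filename), the serialized fragments in svgElements can be re-parsed with parsel, which is installed as a Scrapy dependency:

import json

from parsel import Selector  # ships as a Scrapy dependency

# Hypothetical snapshot file name, standing in for one the spider wrote
snapshot = 'webmap-example-snapshot.json'

with open(snapshot) as f:
    result = json.load(f)

# Each svgElements entry is a serialized <metadata> fragment carrying
# schema.org microdata; re-parse it to recover individual statements.
for fragment in result['svgElements']:
    sel = Selector(text=fragment, type='html')
    item_type = sel.xpath('(//*[@itemtype])[1]/@itemtype').extract_first()
    name = sel.xpath('//meta[@itemprop="name"]/@content').extract_first()
    print(item_type, name)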

Attributions

This gist builds on and adapts parts of Handling JS in Scrapy with Splash by Richard Dowinton. Work through that article and set up a Docker container running Splash if you want to see this spider in action (a minimal setup sketch follows below). The Scrapy-Splash README and source code repo on GitHub were essential as well. Additionally, the docs Our first Spider and Spiders from the Scrapy project helped a lot. And, last but not least :), this answer from SO on XPath syntax, this answer on writing a Python dictionary to a JSON text file, and this answer on how to append an element to a Python list (hint: calling .append will do the trick).
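
For convenience, a minimal setup sketch following the scrapy-splash README: start Splash in Docker on its default port 8050, then add the middleware settings the README documents to the Scrapy project's settings.py:

docker pull scrapinghub/splash
docker run -p 8050:8050 scrapinghub/splash

# settings.py -- scrapy-splash wiring as documented in its README
SPLASH_URL = 'http://localhost:8050'
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'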
