A Scrapy-Splash spider for extracting semantic annotations from geographic web maps authored with Leaflet.annotate
# -*- coding: utf-8 -*-
import json
import datetime

import scrapy
from scrapy import Selector
from scrapy_splash import SplashRequest


class LeafletAnnotateSpider(scrapy.Spider):

    name = "leafletAnnotate"
    allowed_domains = ["github.io"]
    ## Scrape the only two geographic web maps authored using Leaflet.annotate (to my knowledge)
    start_urls = [
        "http://mukil.github.io/Leaflet.annotate/example/paris/index.html",
        "http://mukil.github.io/Leaflet.annotate/example/usna/index.html"
    ]

    def __init__(self, typeName=None, url=None, *args, **kwargs):
        super(LeafletAnnotateSpider, self).__init__(*args, **kwargs)
        if typeName is not None:
            self.log("LeafletAnnotateSpider to search for annotations of typeUri=https://schema.org/%s" % typeName)
            self.log("Type specific search for annotations in webmap urls NOT YET IMPLEMENTED")
        if url is not None:
            self.log("LeafletAnnotateSpider to search for web map annotations at url=%s" % url)
            self.start_urls.append(url)

    def start_requests(self):
        # Render every page through Splash so that the dynamically generated
        # annotation elements are present in the DOM before parsing.
        for url in self.start_urls:
            yield SplashRequest(url, self.parse,
                                endpoint='render.html',
                                args={'wait': 0.5},
                                dont_process_response=False)

    def parse(self, response):
        # htmlDoc = response.body
        # jsonDocument = response.data
        result = {}
        mapContainer = response.css('div.leaflet-container').extract_first()
        if mapContainer is not None:
            mapId = response.css('div.leaflet-container::attr(id)').extract_first()
            result['source'] = response.url
            result['domId'] = mapId
            # HTML-based map elements are annotated as <article> elements
            articles = response.xpath('//article[@itemtype]')
            ## Debug: self.print_annotations(articles)
            result['articleElements'] = articles.extract()
            # SVG-based map elements carry their annotations in <metadata> elements
            metadataElements = response.xpath('//metadata[@itemtype]')
            result['svgElements'] = metadataElements.extract()
            ## Debug: self.print_annotations(metadataElements)
            title = response.xpath('//title/text()').extract_first()
            result['pageTitle'] = title
            # Leaflet JS Script Href (Attempt to parse a Version Number)
            # result['leafletScriptUrl'] = response.xpath('//script/text()')[0].extract()
            filename = self.convert_to_filename(response.url)
            with open(filename, 'wt') as f:
                json.dump(result, f)
            self.log('Success: Saved semantic webmap representation (domId=%(domId)s)'
                     ' "%(title)s" in file "%(filename)s"'
                     % {'domId': mapId, 'title': title, 'filename': filename})
        else:
            self.log('WARNING: No leaflet container found on webpage "%(url)s"' % {'url': response.url})

    def convert_to_filename(self, url):
        # Derive a filesystem-friendly name from the page URL plus the parse timestamp
        parsedAt = datetime.datetime.utcnow().strftime("%I:%M:%S%p-%d-%m-%Y")
        urlendname = url.replace("http://", "")
        urlendname = urlendname.replace("https://", "")
        urlendname = urlendname.replace("/", "-")
        urlendname = urlendname.replace(".", "-")
        return 'webmap-%(filename)s-%(timestamp)s.json' % {'filename': urlendname, 'timestamp': parsedAt}

    def print_annotations(self, elements):
        # Debug helper: log the schema.org type and name of each annotated web map element
        self.log("Digging into %s web map elements for reading semantic web map annotations" % len(elements))
        for article in elements:
            fragmentSelector = Selector(text=article.extract(), type="html")
            itemTypeValue = fragmentSelector.xpath('//*[@itemtype]/@itemtype').extract_first()
            self.log('=> Web Map Element Type "%s"' % itemTypeValue)
            for fragmentProp in fragmentSelector.xpath('//meta[@itemprop]'):
                # properties = fragmentSelector.xpath('''
                #     set:difference(
                #         ./descendant::*/@itemprop,
                #         .//*[@itemscope]/*/@itemprop)''')
                if fragmentProp.xpath('@itemprop').extract_first() == "name":
                    self.log("\t=> Name %s" % fragmentProp.xpath('@content').extract_first())
Proof of System
Demonstrates the Laptop-Laboratory style of crawling geographic web maps whose DOM contains dynamically generated microdata statements. The spider extracts the statements made on all web map elements contained in the web mapping document and thereby facilitates reasoning about the dynamics of certain geographic web maps over time.
Output
Turns a Leaflet-based web map authored with Leaflet.annotate into a JSON text file which (a) carries the URL and the timestamp of parsing in its filename and (b) contains the following data structure, including all annotations on web map elements:
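The exact values depend on the crawled map; reconstructed from the keys assembled in parse() above, the saved file looks roughly like this (markup shortened to placeholders):

{
    "source": "http://mukil.github.io/Leaflet.annotate/example/paris/index.html",
    "domId": "...",
    "articleElements": ["<article itemtype=\"https://schema.org/...\"> ... </article>", "..."],
    "svgElements": ["<metadata itemtype=\"https://schema.org/...\"> ... </metadata>", "..."],
    "pageTitle": "..."
}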
Attributions
This gist is based on, and takes parts from, Handling JS in Scrapy with Splash by Richard Dowinton. Work through that article and set up your Docker container running Splash if you want to see this spider in action. And of course, the Scrapy-Splash README and its source code repository on GitHub. Additionally, the Scrapy project's docs on our first Spider and on Spiders helped a lot. And, last but not least :), this answer from SO on XPath syntax, this answer on writing a Python dictionary to a JSON text file, and this answer on how to append an element to a Python list (hint: calling .append will do the trick).
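Once Splash is running and the settings sketched above are in place, the spider can be started from the Scrapy project directory; the optional url argument is the one handled in __init__ above (the second URL below is only a placeholder for any other annotated map):

scrapy crawl leafletAnnotate
scrapy crawl leafletAnnotate -a url=http://example.org/my-annotated-map/index.html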