# danstowell/wikidata_scan_osm.py

## wikidata_scan_osm.py
import os, sys, re
from datetime import datetime
from imposm.parser import OSMParser

########################################################

# Label of the OSM extract to scan; it selects ~/osm/<label>-latest.osm.pbf.
# Uncomment one of the alternatives to scan a smaller extract instead.
#osmsourcelbl = 'greater-london'
#osmsourcelbl = 'great-britain'
osmsourcelbl = 'planet'

osmsourcefpath = os.path.expanduser('~/osm/%s-latest.osm.pbf' % osmsourcelbl)

########################################################

# One wikidata item id: "Q" or "q" followed by one or more ASCII digits.
# [0-9] is spelled out instead of \d because, on Python 3, \d in a str pattern
# also matches non-ASCII Unicode digits, which are not valid in an item id.
wkddatum_format_matcher = re.compile(r"^[qQ][0-9]+$")

class WkdRefScan(object):
	"""Parse OSM data and build up a map from wkd identifiers to osm objects.

	The ways/nodes/relations methods are intended to be passed to the parser
	as callbacks; each receives an iterable of (osmid, tags, other) tuples.

	Attributes:
		wikidatas: number of objects seen that carried a 'wikidata' tag.
		granddict: dict mapping a normalised identifier (e.g. u'Q42') to a
			list of (osmtype, osmid) tuples referencing it.
	"""

	def __init__(self):
		# Per-instance state. These used to be class attributes, which made
		# every instance share (and mutate) one and the same granddict.
		self.wikidatas = 0
		self.granddict = {}

	def osmobjs(self, osmtype, items):
		"""Record the wikidata references found in a batch of OSM objects.

		Args:
			osmtype: 'way', 'node' or 'relation'; stored with each reference
				and used in the URL of the malformed-value message.
			items: iterable of (osmid, tags, other) tuples; the third
				element is ignored.

		A 'wikidata' tag value is split on ';'. Pieces that do not match
		wkddatum_format_matcher are reported on stdout and skipped.
		"""
		granddict = self.granddict
		for osmid, tags, otherstuff in items:
			if 'wikidata' not in tags:
				continue
			self.wikidatas += 1
			# A set, so an identifier repeated within one tag value is only
			# recorded once for this object.
			wkdrefs = set()
			for wkddatum in tags['wikidata'].split(';'):
				wkddatum = wkddatum.strip()
				# if garbage (meaning not the expected format), skip
				if wkddatum_format_matcher.match(wkddatum) is None:
					print("Malformed wikidata string: %s -- in http://www.openstreetmap.org/%s/%i" % (wkddatum, osmtype, osmid))
					continue
				wkdrefs.add(u'Q' + wkddatum[1:])  # normalise q to Q
			for wkddatum in wkdrefs:
				granddict.setdefault(wkddatum, []).append((osmtype, osmid))

	def ways(self, items):
		"Callback for batches of ways."
		self.osmobjs('way', items)

	def nodes(self, items):
		"Callback for batches of nodes."
		self.osmobjs('node', items)

	def relations(self, items):
		"Callback for batches of relations."
		self.osmobjs('relation', items)

	def write(self, pathstem):
		"""Write out JSON and CSV files containing the whole wkd->osm lookup data.

		Creates "<pathstem>.csv" and "<pathstem>.json". Identifiers are written
		in sorted order, and so are the objects listed for each identifier.

		CSV: one line per identifier: Q42,node/1,way/2
		JSON: one object; each value is a list of [osmtype, osmid] pairs.

		Args:
			pathstem: output path without extension; its directory must exist.
		"""
		with open("%s.csv" % pathstem, 'wb') as csvfp:
			with open("%s.json" % pathstem, 'wb') as jsonfp:
				# The files are binary, so text is encoded explicitly: passing
				# text straight to write() raises TypeError on Python 3.
				jsonfp.write(b"{\n")
				for entryindex, (wkd, osmobjlist) in enumerate(sorted(self.granddict.items())):
					osmobjs = sorted(osmobjlist)
					csvline = "%s%s\n" % (wkd, "".join(",%s/%i" % osmobj for osmobj in osmobjs))
					# Every entry but the first starts with the comma that
					# separates it from the previous one.
					jsonline = '%s"%s": [%s]\n' % (
						"," if entryindex else "",
						wkd,
						", ".join('["%s",%i]' % osmobj for osmobj in osmobjs),
					)
					csvfp.write(csvline.encode('utf-8'))
					jsonfp.write(jsonline.encode('utf-8'))
				jsonfp.write(b"}\n")

########################################################
if __name__ == '__main__':
	# Script entry point: scan the configured OSM extract and write the
	# wikidata -> OSM lookup as <stem>.csv and <stem>.json.
	# NOTE(review): the stem says "gb" although osmsourcelbl may name another
	# extract; left unchanged in case other tooling expects it -- confirm.
	outputstem = "output/wkdosm-gb"

	# Create the output directory before parsing, so that a missing directory
	# cannot make the final write fail after the (long) parse has completed.
	outputdir = os.path.dirname(outputstem)
	if outputdir and not os.path.isdir(outputdir):
		os.makedirs(outputdir)

	refobj = WkdRefScan()
	p = OSMParser(concurrency=4, ways_callback=refobj.ways, nodes_callback=refobj.nodes, relations_callback=refobj.relations)
	startTime = datetime.now()
	print("Beginning parsing. Time: %s" % startTime)
	p.parse(osmsourcefpath)
	endtime = datetime.now()
	print("Time taken: %s" % (endtime - startTime,))
	print("Number of wikidata tags encountered: %i" % refobj.wikidatas)
	refobj.write(outputstem)