Skip to content

Instantly share code, notes, and snippets.

@danstowell
Created March 22, 2017 22:27
Show Gist options
  • Save danstowell/f2e05428ec62aafa77f4b24693a1a4d5 to your computer and use it in GitHub Desktop.
Save danstowell/f2e05428ec62aafa77f4b24693a1a4d5 to your computer and use it in GitHub Desktop.
Script to build Wikidata -> OpenStreetMap lookup table
import os, sys, re
from datetime import datetime
from imposm.parser import OSMParser
########################################################
# Label of the OSM extract to scan; it selects the input file name below.
# Smaller extracts that can be swapped in: 'greater-london', 'great-britain'.
osmsourcelbl = 'planet'

# Input file: ~/osm/<label>-latest.osm.pbf, with '~' expanded to the home dir.
osmsourcefpath = os.path.expanduser(
    '~/osm/' + osmsourcelbl + '-latest.osm.pbf'
)

########################################################

# A well-formed Wikidata identifier is the letter q/Q followed by digits only.
wkddatum_format_matcher = re.compile(
    r"^[qQ]\d+$"
)
class WkdRefScan(object):
    """Parse OSM data and build up a map from wkd identifiers to osm objects.

    The ``nodes``, ``ways`` and ``relations`` methods are meant to be used as
    parser callbacks: each receives an iterable of 3-tuples whose first two
    elements are the OSM id and the tag mapping of one object.

    Attributes:
        wikidatas: number of OSM objects seen that carry a ``wikidata`` tag.
        granddict: dict mapping a normalised Wikidata id (``'Q123'``) to a
            list of ``(osmtype, osmid)`` tuples referring to it.
    """

    def __init__(self):
        # State lives on the instance: a dict declared at class level would be
        # shared (and mutated) by every WkdRefScan object.
        self.wikidatas = 0
        self.granddict = {}

    def osmobjs(self, osmtype, items):
        """Record the wikidata references found on a batch of OSM objects.

        Args:
            osmtype: 'node', 'way' or 'relation'; stored with each reference
                and used to build the URL in the malformed-value message.
            items: iterable of ``(osmid, tags, otherstuff)`` tuples; the third
                element is ignored.
        """
        granddict = self.granddict
        for osmid, tags, otherstuff in items:
            if 'wikidata' not in tags:
                continue
            self.wikidatas += 1
            # A tag may hold several ';'-separated ids. Collect them in a set
            # so one object is listed at most once per Wikidata item.
            wkdrefs = set()
            for wkddatum in tags['wikidata'].split(';'):
                wkddatum = wkddatum.strip()
                # if garbage (meaning not expected format [qQ]\d+), skip
                if wkddatum_format_matcher.match(wkddatum) is None:
                    print("Malformed wikidata string: %s -- in http://www.openstreetmap.org/%s/%i" % (wkddatum, osmtype, osmid))
                    continue
                wkdrefs.add(u'Q' + wkddatum[1:])  # normalise q to Q
            for wkddatum in wkdrefs:
                granddict.setdefault(wkddatum, []).append((osmtype, osmid))

    def ways(self, items):
        """Callback for a batch of ways."""
        self.osmobjs('way', items)

    def nodes(self, items):
        """Callback for a batch of nodes."""
        self.osmobjs('node', items)

    def relations(self, items):
        """Callback for a batch of relations."""
        self.osmobjs('relation', items)

    def write(self, pathstem):
        """Write out JSON and CSV files containing the whole wkd->osm lookup.

        Creates ``<pathstem>.csv`` (one line per Wikidata id:
        ``Q1,node/1,way/2``) and ``<pathstem>.json`` (one object mapping each
        id to a list of ``[osmtype, osmid]`` pairs). Ids and their object
        lists are written in sorted order.

        Args:
            pathstem: output path without extension. A missing parent
                directory is created.
        """
        # Make sure the target directory exists so that a long parse is not
        # lost to a missing 'output/' folder.
        outdir = os.path.dirname(pathstem)
        if outdir and not os.path.isdir(outdir):
            os.makedirs(outdir)

        # Text mode: everything written below is str, which a binary-mode
        # file rejects on Python 3.
        with open("%s.csv" % pathstem, 'w') as csvfp:
            with open("%s.json" % pathstem, 'w') as jsonfp:
                jsonfp.write("{\n")
                firsteverentry = True
                for wkd, osmobjlist in sorted(self.granddict.items()):
                    # write beginning of WKD item; every JSON entry except
                    # the first is preceded by a comma
                    csvfp.write("%s" % (wkd))
                    jsonfp.write('%s"%s": [' % ('' if firsteverentry else ',', wkd))
                    for objindex, (osmtype, osmid) in enumerate(sorted(osmobjlist)):
                        csvfp.write(",%s/%i" % (osmtype, osmid))
                        jsonfp.write('%s["%s",%i]' % ('' if objindex == 0 else ', ', osmtype, osmid))
                    # write end of WKD item
                    csvfp.write("\n")
                    jsonfp.write("]\n")
                    firsteverentry = False
                jsonfp.write("}\n")
########################################################
if __name__ == '__main__':
    # Collector whose bound methods serve as the parser callbacks.
    scanner = WkdRefScan()
    callbacks = {
        'ways_callback': scanner.ways,
        'nodes_callback': scanner.nodes,
        'relations_callback': scanner.relations,
    }
    parser = OSMParser(concurrency=4, **callbacks)

    # Time the parse of the configured source file.
    started = datetime.now()
    print("Beginning parsing. Time: %s" % started)
    parser.parse(osmsourcefpath)
    elapsed = datetime.now() - started
    print("Time taken: %s" % (elapsed,))
    print("Number of wikidata tags encountered: %i" % scanner.wikidatas)

    # NOTE(review): the output stem always ends in '-gb', whatever value
    # osmsourcelbl has -- confirm that this is intended.
    scanner.write("output/wkdosm-gb")
@nicolasmaia
Copy link

Please add a license header :^)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment