Created
January 25, 2018 20:51
-
-
Save stepps00/b7b1f936d10a58e1d7118b4f2e1c4548 to your computer and use it in GitHub Desktop.
Import script to add mesoshapes to WOF
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# run original SHP file thru MapShaper to add label position (mps_y, mps_x) columns | |
# | |
# first add label position via mapshaper: | |
# mapshaper input.shp encoding=utf8 -each 'mps_x=$.innerX, mps_y=$.innerY' -o import_via_mapshaper.shp | |
# full example: | |
# mapshaper /usr/local/mapzen/countries/Chile/Admin_2/Chile_admin2.shp encoding=utf8 -each 'mps_x=$.innerX, mps_y=$.innerY' -o chile_adm2_via_mapshaper.shp | |
# | |
# now convert that SHP to GeoJSON format, which is easier to load into Python | |
# ogr2ogr -F GeoJSON converted.geojson import_via_mapshaper.shp | |
# full example: | |
# ogr2ogr -F GeoJSON chile_adm2_via_mapshaper.geojson chile_adm2_via_mapshaper.shp | |
# | |
# then run this script, like: | |
# python apply_wof_id_to_martin_shapes_using_wof_api.py chile_adm2_via_mapshaper.geojson county 85633057 CL meso | |
import sys | |
import os | |
import logging | |
import optparse | |
import json | |
import geojson | |
import pprint | |
import mapzen.whosonfirst.api.client | |
import mapzen.whosonfirst.utils | |
import mapzen.whosonfirst.export | |
import editdistance | |
logging.basicConfig(level=logging.INFO) | |
if __name__ == '__main__':

    # Command-line interface.  Historical invocation (positional form):
    #   python apply_wof_id_to_martin_shapes_using_wof_api.py chile_adm2_via_mapshaper.geojson county 85633057 CL meso
    # The scraped copy of this script had lost all indentation (and carried
    # trailing "| |" render artifacts); this reconstructs the runnable form.
    opt_parser = optparse.OptionParser()
    opt_parser.add_option('-i', '--input', dest='input', action='store', default=None, help='Where to read GeoJSON import file from')
    opt_parser.add_option('-o', '--output', dest='output', action='store', default="/usr/local/mapzen/whosonfirst-data/data", help='Where to write WOF records to')
    opt_parser.add_option('-g', '--debug', dest='debug', action='store', default=None, help='Where to write debug GeoJSON (with wof:id added)')
    opt_parser.add_option('-k', '--skip-wof-api', dest='skip', action='store_true', default=None, help='Skip running WOF API to look for existing features.')
    opt_parser.add_option('-p', '--placetype', dest='placetype', action='store', default=None, help='What WOF placetype')
    opt_parser.add_option('-c', '--country_id', dest='country_id', action='store', default=None, help='What country wof:id')
    opt_parser.add_option('-d', '--country_code', dest='country_code', action='store', default=None, help='What WOF (ISO) country code')
    opt_parser.add_option('-s', '--source', dest='wof_source', action='store', default='meso', help='What WOF data source identifier')
    opt_parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Be chatty (default is False)')
    options, args = opt_parser.parse_args()
# setup variables
#
# The WOF placetypes this importer knows how to handle (validated against
# the --placetype option further below).
wof_placetype = ['country', 'region', 'macrocounty', 'county', 'locality', 'neighbourhood']

# Map GeoPlanet placetype names onto WOF placetypes.
geoplanet_placetype = {
    "Country": "country",
    "County": "county",
    # historical: should get a cessation date of today and mz:is_current = 0
    "HistoricalCounty": "county",
    "LocalAdmin": "localadmin",
    "State": "region",
    "Suburb": "neighbourhood",
    "Town": "locality"
}
# Map geonames.org feature codes (gn_fcode) onto WOF placetypes.
# There are only 8 places which don't have a GeoPlanet placetype
# and also don't have a geonames.org placetype (gn_fcode).
# NOTE(review): not referenced in this portion of the script; presumably
# consumed later when classifying input rows -- confirm against the full file.
geonames_org_placetype = {
    "ADM1": "region",
    "ADM1H": "region",      # historical: cessation date = today, mz:is_current = 0
    "ADM2": "county",
    "ADM2H": "county",      # historical: cessation date = today, mz:is_current = 0
    "ADM3": "localadmin",
    "ADM3H": "localadmin",  # historical: cessation date = today, mz:is_current = 0
    "ADM4": "localadmin",
    "ADM4H": "localadmin",  # historical: cessation date = today, mz:is_current = 0
    "ADM5": "localadmin",
    "ADMD": "localadmin",
    "ADMDH": "localadmin",  # historical: cessation date = today, mz:is_current = 0
    "ADMF": "locality",     # gov't building
    "AGRF": "locality",     # ag / farm
    "AIRB": "locality",     # airbase
    "AIRP": "locality",     # airport
    "ANS": "locality",      # ancient site
    "AREA": "locality",     # huh?
    "ATOL": "locality",
    "BAR": "locality",      # this should probably SKIP
    "BAY": "locality",      # this should probably SKIP
    "BLDG": "locality",
    "BLDO": "locality",
    "BP": "locality",       # boundary marker?
    "CH": "locality",       # church
    "CMP": "locality",
    "CMPMN": "locality",
    "CMPRF": "locality",
    "CMTY": "locality",
    "CNLSB": "locality",
    "CNS": "locality",
    "COMC": "locality",
    "CONT": "locality",
    "COVE": "locality",
    "CRTR": "locality",
    "CST": "locality",
    "CSTL": "locality",
    "CSTM": "locality",
    "CTRS": "locality",
    "CULT": "locality",
    "DEVH": "locality",
    "DLTA": "locality",
    "DPOF": "locality",
    "DVD": "locality",
    "EST": "locality",
    "FCL": "locality",
    "FLD": "locality",
    "FRM": "locality",      # farm
    "FRMS": "locality",
    "FRMT": "locality",
    "GHSE": "locality",
    "GRAZ": "locality",
    "HLL": "locality",
    "HMSD": "locality",
    "HSE": "locality",      # house
    "HSEC": "locality",     # houses
    "HSP": "locality",      # hospital
    "HSPC": "locality",
    "HSTS": "locality",
    "HTL": "locality",
    "HUT": "locality",
    "HUTS": "locality",
    "INDS": "locality",
    "ISL": "locality",
    "ISLET": "locality",
    "ISLS": "locality",
    "ISLX": "locality",
    "ITTR": "locality",
    "LCTY": "locality",
    "LEPC": "locality",
    "LK": "locality",
    "LTER": "locality",
    "MALL": "locality",
    "MAR": "locality",
    "MFG": "locality",
    "MKT": "locality",
    "MLSW": "locality",
    "MLWTR": "locality",
    "MSQE": "locality",
    "MSTY": "locality",
    "MT": "locality",
    "MTS": "locality",
    "OBPT": "locality",
    "OCH": "locality",
    "PCL": "country",       # political entity
    "PCLD": "dependency",   # dependent political entity
    "PCLF": "dependency",   # freely associated state
    "PCLH": "country",      # historical political entity: cessation date = today, mz:is_current = 0
    "PCLI": "country",      # independent political entity
    "PCLIX": "region",      # section of independent political entity
    "PCLS": "dependency",   # semi-independent political entity
    "PEN": "locality",
    "PGDA": "locality",
    "PK": "locality",
    "PKS": "locality",
    "PLN": "locality",
    "PPL": "locality",      # basic locality
    "PPLA": "locality",     # region capital -- add new CAPITAL_OF list
    "PPLA2": "locality",    # county capital -- add new CAPITAL_OF list
    "PPLA3": "locality",    # localadmin capital -- add new CAPITAL_OF list
    "PPLA4": "locality",    # localadmin capital -- add new CAPITAL_OF list
    "PPLC": "locality",     # country capital -- add new CAPITAL_OF list
    "PPLF": "locality",     # farm village
    "PPLG": "locality",     # seat of government of a political entity -- add new CAPITAL_OF list
    "PPLH": "locality",     # historical populated place: cessation date = today, mz:is_current = 0
    "PPLL": "locality",     # populated locality
    "PPLQ": "locality",     # abandoned populated place: cessation date = today, mz:is_current = 0
    "PPLR": "locality",     # religious populated place
    "PPLS": "locality",     # populated places
    "PPLW": "locality",     # destroyed populated place: cessation date = today, mz:is_current = 0
    "PPLX": "neighbourhood",
    "PRK": "locality",
    "PRN": "locality",
    "PRSH": "locality",
    "PRT": "locality",
    "PT": "locality",
    "RD": "locality",
    "RDGB": "locality",
    "RDGE": "locality",
    "RECG": "locality",
    "REST": "locality",
    "RESW": "locality",
    "RGN": "locality",
    "RHSE": "locality",
    "RLGR": "locality",
    "RNCH": "locality",
    "RR": "locality",
    "RSD": "locality",
    "RSRT": "locality",
    "RSTN": "locality",
    "RSV": "locality",
    "RUIN": "locality",
    "SAND": "locality",
    "SCH": "locality",
    "SCHC": "locality",
    "SCHT": "locality",
    "SHRN": "locality",
    "SHSU": "locality",
    "SPA": "locality",
    "SQR": "locality",
    "ST": "locality",
    "STDM": "locality",
    "STLMT": "locality",
    "STM": "locality",
    "STMI": "locality",
    "STNB": "locality",
    "STNC": "locality",
    "STNF": "locality",
    "STNM": "locality",
    "STNR": "locality",
    "STRT": "locality",
    "SWMP": "locality",
    "TERR": "locality",
    "TMB": "locality",
    "TMPL": "locality",
    "TOWR": "locality",
    "TRB": "locality",
    "TRIG": "locality",
    "TWO": "locality",
    "UNIV": "locality",
    "VAL": "locality",
    "WAD": "locality",
    "WTRH": "locality",
    "ZN": "locality",
    "ZNB": "locality"
}
# WOF API client.
# NOTE(review): 'KEY...' is an obvious placeholder -- a real OAuth2 access
# token must be supplied before any of the API calls below will succeed.
token = 'KEY...'
api = mapzen.whosonfirst.api.client.OAuth2(token)

# now the main logic: unpack the parsed command-line options
# GeoJSON file to process
dump = os.path.abspath(options.input)
# root directory where exported WOF records are written
source = os.path.abspath(options.output)
exporter = mapzen.whosonfirst.export.flatfile(source)
placetype = options.placetype
skip = options.skip
verbose = options.verbose
# TODO: rewrite this section
# was a valid placetype specified?
if placetype not in wof_placetype:
    raise Exception('boo: placetype fail :( choose one of:', wof_placetype)

# required: the numeric wof:id of the country being imported
try:
    country_id = int(options.country_id)
except (TypeError, ValueError):
    raise Exception('boo: what country are you in, sir?')

# out of 1.3 million records:
# `iso` code is set on all but 250,000 records
# `gn_country` is set for all but 129,000 records
# iso and gn_country match for all but 2375 records
# only 18 don't have either iso or gn_country set
#
# (plain attribute reads on optparse's Values never raise for declared
# options, so the original try/excepts around these were vacuous)
iso_country_code = options.country_code
wof_src_namespace = options.wof_source

# BUG FIX: the original condition was `wof_src_namespace == 'meso' or 'mz'`,
# which is always truthy ('mz' is a non-empty string), so the qspg branch
# below could never execute.  Test membership instead.
if wof_src_namespace in ('meso', 'mz'):
    # columns added by the MapShaper pre-processing step (see header notes)
    src_lat = 'mps_y'
    src_lng = 'mps_x'
    src_label = 'mapshaper'
    admin_1_name_prop = 'meso:admin_1'
else:
    # assumes Quattroshapes point gazetteer (which is WRONG)
    src_lat = 'lat'
    src_lng = 'lng'
    src_label = 'qspg'
    admin_1_name_prop = 'qspg:name_adm1'
# Read and parse the input GeoJSON FeatureCollection.
# BUG FIX: the original opened the file without ever closing it; a context
# manager releases the handle as soon as parsing finishes.
with open(dump, 'r') as fh:
    data = geojson.load(fh)
# Input columns that should never be copied onto the WOF record.
property_key_skip = [
    "qs_id",       # empty
    "featurecla",  # empty
    "HASC",
    "NAME_EN",
    "mps_x",
    "mps_y",
    "SOURCE"
]
# Shapefile column names are capped at 10 UPPERCASE characters; remap the
# known input columns onto proper WOF property names.  Any property whose
# key is not listed here is skipped by the remap pass (though it is still
# copied verbatim into the source namespace later on).
property_key_remap = {
    "NAME_EN": "name:eng_x_preferred",
    "NAME_LOCAL": "name:und_x_variant",
    "NAME_LOC": "name:und_x_variant",
    "NAME_ALT": "name:und_x_variant",
    "NAME_ALT1": "name:und_x_variant",
    "NAME_ALT2": "name:und_x_variant",
    "GAUL_ADMIN": "name:eng_x_variant",
    "GADM_ADMIN": "name:eng_x_variant",
    "GADM_HASC2": "wof:concordances:hasc:id",
    "HASC": "wof:concordances:hasc:id",
    #"SOURCE": "src:geom",
    "mps:latitude": "ms:latitude",
    # BUG FIX: the original listed "mps:latitude" twice (Python silently
    # keeps only the last duplicate key); the second entry was clearly
    # intended to be the longitude mapping.
    "mps:longitude": "ms:longitude",
    # quattroshapes point gazetteer re-up
    "name": "name:eng_x_preferred",
    "gn_id": "wof:concordances:gn_id",
    "woe_id": "wof:concordances:woe_id",
    #"gn_id_eh": "wof:concordances:gn_id_eh",
    #"woe_id_eh": "wof:concordances:woe_id_eh",
    "gn_name": "name:und_x_variant",
    "gn_ascii": "name:und_x_variant",
    "woe_name": "name:und_x_variant",
    "woe_nameen": "name:eng_x_preferred",
    "qs_maybe": "qspg_id"
}
# Input columns whose values are integers.
# NOTE(review): not referenced in this portion of the script; presumably
# used later when coercing exported values -- confirm against the full file.
keys_with_integer_values = [
    "OBJECTID",
    "qs_id",
    "gn_id",
    "woe_id",
    "gn_id_eh",
    "woe_id_eh",
    "scalerank",
    "natscale",
    "adm0cap",
    "worldcity",
    "megacity",
    "metro_core",
    "micro_core",
    "gn_pop",
    "parent_id",
    "woe_local",
    "woe_lau",
    "woe_adm2",
    "woe_adm1",
    "woe_adm0",
    "gns_id",
    "photos",
    "photos_all",
    "woemembers",
    "photos_1k",
    "photos_9k",
    "photos_sr",
    "photos_9r",
    "pop_sr",
    "temp_id",
    "qs_maybe"
]
# Normalize known-bad source identifiers found in the input data.
source_name_remap = {
    "AOTM": "meso"
}
# WOF property key under which the geometry source is recorded.
wof_src_key = "src:geom"
# Sentinel values that shapefile columns use to mean "no data" --
# conveniently returned in a number of non-empty forms; this is why
# we can't have nice things.
no_data_vals = [
    "NO DATA",
    "0",
    ""
]
# wof:ids already matched to an input feature, so a feature is not reused
# as a concordance target twice.
matched_feature_ids = []
# One-shot latches so each class of warning is printed only once.
first_property_key_remap_warning = True
first_read_language_code_from_data_warning = True
first_mapshaper_warning = True
# Progress bookkeeping for the per-feature loop that follows.
feature_counter = 1
total_features = len(data['features'])
# for each feature in GeoJSON feature collection | |
for _f in data['features']: | |
#sys.stdout.write( str(feature_counter) + ' of ' + str(total_features) ) | |
print str(feature_counter) + ' of ' + str(total_features) | |
# print "%s of %s" % (feature_counter, total_features) | |
feature_counter += 1 | |
# | |
# SKIP this part since we're importing POLYGONS and we're | |
# expecting explicate mz_lat, mz_lng properties | |
# | |
## store the feature's geometry | |
#geom = _f['geometry'] | |
## store the feature's coordinates object (lat, lng) | |
#coords = geom['coordinates'] | |
## make that easier to work with later (for assigning label position) | |
#lon, lat = coords | |
# now for the feature's properties | |
_p = _f['properties'] | |
wof_id_from_data_file_boolean = False | |
# in some cases we might already have a WOF_ID, but probably not | |
try: | |
# we prefer wof:id in this format | |
wofid = _p['wof:id'] | |
# but sometimes history | |
wofid = _p.get('WOF_ID', wofid) | |
# if there is no WOF_ID, set it to zero | |
if wofid == None: | |
wofid = 0 | |
# else parse it as an integer so WOF is happy | |
else: | |
wofid = int(wofid) | |
wof_id_from_data_file_boolean = True | |
except: | |
wofid = 0 | |
# for most the data that doens't have a WOF_ID yet | |
# create a new feature that is the imported feature | |
if wofid == 0: | |
# set the geom and properties to match the imported feature | |
feature = _f | |
#"iso" : "", | |
#"gn_country" : "", | |
#"name_adm1" : "", | |
#"name_adm0" : "", | |
# add required WOF placetype | |
# TODO: This smells like we should ALWAYS set this, even if feature already exists | |
props = { | |
# we add wof:id later | |
# we add wof:name later | |
'wof:placetype': placetype, | |
'wof:hierarchy': [ | |
{ 'country_id': country_id } | |
], | |
'wof:country': iso_country_code | |
# ASSUMPTION: placetypes are added country, then region, then county, then, locality, then neighbourhood, etc | |
# TODO: we are missing parent_id | |
# The default for unknown parent_id is -1, but really we should PIP these things to find it out | |
} | |
# default the geometry source to the CL opt_parser value | |
# (but this is overwriten below as data-driven per feature) | |
if wof_src_namespace: | |
props[wof_src_key] = wof_src_namespace | |
# else we alreday have a WOF feature, so load that up and let's modify it | |
else: | |
feature = mapzen.whosonfirst.utils.load([source], wofid) | |
props = feature['properties'] | |
# set WOF:name based on what field? | |
if "name" in _p: | |
wof_name_key = "name" | |
elif "NAME_EN" in _p: | |
wof_name_key = "NAME_EN" | |
elif "NAME_LOCAL" in _p: | |
wof_name_key = "NAME_LOCAL" | |
elif "NAME_LOC" in _p: | |
wof_name_key = "NAME_LOC" | |
else: | |
raise Exception, ('boo: data needs either name, NAME_EN, or NAME_LOCAL fields') | |
# now selectively add data properties from import_via_mapshaper file | |
# making sure to sanitize their property (key) names | |
# NOTE: all other properties not in the property_key_remap list will be skipped | |
try: | |
for orig_key, sanitized_key in property_key_remap.items(): | |
#print "%s, %s" % (orig_key, sanitized_key) | |
#try: | |
# print "\t%s: %s" % (orig_key, _p[ orig_key ]) | |
#except: | |
# continue | |
try: | |
if _p[ orig_key ] == None: | |
# continue | |
props[ sanitized_key ] = '' | |
else: | |
# | |
# Special WOF-ism | |
# | |
#print "key: %s, value: %s" % (orig_key, _p[ orig_key ]) | |
# store the WOF name | |
if orig_key == wof_name_key: | |
if _p[ orig_key ] in no_data_vals: | |
props[ "wof:name" ] = "" | |
else: | |
props[ "wof:name" ] = unicode(_p[ orig_key ]) | |
print '1: props[ "wof:name" ]: %s' % (props[ "wof:name" ],) | |
if verbose: | |
print " wof:name being evaluted (for import or concordance): " + props[ "wof:name" ] + " (" + _p[ orig_key ] + ")" | |
# if the data has a source specified, try to be data driven | |
if _p[ sanitized_key ] == wof_src_key: | |
# make it easier to read the later logic | |
input_src_value = _p[ orig_key ] | |
# but sometimes the data values are bad | |
if input_src_value in source_name_remap: | |
# set the final property to the remapped value | |
props[ sanitized_key ] = source_name_remap[ input_src_value ] | |
else: | |
# In this case don't take the unicode of the input src as WOF sources are only ascii7 | |
props[ sanitized_key ] = input_src_value | |
print '2: props[ "wof:name" ]: %s' % (props[ "wof:name" ],) | |
# | |
# For the explicate property lookups in property_key_remap | |
# | |
# | |
# names are lists | |
if ("name:" in sanitized_key): | |
# skip this key if the data is bad, but if it's good, do the following | |
if _p[ orig_key ] not in no_data_vals: | |
# langauge codes must be 3 characters long | |
# https://en.wikipedia.org/wiki/ISO_639 | |
# eg: eng (English), fra (French), deu (Geramn), zho (Taiiwan), vie (Vietnamese), rus (Russian), ukr (Ukrainian) | |
# defualt to und (undefined) langauge code | |
full_language_key = sanitized_key | |
# do we know the language code via DATA? | |
try: | |
# shapefiles can only have column names up to 10 characters | |
# if you know the 3 char language code of the column, specify that | |
# like: foobarlc (where foobar is an existing column name) | |
# what data column should we look for? | |
if len(orig_key) > 8: | |
language_key = orig_key[0:8] + 'LC' | |
else: | |
language_key = orig_key + 'LC' | |
# can we read a 3 char language code from the data? | |
if _p[ language_key ] is not None: | |
if len(_p[ language_key ]) == 3: | |
full_language_key = 'name:' + _p[ language_key ] + '_x_variant' | |
except: | |
if first_read_language_code_from_data_warning: | |
print "problems with determining language codes from data :(" | |
first_read_language_code_from_data_warning = False | |
# | |
# We finally have a vaue to add, with a language code! | |
# | |
# test to see if we already names in that language code | |
langs = props.get(full_language_key, []) | |
# only add unique names | |
if unicode(_p[ orig_key ]) not in langs: | |
langs.append(unicode(_p[ orig_key ])) | |
props[full_language_key] = langs | |
# if we don't have a wof:name, store one of the alts as the name | |
print ' props.get("wof:name"): ' + props.get("wof:name") | |
if props.get("wof:name", "") == "": | |
props["wof:name"] = unicode( _p[ orig_key ] ) | |
# concordances are objects | |
elif sanitized_key.startswith("wof:concordances:"): | |
print "found a concordance" | |
if unicode(_p[ orig_key ]) not in no_data_vals: | |
print "found a concordance: %s" % (_p[ orig_key ],) | |
#concordance_namespace = sanitized_key.rsplit(":", 1) | |
# ignore the first 17 characters, they are only ever "wof:concordances:" | |
concordance_namespace = sanitized_key[17:] | |
k = concordance_namespace | |
v = unicode(_p[ orig_key ]) | |
concordances = props.get("wof:concordances", {}) | |
concordances[ k ] = v | |
props['wof:concordances'] = concordances | |
#everything else is straight properties | |
else: | |
props[ sanitized_key ] = unicode(_p[ orig_key ]) | |
except: | |
if first_property_key_remap_warning: | |
print '\t' + orig_key + ' not found in import, skipping' | |
first_property_key_remap_warning = False | |
except Exception, e: | |
print pprint.pformat(_p) | |
raise Exception, e | |
# always record the original values into a new namespace (specified in CL arguement) | |
try: | |
for prop_key, prop_val in _p.items(): | |
# only export the k,v when it's not in the black list | |
if prop_key not in property_key_skip and wof_src_namespace is not 'mz': | |
props[ wof_src_namespace + ":" + prop_key.lower() ] = prop_val #unicode( prop_val ) | |
except Exception, e: | |
print pprint.pformat(_p) | |
raise Exception, e | |
# TODO: this should only be set on NEW features, not EXISTING features | |
# someday MapShaper process will be built into exportify, until then... | |
try: | |
props['lbl:latitude'] = _p[ src_lat ] | |
props['lbl:longitude'] = _p[ src_lng ] | |
# TODO: is this right? | |
props['src:lbl:centroid'] = src_label | |
except Exception, e: | |
if first_mapshaper_warning: | |
print 'oops, no LABEL latitude and longitude label values found :(' | |
first_mapshaper_warning = False | |
# print out the properties the feature DOES have | |
print pprint.pformat(_p) | |
# it was missing one of the properties we expected, which one? | |
raise Exception, e | |
if verbose: | |
print props | |
debug_lat = None | |
debug_lng = None | |
debug_area = None | |
num_wof_results = 0 | |
candidate_ids = '' | |
## fail safe: all features need a wof:id before we import them | |
if not props.get('wof:id', False): | |
# TODO: We often know what region a feature is in | |
# add logic to determine the wof:id of that region, and then scope the next piece to that region_id (not country_id) | |
# but if region can't be determined, still use country_id | |
admin_1_name = props.get(admin_1_name_prop, None) | |
region_id = None | |
region_num_wof_results = 0 | |
if admin_1_name: | |
method = 'whosonfirst.places.search' | |
args = {'names': admin_1_name.encode("utf8"), 'placetype': 'region', 'country_id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'} | |
if verbose: | |
print ' search args: ' + unicode(args) | |
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100' | |
## print pprint.pformat(rsp) | |
pages = None | |
page = 1 | |
feature_area = 0 | |
# if you don't have any pages yet, or we have more pages to look at | |
while not pages or page <= pages: | |
args['page'] = page | |
rsp = api.execute_method(method,args) | |
if not pages: | |
pages = rsp['pages'] | |
if pages == 0: | |
break | |
region_num_wof_results += len(rsp['results']) | |
if verbose: | |
print " %d of %d" % (page, pages) | |
print " %s REGION results found for %s" % (region_num_wof_results,props['wof:name']) | |
for row in rsp['results']: | |
## print row | |
if row['wof:placetype'] == 'venue': | |
continue | |
if verbose: | |
print " %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude']) | |
# pick one to set a WOF ID for, else it's still null | |
# NOTE: if you have two points with same area of 0, then it'll choose the last one | |
# | |
# TODO: add validatation that result is "near" the input (0.1 DD or 1.0 DD, etc) | |
# | |
if rsp['total'] == 1 or row['geom:area'] > feature_area: | |
region_id = row['wof:id'] | |
# TODO: look at varient names, all names (and some edit distance ranking) | |
# TODO: look at the lat,lng distances between all these place | |
page+=1 | |
if not props['wof:name'] == '' and not skip: | |
method = 'whosonfirst.places.search' | |
if region_id: | |
args = {'names': props['wof:name'].encode("utf8"), 'placetype': placetype, 'region_id': region_id, 'extras':'geom:area, geom:latitude, geom:longitude'} | |
else: | |
args = {'names': props['wof:name'].encode("utf8"), 'placetype': placetype, 'country_id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'} | |
if verbose: | |
print ' search args: ' + unicode(args) | |
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100' | |
## print pprint.pformat(rsp) | |
pages = None | |
page = 1 | |
feature_area = 0 | |
candidates = [] | |
# example candidate object | |
#{ | |
# id = None | |
# name = None | |
# lat = None | |
# lng = None | |
# area = 0 | |
#} | |
# if you don't have any pages yet, or we have more pages to look at | |
while not pages or page <= pages: | |
args['page'] = page | |
rsp = api.execute_method(method,args) | |
if not pages: | |
pages = rsp['pages'] | |
if pages == 0: | |
break | |
num_wof_results += len(rsp['results']) | |
if verbose: | |
print " %d of %d" % (page, pages) | |
print " %s results found for %s" % (num_wof_results,props['wof:name']) | |
for row in rsp['results']: | |
possible_canidate = False | |
## print row | |
if row['wof:placetype'] == 'venue': | |
continue | |
if verbose: | |
print " %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude']) | |
# TODO: pick one to set a WOF ID for, else it's still null | |
# NOTE: if you have two points with same area of 0, then it'll choose the last one | |
if rsp['total'] == 1: | |
possible_candidate = True | |
# Prefer result with the largest area (or really: polygons over points) | |
if row['geom:area'] > feature_area: | |
#feature_area = row['geom:area'] | |
possible_candidate = True | |
# Prefer exact name matches | |
if row['wof:name'] == props['wof:name']: #.encode("utf8") | |
possible_candidate = True | |
if possible_candidate: | |
candidates.append( { 'id' : row['wof:id'], | |
'name' : row['wof:name'], | |
'lat' : row['geom:latitude'], | |
'lng' : row['geom:longitude'], | |
'area' : row['geom:area'] | |
} ) | |
page+=1 | |
# If we still don't have any candidates, get features of that country of that placetype to evaluate | |
if len(candidates) == 0: | |
# TODO: look at varient names, all names (and some edit distance ranking), and default to exact match if multiple | |
# TODO: if no name, no all names, just get all the county children and do edit distance | |
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.getDescendants&access_token=06e2ccbf23ab42122963e68c887e87b4&id=85633805&placetype=county&page=1&per_page=100' | |
method = 'whosonfirst.places.getDescendants' | |
if region_id: | |
args = {'placetype': placetype, 'id': region_id, 'extras':'geom:area, geom:latitude, geom:longitude'} | |
else: | |
args = {'placetype': placetype, 'id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'} | |
if verbose: | |
print ' descendant args: ' + unicode(args) | |
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100' | |
## print pprint.pformat(rsp) | |
pages = None | |
page = 1 | |
feature_area = 0 | |
# if you don't have any pages yet, or we have more pages to look at | |
while not pages or page <= pages: | |
args['page'] = page | |
rsp = api.execute_method(method,args) | |
if not pages: | |
pages = rsp['pages'] | |
if pages == 0: | |
break | |
num_wof_results += len(rsp['results']) | |
if verbose: | |
print " %d of %d" % (page, pages) | |
print " %s results found for %s" % (num_wof_results,props['wof:name']) | |
for row in rsp['results']: | |
possible_canidate = False | |
## print row | |
if row['wof:placetype'] == 'venue': | |
continue | |
if verbose: | |
print " %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude']) | |
# Name edit distance | |
leve_dist = editdistance.eval( row[ 'wof:name' ], props['wof:name']) | |
leve_dist_length1 = len( row[ 'wof:name' ] ) | |
leve_dist_length2 = len( props['wof:name'] ) | |
# converts the edit distance into a percentage based on the actual edit distance and the length of the inputs | |
leve_rank = (leve_dist * 1.0) / max( leve_dist_length1, leve_dist_length2 ) | |
# DEBUG | |
#print leve_rank, leve_dist, max( leve_dist_length1, leve_dist_length2 ), row['wof:name'], props['wof:name'] | |
if leve_rank < 0.3: | |
possible_canidate = True | |
#TODO | |
# If it's within 1 decimal degree of our input search, then append it to candiate list, else ignore it | |
if possible_canidate: | |
candidates.append( { 'id' : row['wof:id'], | |
'name' : row['wof:name'], | |
'lat' : row['geom:latitude'], | |
'lng' : row['geom:longitude'], | |
'area' : row['geom:area'] | |
} ) | |
page+=1 | |
# after we've reviewed all possible candidates, let's pick one | |
if len(candidates) > 0: | |
#for loop to pick the best one | |
best_candidate = None | |
# TODO | |
print str(len(candidates)) + ' candidates being evaluated...' | |
#print candidates | |
# DEBUG | |
#best_candidate = candidates[0] | |
for candidate in candidates: | |
# for debug, we want to know which other candidates it might have | |
# matched to to QA if the script did the right thing | |
# (without having to manually recreate the script) | |
if len(candidate_ids) > 0: | |
candidate_ids = candidate_ids + ',' + str(candidate[ 'id' ]) | |
else: | |
candidate_ids = str(candidate[ 'id' ]) | |
# DEBUG | |
# print candidate | |
# | |
# Scoring section | |
# | |
# Default the score to 0, then add scores based on different factors below | |
candidate[ 'score' ] = 0 | |
# Prefer exact names, with area | |
if candidate[ 'name' ] == props['wof:name'] and candidate[ 'area' ] > 0: | |
candidate[ 'score' ] += 1 | |
if candidate[ 'name' ] == props['wof:name']: | |
candidate[ 'score' ] += 1 | |
# Name edit distance | |
leve_dist = editdistance.eval( candidate[ 'name' ], props['wof:name']) | |
leve_dist_length1 = len( candidate[ 'name' ] ) | |
leve_dist_length2 = len( props['wof:name'] ) | |
# converts the edit distance into a percentage based on the actual edit distance and the length of the inputs | |
leve_rank = (leve_dist * 1.0) / max( leve_dist_length1, leve_dist_length2 ) | |
candidate['leve_dist'] = leve_rank | |
if leve_rank < 0.3 and candidate[ 'area' ] > 0: | |
candidate[ 'score' ] += 0.3 | |
if leve_rank < 0.3: | |
candidate[ 'score' ] += 0.3 | |
# Prefer features with area | |
if candidate[ 'area' ] > 0: | |
candidate[ 'score' ] += 1 | |
# TODO: else prefer most similar names, that have area | |
# In New Zealand, MESO Waikato District is mapping to WOF Waikato District, but MESO South Waikato District is also mapping to WOF Waikato District, and shouldn't | |
# TODO: else look at the lat,lng distances between all these place | |
# if we have a winner, set best_candidate to that winner | |
#TODO | |
# sort the canidates by score, and choose the 1st one | |
candidates_sorted = sorted(candidates, key=lambda x: x['score'], reverse=True) | |
# make sure that we're not reusing the same candidate over and over | |
# todo: keep going until you've looked at all the possible canidates | |
if candidates_sorted[0] not in matched_feature_ids: | |
# set best to 1st in sorted list | |
best_candidate = candidates_sorted[0] | |
if verbose: | |
print "candidates_sorted" | |
print candidates_sorted | |
print "best_candidate" | |
print best_candidate | |
        # we have a winner: copy its WOF id onto the feature's props and
        # stash its lat/lng/area for the optional debug output file
        if best_candidate:
            props['wof:id'] = best_candidate[ 'id' ]
            debug_lat = best_candidate[ 'lat' ]
            debug_lng = best_candidate[ 'lng' ]
            debug_area = best_candidate[ 'area' ]
        else:
            # no usable candidate: this feature gets no concordance
            # TODO: get smarter about searching around me in the raw data on the local machine?
            if skip:
                print " ASSUMING NO CONCORDANCE exists, per your skip request"
            else:
                print " SKIPPING WOF CONCORDANCE for feature, it has no name :("
        if verbose:
            # NOTE(review): %d assumes props['wof:id'] is an int or the -1
            # default; if upstream ever stored None under that key this would
            # raise TypeError — confirm against the input-sanitizing code above
            print " %d %s" % (props.get('wof:id', -1), props.get('wof:name', None))
# we previously set the feature's geometry based on the import | |
# now also set the feature's properties based on the sanitized props | |
feature['properties'] = props | |
# mapzen.whosonfirst.pip.utils.append_hierarchy_and_parent(feature,data_root="...") | |
# this will allow us to append hierarchies to our new mesoshapes | |
# print props | |
# print pprint.pformat(props) | |
#print feature | |
# TODO: take advantage of parent_id to load that feature and copy append all of it's hierarchy onto this new record | |
print candidate_ids | |
# ids are stored in a string that is comma delim | |
num_canidates = candidate_ids.count(',') | |
# but if there is only a single candidate there is no delim, so test for that case | |
if num_canidates == 0 and len(candidate_ids) > 0: | |
num_canidates = 1 | |
if props.get('wof:id') in matched_feature_ids: | |
print '\toops, already used %s on another feature' % (props.get('wof:id'),) | |
if wof_id_from_data_file_boolean: | |
print '\toops, you specified a duplicate WOF ID in your input data file: %s' %s (props['wof:id'],) | |
# dont' reuse something again | |
props['wof:id'] = None | |
else: | |
# mark it so it's not reused in later features in this run | |
matched_feature_ids.append( props.get('wof:id') ) | |
        # Did we find a concordance for this feature?
        # If yes: load the existing WOF record, preserve its old geometry as an
        # alt file, import the new geometry, and merge the sanitized props in.
        # If no: export the feature as a brand-new WOF record (see else below).
        if props.get('wof:id'):
            print ' found exising WOF record! %s' % (props.get('wof:id'),)
            # TODO: make this path a command line argument (repeat)
            existing_feature = mapzen.whosonfirst.utils.load(source, props.get('wof:id'))
            #print pprint.pformat(feature['properties'])
            #
            # Preserve existing geometry as an alt
            #
            #
            # filename template
            # (taken care of by exporter.export_alt_feature)
            #
            # 85922583-alt-mapzen.geojson
            #
            #
            # properties template
            # (our responsibility)
            #
            # "properties":{
            #   "src:geom":"mapzen",
            #   "wof:id":85922583
            #}
            alt_geom = {'type': 'Feature'}
            # what geom is the existing feature src from?
            src_geom_namespace = existing_feature['properties'].get('src:geom', None)
            # some data was imported incorrectly (which is sad) and lacks
            # src:geom; the presence of a qs:name implies quattroshapes origin
            if not src_geom_namespace:
                if existing_feature['properties'].get('qs:name', None):
                    src_geom_namespace = 'quattroshapes'
            # OPTION 1
            # If the existing record's geometry already came from meso, a second
            # meso feature mapping to it is assumed to be a distinct new place.
            if src_geom_namespace == 'meso':
                print " hmmm, we've been around this corner before. let's treat 2nd viewing as new WOF feature to add"
                #housekeeping --- really this should be the newly minted WOF:id???
                feature['properties']['wof:id'] = None
                # This will definitely result in a few scrambled records
                # You can look at the debug files counting by wof_id candidate to figure out which to manually review
                exporter.export_feature(feature)
                #skip further logic in loop
                continue
            # get the list of existing alt-geom sources, else create an empty list
            alt_geom_list = existing_feature['properties'].get('src:geom_alt', [])
            if src_geom_namespace:
                # write the old geometry out as an alt file named after its source
                alt_geom['properties'] = {
                    'wof:id': props.get('wof:id'),
                    'src:geom': str(src_geom_namespace).lower()
                }
                #print str(alt_geom['properties'])
                alt_geom['geometry'] = existing_feature['geometry']
                # TODO: why this not write out file?
                alt_path = exporter.export_alt_feature(alt_geom, alt=alt_geom['properties']['src:geom'])
                print "EXPORTED %s (%s)" % (alt_path, alt_geom['properties']['src:geom'])
                # document that we created an alt geom
                # NOTE(review): this appends unconditionally — re-running could
                # duplicate entries in src:geom_alt; confirm whether that matters
                alt_geom_list.append( alt_geom['properties']['src:geom'] )
                #print 'alts: ' + str(alt_geom_list)
                # record that new list onto the existing feature
                existing_feature['properties']['src:geom_alt'] = alt_geom_list
                #print existing_feature
            else:
                print " oops, not valid WOF record, missing src:geom property, skipping alt-geom creation"
            #
            # Now the primary WOF record
            #
            # import the new (meso) geometry onto the existing feature
            existing_feature['geometry'] = _f['geometry']
            # Merge the sanitized props into the existing feature's properties.
            # Merge strategy depends on the value's type (list / dict / scalar).
            for k, v in props.items():
                # eg: wof:belongsto
                #print "key: %s (%s)" % (k,type(k))
                #print "new value: %s (%s)" % (v,type(v))
                #try:
                #    if existing_feature['properties'][k]:
                #        print "old value: %s (%s)" % (existing_feature['properties'][k],type(existing_feature['properties'][k]))
                #except:
                #    print "old value: n/a"
                # eg: wof:belongsto or wof:hierarchy — lists are merged,
                # not replaced, so existing entries are preserved
                if type(v) == type(list()):
                    new = existing_feature['properties'].get(k, [])
                    # crazy multiple hierarchy: wof:hierarchy is a list of dicts;
                    # update only the first hierarchy dict in place
                    if k == 'wof:hierarchy' and len(new) > 0:
                        if type(new[0]) == type(dict()):
                            # just update the first hierarchy
                            for dict_k, dict_v in v[0].items():
                                new[0][dict_k] = dict_v
                    # else assume sanity: append any values not already present
                    else:
                        for val_in_list in v:
                            if val_in_list not in new:
                                new.append(val_in_list)
                    existing_feature['properties'][k] = new
                    #print 'again new new : %s' % (existing_feature['properties'][k],)
                # eg: wof:concordances — dicts are merged key-by-key,
                # new values winning over old ones
                elif type(v) == type(dict()):
                    new = existing_feature['properties'].get(k, {})
                    #print 'new new new : %s' % (new,)
                    for dict_k, dict_v in v.items():
                        #print '    %s, %s' % (dict_k,dict_v)
                        new[dict_k] = dict_v
                    existing_feature['properties'][k] = new
                    #print 'again new new : %s' % (existing_feature['properties'][k],)
                # scalar values, eg: wof:name
                else:
                    if k == "wof:name":
                        # never blank out an existing name with an empty import name
                        if v != "":
                            existing_feature['properties'][k] = v
                    else:
                        # meso:wof_id / meso:wof:id are import bookkeeping keys
                        # and are deliberately not copied onto the WOF record
                        if k != "meso:wof_id" and k != "meso:wof:id":
                            existing_feature['properties'][k] = v
            #print pprint.pformat(existing_feature['properties'])
            # now export the modified record
            exporter.export_feature(existing_feature)
        # If not, export as new feature
        else:
            print ' adding new feature to WOF'
            exporter.export_feature(feature)
        # which input records had what results? Stamp debug fields onto the
        # original input feature (_f) so the debug output file can be reviewed.
        # NOTE: this needs to be after features are exported as we're modifying the original
        # feature (which is also the exported feature)
        if options.debug and num_canidates > 0:
            _f['properties']['debug_wof_id'] = candidate_ids
            _f['properties']['debug_wof_candidates'] = num_canidates
            _f['properties']['debug_wof_results'] = num_wof_results
            _f['properties']['debug_wof_region_id'] = region_id
            _f['properties']['debug_wof_region_results'] = region_num_wof_results
            # NOTE(review): debug_lat/lng/area are only assigned when a
            # best_candidate won above — if candidates existed but none was
            # chosen, these may be unbound or hold values from a previous
            # loop iteration; confirm they are initialized before the loop
            _f['properties']['debug_lat'] = debug_lat
            _f['properties']['debug_lng'] = debug_lng
            _f['properties']['debug_area'] = debug_area
# now that we've processed every record, for the set should we | |
# write out debug file (which now has new 'debug_wof_id' key, value pair) | |
if options.debug: | |
debug_path = os.path.abspath(options.debug) | |
with open(debug_path, 'w') as debug_outfile: | |
json.dump(data, debug_outfile) | |
# OPTION 2 | |
# Loop thru the debug file (data) again | |
# Looking for wof_id values that are not unique | |
# and operate on them: choose the "best", ditch the rest, probably making new records for that input. | |
## next step - figure out how to exclude by placetype. | |
## see if we complete a dry run, stash the diffs, etc.. basically do a dry run of this script to see how we're doing. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment