Skip to content

Instantly share code, notes, and snippets.

@ihor-nahuliak
Forked from stepps00/import_meso.py
Created October 1, 2018 03:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ihor-nahuliak/6acdac8c5dfcaa56aa23ec65be57ecf9 to your computer and use it in GitHub Desktop.
Save ihor-nahuliak/6acdac8c5dfcaa56aa23ec65be57ecf9 to your computer and use it in GitHub Desktop.
Import script to add mesoshapes to WOF
#!/usr/bin/env python
# run original SHP file thru MapShaper to add label position (mps_y, mps_x) columns
#
# first add label position via mapshaper:
# mapshaper input.shp encoding=utf8 -each 'mps_x=$.innerX, mps_y=$.innerY' -o import_via_mapshaper.shp
# full example:
# mapshaper /usr/local/mapzen/countries/Chile/Admin_2/Chile_admin2.shp encoding=utf8 -each 'mps_x=$.innerX, mps_y=$.innerY' -o chile_adm2_via_mapshaper.shp
#
# now convert that SHP to GeoJSON format, which is easier to load into Python
# ogr2ogr -F GeoJSON converted.geojson import_via_mapshaper.shp
# full example:
# ogr2ogr -F GeoJSON chile_adm2_via_mapshaper.geojson chile_adm2_via_mapshaper.shp
#
# then run this script, passing every value as an option (positional arguments are ignored), like:
# python apply_wof_id_to_martin_shapes_using_wof_api.py -i chile_adm2_via_mapshaper.geojson -p county -c 85633057 -d CL -s meso
import sys
import os
import logging
import optparse
import json
import geojson
import pprint
import mapzen.whosonfirst.api.client
import mapzen.whosonfirst.utils
import mapzen.whosonfirst.export
import editdistance
logging.basicConfig(level=logging.INFO)
if __name__ == '__main__':
# Command-line interface. Every value is passed as an option, e.g.:
# python apply_wof_id_to_martin_shapes_using_wof_api.py -i chile_adm2_via_mapshaper.geojson -p county -c 85633057 -d CL -s meso
opt_parser = optparse.OptionParser()
# (flags, add_option settings) pairs, registered in this order so the
# generated --help listing keeps its layout
_option_specs = (
    (('-i', '--input'), {'dest': 'input', 'action': 'store', 'default': None,
                         'help': 'Where to read GeoJSON import file from'}),
    (('-o', '--output'), {'dest': 'output', 'action': 'store',
                          'default': "/usr/local/mapzen/whosonfirst-data/data",
                          'help': 'Where to write WOF records to'}),
    (('-g', '--debug'), {'dest': 'debug', 'action': 'store', 'default': None,
                         'help': 'Where to write debug GeoJSON (with wof:id added)'}),
    (('-k', '--skip-wof-api'), {'dest': 'skip', 'action': 'store_true', 'default': None,
                                'help': 'Skip running WOF API to look for existing features.'}),
    (('-p', '--placetype'), {'dest': 'placetype', 'action': 'store', 'default': None,
                             'help': 'What WOF placetype'}),
    (('-c', '--country_id'), {'dest': 'country_id', 'action': 'store', 'default': None,
                              'help': 'What country wof:id'}),
    (('-d', '--country_code'), {'dest': 'country_code', 'action': 'store', 'default': None,
                                'help': 'What WOF (ISO) country code'}),
    (('-s', '--source'), {'dest': 'wof_source', 'action': 'store', 'default': 'meso',
                          'help': 'What WOF data source identifier'}),
    (('-v', '--verbose'), {'dest': 'verbose', 'action': 'store_true', 'default': False,
                           'help': 'Be chatty (default is False)'}),
)
for _flags, _settings in _option_specs:
    opt_parser.add_option(*_flags, **_settings)
# options holds the parsed option values; args holds any leftover positionals
options, args = opt_parser.parse_args()
# calling the file
#raise Exception, "Y U RUN THIS? Usage: python scriptname.py import_via_mapshaper.geojson wof_placetype country_id namespace"
# setup variables
# WOF placetypes accepted for the --placetype option
wof_placetype = [
    'country',
    'region',
    'macrocounty',
    'county',
    'locality',
    'neighbourhood',
]
# GeoPlanet placetype name -> WOF placetype
geoplanet_placetype = dict(
    Country="country",
    County="county",
    # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
    HistoricalCounty="county",
    LocalAdmin="localadmin",
    State="region",
    Suburb="neighbourhood",
    Town="locality",
)
# there are only 8 places which don't have a geoplanet placetype
# and also don't have a geonames.org placetype (gn_fcode)
#
# The geonames.org feature codes are listed once per WOF placetype they
# map to; geonames_org_placetype (feature code -> WOF placetype) is built
# by inverting this grouping.
# "CESSATION" below is shorthand for: this should be marked CESSATION DATE
# with today's date, and MZ:IS_CURRENT = 0
_gn_fcodes_by_placetype = {
    "region": [
        "ADM1",
        "ADM1H",  # CESSATION
        "PCLIX",  # section of independent political entity
    ],
    "county": [
        "ADM2",
        "ADM2H",  # CESSATION
    ],
    "localadmin": [
        "ADM3",
        "ADM3H",  # CESSATION
        "ADM4",
        "ADM4H",  # CESSATION
        "ADM5",
        "ADMD",
        "ADMDH",  # CESSATION
    ],
    "country": [
        "PCL",  # political entity
        "PCLH",  # historical political entity, CESSATION
        "PCLI",  # independent political entity
    ],
    "dependency": [
        "PCLD",  # dependent political entity
        "PCLF",  # freely associated state
        "PCLS",  # semi-independent political entity
    ],
    "neighbourhood": [
        "PPLX",
    ],
    "locality": [
        "ADMF",  # gov't building
        "AGRF",  # ag / farm
        "AIRB",  # airbase
        "AIRP",  # airport
        "ANS",  # ancient site
        "AREA",  # huh?
        "ATOL",
        "BAR",  # this should probably SKIP
        "BAY",  # this should probably SKIP
        "BLDG", "BLDO",
        "BP",  # boundary marker?
        "CH",  # church
        "CMP", "CMPMN", "CMPRF", "CMTY", "CNLSB", "CNS", "COMC", "CONT",
        "COVE", "CRTR", "CST", "CSTL", "CSTM", "CTRS", "CULT", "DEVH",
        "DLTA", "DPOF", "DVD", "EST", "FCL", "FLD",
        "FRM",  # farm
        "FRMS", "FRMT", "GHSE", "GRAZ", "HLL", "HMSD",
        "HSE",  # house
        "HSEC",  # houses
        "HSP",  # hospital
        "HSPC", "HSTS", "HTL", "HUT", "HUTS", "INDS", "ISL", "ISLET",
        "ISLS", "ISLX", "ITTR", "LCTY", "LEPC", "LK", "LTER", "MALL",
        "MAR", "MFG", "MKT", "MLSW", "MLWTR", "MSQE", "MSTY", "MT",
        "MTS", "OBPT", "OCH",
        "PEN", "PGDA", "PK", "PKS", "PLN",
        "PPL",  # basic locality
        "PPLA",  # region capital add new CAPITAL_OF list
        "PPLA2",  # county capital add new CAPITAL_OF list
        "PPLA3",  # localadmin capital add new CAPITAL_OF list
        "PPLA4",  # localadmin capital add new CAPITAL_OF list
        "PPLC",  # country capital add new CAPITAL_OF list
        "PPLF",  # farm village
        "PPLG",  # seat of government of a political entity add new CAPITAL_OF list
        "PPLH",  # historical populated place, CESSATION
        "PPLL",  # populated locality
        "PPLQ",  # abandoned populated place, CESSATION
        "PPLR",  # religious populated place
        "PPLS",  # populated places
        "PPLW",  # destroyed populated place, CESSATION
        "PRK", "PRN", "PRSH", "PRT", "PT", "RD", "RDGB", "RDGE",
        "RECG", "REST", "RESW", "RGN", "RHSE", "RLGR", "RNCH", "RR",
        "RSD", "RSRT", "RSTN", "RSV", "RUIN", "SAND", "SCH", "SCHC",
        "SCHT", "SHRN", "SHSU", "SPA", "SQR", "ST", "STDM", "STLMT",
        "STM", "STMI", "STNB", "STNC", "STNF", "STNM", "STNR", "STRT",
        "SWMP", "TERR", "TMB", "TMPL", "TOWR", "TRB", "TRIG", "TWO",
        "UNIV", "VAL", "WAD", "WTRH", "ZN", "ZNB",
    ],
}
geonames_org_placetype = dict(
    (_fcode, _wof_type)
    for _wof_type, _fcodes in _gn_fcodes_by_placetype.items()
    for _fcode in _fcodes
)
# Refuse to start without an input file: os.path.abspath(None) below would
# otherwise abort with a traceback that does not mention the missing option.
# opt_parser.error() prints the usage text and exits with status 2.
if not options.input:
    opt_parser.error('an input GeoJSON file is required (-i/--input)')
# NOTE(review): 'KEY...' looks like a placeholder for a real WOF API access
# token -- confirm how the token is meant to be supplied before running.
token = 'KEY...'
api = mapzen.whosonfirst.api.client.OAuth2(token)
# now the main logic
# read from the command-line options
# GeoJSON file to process
dump = os.path.abspath(options.input)
# where to put the results
source = os.path.abspath(options.output)
exporter = mapzen.whosonfirst.export.flatfile(source)
placetype = options.placetype
skip = options.skip
verbose = options.verbose
# TODO: rewrite this section
# was a valid placetype specified?
# Validate the command-line values before any feature is touched.
# Each failure raises Exception with the message shown.
# was a valid placetype specified?
if placetype not in wof_placetype:
    raise Exception('boo: placetype fail :( choose one of:', wof_placetype)
# the country wof:id must be an integer; int(None) raises TypeError when the
# option is missing and int('abc') raises ValueError when it is malformed
try:
    country_id = int(options.country_id)
except (TypeError, ValueError):
    raise Exception('boo: what country are you in, sir?')
# out of 1.3 million records:
# `iso` code is set on all but 250,000 records
# `gn_country` is set for all but 129,000 records
# iso and gn_country match for all but 2375 records
# only 18 don't have either iso or gn_country set
# Reading an option attribute never raises, so the value itself is checked:
# the option defaults to None when it is not given.
iso_country_code = options.country_code
if iso_country_code is None:
    raise Exception('boo: what ISO code does your country have, sir?')
wof_src_namespace = options.wof_source
if wof_src_namespace is None:
    raise Exception('boo: what WOF source namespace are you using, sir?')
# Choose which input columns carry the label position and the admin-1 name.
# A membership test is required here: `x == 'meso' or 'mz'` is always true
# because the non-empty string 'mz' is truthy on its own.
if wof_src_namespace in ('meso', 'mz'):
    # label position columns added by the mapshaper pre-processing step
    src_lat = 'mps_y'
    src_lng = 'mps_x'
    src_label = 'mapshaper'
    admin_1_name_prop = 'meso:admin_1'
else:
    # assumes Quattroshapes point gazetteer (which is WRONG)
    src_lat = 'lat'
    src_lng = 'lng'
    src_label = 'qspg'
    admin_1_name_prop = 'qspg:name_adm1'
# Read the GeoJSON file named on the command line (a text file) and parse
# it into a structured GeoJSON object. The with-block closes the file
# handle as soon as parsing finishes, including when parsing fails.
with open(dump, 'r') as fh:
    data = geojson.load(fh)
# sometimes the input has bad columns to skip; these are left out when the
# original values are copied into the source namespace
property_key_skip = [
    "qs_id",  # empty
    "featurecla",  # empty
    "HASC",
    "NAME_EN",
    "mps_x",
    "mps_y",
    "SOURCE",
]
# sometimes SHP files have stupid 10 char
# UPPER names, make the WOF pretty
# (input column name -> WOF property name)
property_key_remap = {
    "NAME_EN": "name:eng_x_preferred",
    "NAME_LOCAL": "name:und_x_variant",
    "NAME_LOC": "name:und_x_variant",
    "NAME_ALT": "name:und_x_variant",
    "NAME_ALT1": "name:und_x_variant",
    "NAME_ALT2": "name:und_x_variant",
    "GAUL_ADMIN": "name:eng_x_variant",
    "GADM_ADMIN": "name:eng_x_variant",
    "GADM_HASC2": "wof:concordances:hasc:id",
    "HASC": "wof:concordances:hasc:id",
    #"SOURCE": "src:geom",
    "mps:latitude": "ms:latitude",
    # the latitude entry used to be listed twice (a dict literal silently
    # keeps only one); the second entry is the longitude mapping
    "mps:longitude": "ms:longitude",
    # quattroshapes point gazetteer re-up
    "name": "name:eng_x_preferred",
    "gn_id": "wof:concordances:gn_id",
    "woe_id": "wof:concordances:woe_id",
    #"gn_id_eh" : "wof:concordances:gn_id_eh",
    #"woe_id_eh" : "wof:concordances:woe_id_eh",
    "gn_name": "name:und_x_variant",
    "gn_ascii": "name:und_x_variant",
    "woe_name": "name:und_x_variant",
    "woe_nameen": "name:eng_x_preferred",
    "qs_maybe": "qspg_id",
}
# input columns whose values are integers
keys_with_integer_values = (
    "OBJECTID qs_id gn_id woe_id gn_id_eh woe_id_eh "
    "scalerank natscale adm0cap worldcity megacity metro_core micro_core "
    "gn_pop parent_id woe_local woe_lau woe_adm2 woe_adm1 woe_adm0 gns_id "
    "photos photos_all woemembers photos_1k photos_9k photos_sr photos_9r "
    "pop_sr temp_id qs_maybe"
).split()
# source names as found in the data -> the name to record instead
source_name_remap = dict(AOTM="meso")
# WOF property that records where a geometry came from
wof_src_key = "src:geom"
# sometimes the (SHP) keys above contain "no data"
# which is conveniently returned in a number of non-empty
# forms. this is why we can't have nice things.
no_data_vals = ["NO DATA", "0", ""]
# wof:ids already concorded to an imported feature during this run
# (a WOF record must not be matched twice)
matched_feature_ids = list()
# one-shot flags: each kind of warning is printed only the first time it occurs
first_property_key_remap_warning = first_read_language_code_from_data_warning = first_mapshaper_warning = True
# progress reporting: "<feature_counter> of <total_features>"
total_features = len(data['features'])
feature_counter = 1
# for each feature in GeoJSON feature collection
for _f in data['features']:
#sys.stdout.write( str(feature_counter) + ' of ' + str(total_features) )
print str(feature_counter) + ' of ' + str(total_features)
# print "%s of %s" % (feature_counter, total_features)
feature_counter += 1
#
# SKIP this part since we're importing POLYGONS and we're
# expecting explicate mz_lat, mz_lng properties
#
## store the feature's geometry
#geom = _f['geometry']
## store the feature's coordinates object (lat, lng)
#coords = geom['coordinates']
## make that easier to work with later (for assigning label position)
#lon, lat = coords
# now for the feature's properties
_p = _f['properties']
wof_id_from_data_file_boolean = False
# in some cases we might already have a WOF id, but probably not.
# The id is read from 'wof:id', and a legacy 'WOF_ID' column overrides it
# when present. Using .get() for both means a feature that only has
# 'WOF_ID' is no longer treated as having no id at all.
wofid = _p.get('WOF_ID', _p.get('wof:id'))
try:
    # parse it as an integer so WOF is happy
    wofid = int(wofid)
    wof_id_from_data_file_boolean = True
except (TypeError, ValueError):
    # no id (None) or an unparsable value: treat the feature as new
    wofid = 0
# for most the data that doens't have a WOF_ID yet
# create a new feature that is the imported feature
if wofid == 0:
# set the geom and properties to match the imported feature
feature = _f
#"iso" : "",
#"gn_country" : "",
#"name_adm1" : "",
#"name_adm0" : "",
# add required WOF placetype
# TODO: This smells like we should ALWAYS set this, even if feature already exists
props = {
# we add wof:id later
# we add wof:name later
'wof:placetype': placetype,
'wof:hierarchy': [
{ 'country_id': country_id }
],
'wof:country': iso_country_code
# ASSUMPTION: placetypes are added country, then region, then county, then, locality, then neighbourhood, etc
# TODO: we are missing parent_id
# The default for unknown parent_id is -1, but really we should PIP these things to find it out
}
# default the geometry source to the CL opt_parser value
# (but this is overwriten below as data-driven per feature)
if wof_src_namespace:
props[wof_src_key] = wof_src_namespace
# else we alreday have a WOF feature, so load that up and let's modify it
else:
feature = mapzen.whosonfirst.utils.load([source], wofid)
props = feature['properties']
# set WOF:name based on what field?
if "name" in _p:
wof_name_key = "name"
elif "NAME_EN" in _p:
wof_name_key = "NAME_EN"
elif "NAME_LOCAL" in _p:
wof_name_key = "NAME_LOCAL"
elif "NAME_LOC" in _p:
wof_name_key = "NAME_LOC"
else:
raise Exception, ('boo: data needs either name, NAME_EN, or NAME_LOCAL fields')
# now selectively add data properties from import_via_mapshaper file
# making sure to sanitize their property (key) names
# NOTE: all other properties not in the property_key_remap list will be skipped
#
# For every (input column -> WOF property) pair in property_key_remap:
#   * a column missing from this feature is skipped (reported once per run)
#   * a None value only fills the WOF property with '' when nothing is there yet
#   * the column chosen as wof_name_key also sets wof:name
#   * then the value is stored as a source name, a name list entry,
#     a concordance, or a plain property, depending on the WOF property name
# Any unexpected error prints the feature's input properties and is re-raised.
try:
    for orig_key, sanitized_key in property_key_remap.items():
        if orig_key not in _p:
            if first_property_key_remap_warning:
                print('\t' + orig_key + ' not found in import, skipping')
                first_property_key_remap_warning = False
            continue
        orig_value = _p[orig_key]
        if orig_value is None:
            # several columns share one WOF property (e.g. the name variants),
            # so an empty column must not wipe out what another one stored
            props.setdefault(sanitized_key, '')
            continue
        #
        # Special WOF-ism
        #
        # store the WOF name
        if orig_key == wof_name_key:
            if orig_value in no_data_vals:
                props["wof:name"] = ""
            else:
                props["wof:name"] = unicode(orig_value)
            print('1: props[ "wof:name" ]: %s' % (props["wof:name"],))
            if verbose:
                print(" wof:name being evaluted (for import or concordance): %s (%s)" % (props["wof:name"], orig_value))
        # wof:name may not be set yet at this point, hence .get()
        print('2: props[ "wof:name" ]: %s' % (props.get("wof:name"),))
        #
        # For the explicit property lookups in property_key_remap
        #
        if sanitized_key == wof_src_key:
            # if the data has a source specified, try to be data driven,
            # but sometimes the data values are bad, so remap known ones.
            # WOF sources are only ascii7, so the value is deliberately
            # not passed through unicode().
            props[sanitized_key] = source_name_remap.get(orig_value, orig_value)
        elif "name:" in sanitized_key:
            # names are lists
            # skip this key if the data is bad
            if orig_value in no_data_vals:
                continue
            # language codes must be 3 characters long
            # https://en.wikipedia.org/wiki/ISO_639
            # eg: eng (English), fra (French), deu (German), vie (Vietnamese), rus (Russian), ukr (Ukrainian)
            # default to the language code already present in sanitized_key
            full_language_key = sanitized_key
            # shapefiles can only have column names up to 10 characters
            # if you know the 3 char language code of the column, specify that
            # like: foobarLC (where foobar is an existing column name,
            # cut to its first 8 characters when longer)
            language_key = orig_key[0:8] + 'LC'
            # can we read a 3 char language code from the data?
            try:
                language_code = _p[language_key]
                if language_code is not None and len(language_code) == 3:
                    full_language_key = 'name:' + language_code + '_x_variant'
            except (KeyError, TypeError):
                # no such column, or a value that is not a string
                if first_read_language_code_from_data_warning:
                    print("problems with determining language codes from data :(")
                    first_read_language_code_from_data_warning = False
            #
            # We finally have a value to add, with a language code!
            #
            name_value = unicode(orig_value)
            # test to see if we already have names in that language code;
            # tolerate a non-list value left behind under the same key
            langs = props.get(full_language_key, [])
            if not isinstance(langs, list):
                langs = [langs] if langs else []
            # only add unique names
            if name_value not in langs:
                langs.append(name_value)
            props[full_language_key] = langs
            # if we don't have a wof:name, store one of the alts as the name
            print(' props.get("wof:name"): %s' % (props.get("wof:name"),))
            if props.get("wof:name", "") == "":
                props["wof:name"] = name_value
        elif sanitized_key.startswith("wof:concordances:"):
            # concordances are objects
            print("found a concordance")
            if unicode(orig_value) not in no_data_vals:
                print("found a concordance: %s" % (orig_value,))
                # everything after the "wof:concordances:" prefix is the
                # concordance namespace, e.g. "hasc:id"
                concordance_namespace = sanitized_key[len("wof:concordances:"):]
                concordances = props.get("wof:concordances", {})
                concordances[concordance_namespace] = unicode(orig_value)
                props['wof:concordances'] = concordances
        else:
            # everything else is straight properties
            props[sanitized_key] = unicode(orig_value)
except Exception:
    # show the input properties that triggered the failure, then let the
    # original exception (and its traceback) propagate
    print(pprint.pformat(_p))
    raise
# always record the original values into a new namespace (specified in CL argument),
# stored as "<namespace>:<lower-cased column name>"; nothing is recorded for
# the 'mz' namespace. The namespace is compared by value: an identity test
# against a string literal does not reliably match a string read from the
# command line.
try:
    if wof_src_namespace != 'mz':
        for prop_key, prop_val in _p.items():
            # only export the k,v when it's not in the black list
            if prop_key not in property_key_skip:
                props[wof_src_namespace + ":" + prop_key.lower()] = prop_val  #unicode( prop_val )
except Exception:
    # show the input properties that triggered the failure, then let the
    # original exception (and its traceback) propagate
    print(pprint.pformat(_p))
    raise
# TODO: this should only be set on NEW features, not EXISTING features
# someday MapShaper process will be built into exportify, until then...
try:
props['lbl:latitude'] = _p[ src_lat ]
props['lbl:longitude'] = _p[ src_lng ]
# TODO: is this right?
props['src:lbl:centroid'] = src_label
except Exception, e:
if first_mapshaper_warning:
print 'oops, no LABEL latitude and longitude label values found :('
first_mapshaper_warning = False
# print out the properties the feature DOES have
print pprint.pformat(_p)
# it was missing one of the properties we expected, which one?
raise Exception, e
if verbose:
print props
debug_lat = None
debug_lng = None
debug_area = None
num_wof_results = 0
candidate_ids = ''
## fail safe: all features need a wof:id before we import them
if not props.get('wof:id', False):
# TODO: We often know what region a feature is in
# add logic to determine the wof:id of that region, and then scope the next piece to that region_id (not country_id)
# but if region can't be determined, still use country_id
admin_1_name = props.get(admin_1_name_prop, None)
region_id = None
region_num_wof_results = 0
if admin_1_name:
method = 'whosonfirst.places.search'
args = {'names': admin_1_name.encode("utf8"), 'placetype': 'region', 'country_id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
if verbose:
print ' search args: ' + unicode(args)
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100'
## print pprint.pformat(rsp)
pages = None
page = 1
feature_area = 0
# if you don't have any pages yet, or we have more pages to look at
while not pages or page <= pages:
args['page'] = page
rsp = api.execute_method(method,args)
if not pages:
pages = rsp['pages']
if pages == 0:
break
region_num_wof_results += len(rsp['results'])
if verbose:
print " %d of %d" % (page, pages)
print " %s REGION results found for %s" % (region_num_wof_results,props['wof:name'])
for row in rsp['results']:
## print row
if row['wof:placetype'] == 'venue':
continue
if verbose:
print " %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude'])
# pick one to set a WOF ID for, else it's still null
# NOTE: if you have two points with same area of 0, then it'll choose the last one
#
# TODO: add validatation that result is "near" the input (0.1 DD or 1.0 DD, etc)
#
if rsp['total'] == 1 or row['geom:area'] > feature_area:
region_id = row['wof:id']
# TODO: look at varient names, all names (and some edit distance ranking)
# TODO: look at the lat,lng distances between all these place
page+=1
if not props['wof:name'] == '' and not skip:
method = 'whosonfirst.places.search'
if region_id:
args = {'names': props['wof:name'].encode("utf8"), 'placetype': placetype, 'region_id': region_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
else:
args = {'names': props['wof:name'].encode("utf8"), 'placetype': placetype, 'country_id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
if verbose:
print ' search args: ' + unicode(args)
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100'
## print pprint.pformat(rsp)
pages = None
page = 1
feature_area = 0
candidates = []
# example candidate object
#{
# id = None
# name = None
# lat = None
# lng = None
# area = 0
#}
# Page through the name search results and collect match candidates.
# Loop while we don't have a page count yet, or we have more pages to look at.
while not pages or page <= pages:
    args['page'] = page
    rsp = api.execute_method(method, args)
    if not pages:
        pages = rsp['pages']
    if pages == 0:
        break
    num_wof_results += len(rsp['results'])
    if verbose:
        print(" %d of %d" % (page, pages))
        print(" %s results found for %s" % (num_wof_results, props['wof:name']))
    for row in rsp['results']:
        # reset per row, so a match on one row cannot leak into the next
        # (one name is used for the reset and the checks below)
        possible_candidate = False
        if row['wof:placetype'] == 'venue':
            continue
        if verbose:
            print(" %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude']))
        # TODO: pick one to set a WOF ID for, else it's still null
        # the only result of the whole search is always a candidate
        if rsp['total'] == 1:
            possible_candidate = True
        # Prefer results that have an area (or really: polygons over points);
        # feature_area is not updated here, so this compares against its
        # initial value
        if row['geom:area'] > feature_area:
            possible_candidate = True
        # Prefer exact name matches
        if row['wof:name'] == props['wof:name']:
            possible_candidate = True
        if possible_candidate:
            candidates.append({
                'id': row['wof:id'],
                'name': row['wof:name'],
                'lat': row['geom:latitude'],
                'lng': row['geom:longitude'],
                'area': row['geom:area'],
            })
    page += 1
# If we still don't have any candidates, get features of that country of that placetype to evaluate
if len(candidates) == 0:
# TODO: look at varient names, all names (and some edit distance ranking), and default to exact match if multiple
# TODO: if no name, no all names, just get all the county children and do edit distance
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.getDescendants&access_token=06e2ccbf23ab42122963e68c887e87b4&id=85633805&placetype=county&page=1&per_page=100'
method = 'whosonfirst.places.getDescendants'
if region_id:
args = {'placetype': placetype, 'id': region_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
else:
args = {'placetype': placetype, 'id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
if verbose:
print ' descendant args: ' + unicode(args)
#curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100'
## print pprint.pformat(rsp)
pages = None
page = 1
feature_area = 0
# if you don't have any pages yet, or we have more pages to look at
while not pages or page <= pages:
args['page'] = page
rsp = api.execute_method(method,args)
if not pages:
pages = rsp['pages']
if pages == 0:
break
num_wof_results += len(rsp['results'])
if verbose:
print " %d of %d" % (page, pages)
print " %s results found for %s" % (num_wof_results,props['wof:name'])
for row in rsp['results']:
possible_canidate = False
## print row
if row['wof:placetype'] == 'venue':
continue
if verbose:
print " %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude'])
# Name edit distance
leve_dist = editdistance.eval( row[ 'wof:name' ], props['wof:name'])
leve_dist_length1 = len( row[ 'wof:name' ] )
leve_dist_length2 = len( props['wof:name'] )
# converts the edit distance into a percentage based on the actual edit distance and the length of the inputs
leve_rank = (leve_dist * 1.0) / max( leve_dist_length1, leve_dist_length2 )
# DEBUG
#print leve_rank, leve_dist, max( leve_dist_length1, leve_dist_length2 ), row['wof:name'], props['wof:name']
if leve_rank < 0.3:
possible_canidate = True
#TODO
# If it's within 1 decimal degree of our input search, then append it to candiate list, else ignore it
if possible_canidate:
candidates.append( { 'id' : row['wof:id'],
'name' : row['wof:name'],
'lat' : row['geom:latitude'],
'lng' : row['geom:longitude'],
'area' : row['geom:area']
} )
page+=1
# after we've reviewed all possible candidates, let's pick one:
# score every candidate, sort by score, and accept the top one unless its
# wof:id was already matched to another feature in this run
if len(candidates) > 0:
    best_candidate = None
    print(str(len(candidates)) + ' candidates being evaluated...')
    for candidate in candidates:
        # for debug, we want to know which other candidates it might have
        # matched to, to QA if the script did the right thing
        # (without having to manually recreate the script);
        # the ids are kept as one comma delimited string
        if len(candidate_ids) > 0:
            candidate_ids = candidate_ids + ',' + str(candidate['id'])
        else:
            candidate_ids = str(candidate['id'])
        #
        # Scoring section
        #
        # Default the score to 0, then add scores based on different factors below
        candidate['score'] = 0
        # Prefer exact names, with area
        if candidate['name'] == props['wof:name'] and candidate['area'] > 0:
            candidate['score'] += 1
        if candidate['name'] == props['wof:name']:
            candidate['score'] += 1
        # Name edit distance
        leve_dist = editdistance.eval(candidate['name'], props['wof:name'])
        leve_dist_length1 = len(candidate['name'])
        leve_dist_length2 = len(props['wof:name'])
        # converts the edit distance into a ratio of the longer input's length
        leve_rank = (leve_dist * 1.0) / max(leve_dist_length1, leve_dist_length2)
        candidate['leve_dist'] = leve_rank
        if leve_rank < 0.3 and candidate['area'] > 0:
            candidate['score'] += 0.3
        if leve_rank < 0.3:
            candidate['score'] += 0.3
        # Prefer features with area
        if candidate['area'] > 0:
            candidate['score'] += 1
        # TODO: else prefer most similar names, that have area
        # In New Zealand, MESO Waikato District is mapping to WOF Waikato District, but MESO South Waikato District is also mapping to WOF Waikato District, and shouldn't
        # TODO: else look at the lat,lng distances between all these place
    # sort the candidates by score, and choose the 1st one
    candidates_sorted = sorted(candidates, key=lambda x: x['score'], reverse=True)
    # make sure that we're not reusing the same candidate over and over:
    # matched_feature_ids holds wof:ids, so the candidate's id (not the
    # candidate dict itself) is what has to be looked up
    # todo: keep going until you've looked at all the possible candidates
    if candidates_sorted[0]['id'] not in matched_feature_ids:
        # set best to 1st in sorted list
        best_candidate = candidates_sorted[0]
    if verbose:
        print("candidates_sorted")
        print(candidates_sorted)
        print("best_candidate")
        print(best_candidate)
    # we have a winner
    if best_candidate:
        props['wof:id'] = best_candidate['id']
        debug_lat = best_candidate['lat']
        debug_lng = best_candidate['lng']
        debug_area = best_candidate['area']
else:
# TODO: get smarter about searching around me in the raw data on the local machine?
if skip:
print " ASSUMING NO CONCORDANCE exists, per your skip request"
else:
print " SKIPPING WOF CONCORDANCE for feature, it has no name :("
if verbose:
print " %d %s" % (props.get('wof:id', -1), props.get('wof:name', None))
# we previously set the feature's geometry based on the import
# now also set the feature's properties based on the sanitized props
feature['properties'] = props
# mapzen.whosonfirst.pip.utils.append_hierarchy_and_parent(feature,data_root="...")
# this will allow us to append hierarchies to our new mesoshapes
# print props
# print pprint.pformat(props)
#print feature
# TODO: take advantage of parent_id to load that feature and copy append all of it's hierarchy onto this new record
print candidate_ids
# ids are stored in a string that is comma delim
num_canidates = candidate_ids.count(',')
# but if there is only a single candidate there is no delim, so test for that case
if num_canidates == 0 and len(candidate_ids) > 0:
num_canidates = 1
# Guard against concording two imported features to the same WOF record.
# Features without a wof:id are left alone: recording None as "used" would
# report every later id-less feature as a duplicate.
matched_wof_id = props.get('wof:id')
if matched_wof_id is not None and matched_wof_id in matched_feature_ids:
    print('\toops, already used %s on another feature' % (matched_wof_id,))
    if wof_id_from_data_file_boolean:
        print('\toops, you specified a duplicate WOF ID in your input data file: %s' % (matched_wof_id,))
    # don't reuse something again
    props['wof:id'] = None
elif matched_wof_id is not None:
    # mark it so it's not reused in later features in this run
    matched_feature_ids.append(matched_wof_id)
# ---------------------------------------------------------------------------
# Update path: the input feature carries a WOF ID, so an existing record is
# loaded, its current geometry is written out as an "alt" geometry, and the
# record is then updated further down.
# NOTE(review): the scraped source has lost its indentation; the nesting
# described in these comments is inferred from statement order -- confirm
# against the original file.
# ---------------------------------------------------------------------------
# Did we find a concordance for this feature?
# (any truthy props['wof:id'] is treated as a match)
if props.get('wof:id'):
print ' found exising WOF record! %s' % (props.get('wof:id'),)
# TODO: make this path a command line argument (repeat)
# `source` is defined outside this section; it is handed to utils.load
# together with the matched ID to fetch the existing record.
existing_feature = mapzen.whosonfirst.utils.load(source, props.get('wof:id'))
#print pprint.pformat(feature['properties'])
#
# Preserve existing geometry as an alt
#
#
# filename template
# (taken care of by exporter.export_alt_feature)
#
# 85922583-alt-mapzen.geojson
#
#
# properties template
# (our responsibility)
#
# "properties":{
# "src:geom":"mapzen",
# "wof:id":85922583
#}
# Skeleton of the alt-geometry feature; properties and geometry are filled in
# below once the source namespace is known.
alt_geom = {'type': 'Feature'}
# what geom is the existing feature src from?
src_geom_namespace = existing_feature['properties'].get('src:geom', None)
# some data was imported incorrectly (which is sad):
# when src:geom is missing but a qs:name property is present, the geometry
# is attributed to quattroshapes.
if not src_geom_namespace:
if existing_feature['properties'].get('qs:name', None):
src_geom_namespace = 'quattroshapes'
# OPTION 1
# The existing record's geometry source is already 'meso'. The incoming
# feature is then exported as-is with its wof:id cleared (i.e. as a new
# feature) and the rest of this loop iteration is skipped.
if src_geom_namespace == 'meso':
print " hmmm, we've been around this corner before. let's treat 2nd viewing as new WOF feature to add"
# housekeeping --- really this should be the newly minted WOF:id???
feature['properties']['wof:id'] = None
# This will definitely result in a few scrambled records
# You can look at the debug files counting by wof_id candidate to figure out which to manually review
exporter.export_feature(feature)
# skip further logic in loop
# (presumably this also skips the debug_* annotations added later in the
# loop body -- confirm)
continue
# get the list of alt-geometry sources already recorded on the existing
# feature, else create an empty list
alt_geom_list = existing_feature['properties'].get('src:geom_alt', [])
if src_geom_namespace:
# the alt feature carries only the WOF ID and the lower-cased source namespace
alt_geom['properties'] = {
'wof:id': props.get('wof:id'),
'src:geom': str(src_geom_namespace).lower()
}
#print str(alt_geom['properties'])
# the alt keeps the geometry the existing record had before this import
alt_geom['geometry'] = existing_feature['geometry']
# TODO: why this not write out file?
alt_path = exporter.export_alt_feature(alt_geom, alt=alt_geom['properties']['src:geom'])
print "EXPORTED %s (%s)" % (alt_path, alt_geom['properties']['src:geom'])
# document that we created an alt geom
# NOTE(review): the namespace is appended without checking whether it is
# already in the list, so re-running may record it twice -- confirm intent.
alt_geom_list.append( alt_geom['properties']['src:geom'] )
#print 'alts: ' + str(alt_geom_list)
# record that new list onto the existing feature
existing_feature['properties']['src:geom_alt'] = alt_geom_list
#print existing_feature
else:
# no src:geom and no qs:name fallback: nothing to label the alt with
print " oops, not valid WOF record, missing src:geom property, skipping alt-geom creation"
#
# Now the primary WOF record
#
# Overwrite the existing record's geometry with the imported one, merge the
# imported properties into it key by key, then export it. Input features with
# no WOF ID are exported as new features instead (final else below).
# NOTE(review): the scraped source has lost its indentation; the nesting
# described in these comments is inferred from statement order -- confirm
# against the original file.
#
# import the geometry to the existing feature
# (_f is the input record; see the NOTE further down in the loop)
existing_feature['geometry'] = _f['geometry']
# TODO: iterate thru all the props, adding them
# Merge strategy depends on the type of the incoming value:
#   list  -> extend the existing list (special handling for wof:hierarchy)
#   dict  -> update the existing dict key by key
#   other -> overwrite, with exceptions for wof:name and the meso ID keys
for k, v in props.items():
# eg: wof:belongsto
#print "key: %s (%s)" % (k,type(k))
#print "new value: %s (%s)" % (v,type(v))
#try:
# if existing_feature['properties'][k]:
# print "old value: %s (%s)" % (existing_feature['properties'][k],type(existing_feature['properties'][k]))
#except:
# print "old value: n/a"
# eg: wof:belongsto or wof:hierarchy
if type(v) == type(list()):
# start from whatever list the existing record already has for this key
new = existing_feature['properties'].get(k, [])
# crazy multiple hierarchy
if k == 'wof:hierarchy' and len(new) > 0:
if type(new[0]) == type(dict()):
# just update the first hierarchy: keys from the first incoming
# hierarchy dict overwrite the same keys in the first existing one;
# any further entries in either list are left alone
for dict_k, dict_v in v[0].items():
new[0][dict_k] = dict_v
# else assume sanity
# NOTE(review): with indentation lost it is not certain which `if` this
# `else` pairs with; it reads most naturally as the fallback for every list
# that is not a non-empty wof:hierarchy -- confirm.
else:
# append only values that are not already present
for val_in_list in v:
if val_in_list not in new:
new.append(val_in_list)
existing_feature['properties'][k] = new
#print 'again new new : %s' % (existing_feature['properties'][k],)
# eg: wof:concordances
elif type(v) == type(dict()):
# incoming keys win over existing ones; keys absent from v are kept
new = existing_feature['properties'].get(k, {})
#print 'new new new : %s' % (new,)
for dict_k, dict_v in v.items():
#print ' %s, %s' % (dict_k,dict_v)
new[dict_k] = dict_v
existing_feature['properties'][k] = new
#print 'again new new : %s' % (existing_feature['properties'][k],)
# eg: wof:name
else:
# an empty incoming name never overwrites the existing wof:name
if k == "wof:name":
if v != "":
existing_feature['properties'][k] = v
else:
# every other scalar is copied across, except the importer's own
# meso:wof_id / meso:wof:id keys, which are not written to the record
if k != "meso:wof_id" and k != "meso:wof:id":
existing_feature['properties'][k] = v
#print pprint.pformat(existing_feature['properties'])
# now export the modified record
exporter.export_feature(existing_feature)
# If not, export as new feature
# (reached when props has no truthy wof:id, including IDs cleared by the
# duplicate check earlier in the loop)
else:
print ' adding new feature to WOF'
exporter.export_feature(feature)
# Record, per input feature, how the WOF lookup went.
# NOTE: this has to happen after the export calls above, because _f is the
# original feature object (which is also the exported feature) and these
# debug_* keys must not leak into the exported record.
if options.debug and num_canidates > 0:
    debug_props = _f['properties']
    for debug_key, debug_value in (
        ('debug_wof_id', candidate_ids),
        ('debug_wof_candidates', num_canidates),
        ('debug_wof_results', num_wof_results),
        ('debug_wof_region_id', region_id),
        ('debug_wof_region_results', region_num_wof_results),
        ('debug_lat', debug_lat),
        ('debug_lng', debug_lng),
        ('debug_area', debug_area),
    ):
        debug_props[debug_key] = debug_value
# Every record has been processed by this point. When a debug path was given
# on the command line, dump the whole input collection (now carrying the
# 'debug_wof_id' and related key/value pairs) to that file as JSON.
if options.debug:
    debug_path = os.path.abspath(options.debug)
    with open(debug_path, 'w') as debug_handle:
        json.dump(data, debug_handle)
# OPTION 2
# Loop thru the debug file (data) again
# Looking for wof_id values that are not unique
# and operate on them: choose the "best", ditch the rest, probably making new records for that input.
## next step - figure out how to exclude by placetype.
## see if we complete a dry run, stash the diffs, etc.. basically do a dry run of this script to see how we're doing.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment