ihor-nahuliak/import_meso.py

## import_meso.py
#!/usr/bin/env python

# run original SHP file thru MapShaper to add label position (mps_y, mps_x) columns
#
# first add label position via mapshaper:
# mapshaper input.shp encoding=utf8 -each 'mps_x=$.innerX, mps_y=$.innerY' -o import_via_mapshaper.shp
# full example:
# mapshaper /usr/local/mapzen/countries/Chile/Admin_2/Chile_admin2.shp encoding=utf8 -each 'mps_x=$.innerX, mps_y=$.innerY' -o chile_adm2_via_mapshaper.shp
#
# now convert that SHP to GeoJSON format, which is easier to load into Python
# ogr2ogr -F GeoJSON converted.geojson import_via_mapshaper.shp
# full example:
# ogr2ogr -F GeoJSON chile_adm2_via_mapshaper.geojson chile_adm2_via_mapshaper.shp
#
# then run this script, like:
# python apply_wof_id_to_martin_shapes_using_wof_api.py -i chile_adm2_via_mapshaper.geojson -p county -c 85633057 -d CL -s meso

import sys
import os
import logging
import optparse
import json

import geojson
import pprint

import mapzen.whosonfirst.api.client
import mapzen.whosonfirst.utils
import mapzen.whosonfirst.export

import editdistance

logging.basicConfig(level=logging.INFO)

if __name__ == '__main__':

    opt_parser = optparse.OptionParser()

    # python apply_wof_id_to_martin_shapes_using_wof_api.py -i chile_adm2_via_mapshaper.geojson -p county -c 85633057 -d CL -s meso
    opt_parser.add_option('-i', '--input', dest='input', action='store', default=None, help='Where to read GeoJSON import file from')
    opt_parser.add_option('-o', '--output', dest='output', action='store', default="/usr/local/mapzen/whosonfirst-data/data", help='Where to write WOF records to')
    opt_parser.add_option('-g', '--debug', dest='debug', action='store', default=None, help='Where to write debug GeoJSON (with wof:id added)')
    opt_parser.add_option('-k', '--skip-wof-api', dest='skip', action='store_true', default=None, help='Skip running WOF API to look for existing features.')
    opt_parser.add_option('-p', '--placetype', dest='placetype', action='store', default=None, help='What WOF placetype')
    opt_parser.add_option('-c', '--country_id', dest='country_id', action='store', default=None, help='What country wof:id')
    opt_parser.add_option('-d', '--country_code', dest='country_code', action='store', default=None, help='What WOF (ISO) country code')
    opt_parser.add_option('-s', '--source', dest='wof_source', action='store', default='meso', help='What WOF data source identifier')
    opt_parser.add_option('-v', '--verbose', dest='verbose', action='store_true', default=False, help='Be chatty (default is False)')

    options, args = opt_parser.parse_args()

    # calling the file
    #raise Exception, "Y U RUN THIS? Usage: python scriptname.py import_via_mapshaper.geojson wof_placetype country_id namespace"

    # setup variables
    wof_placetype = ['country', 'region', 'macrocounty', 'county', 'locality', 'neighbourhood']

    geoplanet_placetype = {
        "Country"          : "country",
        "County"           : "county",
        "HistoricalCounty" : "county",        # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "LocalAdmin"       : "localadmin",
        "State"            : "region",
        "Suburb"           : "neighbourhood",
        "Town"             : "locality"
    }

    # there are only 8 places which don't have a geoplanet placetype
    # and also don't have a geonames.org placetype (gn_fcode)
    geonames_org_placetype = {
        "ADM1"  : "region",
        "ADM1H" : "region",                # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "ADM2"  : "county",
        "ADM2H" : "county",                # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "ADM3"  : "localadmin",
        "ADM3H" : "localadmin",            # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "ADM4"  : "localadmin",
        "ADM4H" : "localadmin",            # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "ADM5"  : "localadmin",
        "ADMD"  : "localadmin",
        "ADMDH" : "localadmin",            # this should be marked CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "ADMF"  : "locality",            # gov't building
        "AGRF"  : "locality",            # ag / farm
        "AIRB"  : "locality",            # airbase
        "AIRP"  : "locality",            # airport
        "ANS"   : "locality",            # ancient site
        "AREA"  : "locality",             # huh?
        "ATOL"  : "locality",
        "BAR"   : "locality",            # this should probably SKIP
        "BAY"   : "locality",            # this should probably SKIP
        "BLDG"  : "locality",
        "BLDO"  : "locality",
        "BP"    : "locality",            # boundary marker?
        "CH"    : "locality",            # church
        "CMP"   : "locality",
        "CMPMN" : "locality",
        "CMPRF" : "locality",
        "CMTY"  : "locality",
        "CNLSB" : "locality",
        "CNS"   : "locality",
        "COMC"  : "locality",
        "CONT"  : "locality",
        "COVE"  : "locality",
        "CRTR"  : "locality",
        "CST"   : "locality",
        "CSTL"  : "locality",
        "CSTM"  : "locality",
        "CTRS"  : "locality",
        "CULT"  : "locality",
        "DEVH"  : "locality",
        "DLTA"  : "locality",
        "DPOF"  : "locality",
        "DVD"   : "locality",
        "EST"   : "locality",
        "FCL"   : "locality",
        "FLD"   : "locality",
        "FRM"   : "locality",            # farm
        "FRMS"  : "locality",
        "FRMT"  : "locality",
        "GHSE"  : "locality",
        "GRAZ"  : "locality",
        "HLL"   : "locality",
        "HMSD"  : "locality",
        "HSE"   : "locality",            # house
        "HSEC"  : "locality",            # houses
        "HSP"   : "locality",            # hospital
        "HSPC"  : "locality",
        "HSTS"  : "locality",
        "HTL"   : "locality",
        "HUT"   : "locality",
        "HUTS"  : "locality",
        "INDS"  : "locality",
        "ISL"   : "locality",
        "ISLET" : "locality",
        "ISLS"  : "locality",
        "ISLX"  : "locality",
        "ITTR"  : "locality",
        "LCTY"  : "locality",
        "LEPC"  : "locality",
        "LK"    : "locality",
        "LTER"  : "locality",
        "MALL"  : "locality",
        "MAR"   : "locality",
        "MFG"   : "locality",
        "MKT"   : "locality",
        "MLSW"  : "locality",
        "MLWTR" : "locality",
        "MSQE"  : "locality",
        "MSTY"  : "locality",
        "MT"    : "locality",
        "MTS"   : "locality",
        "OBPT"  : "locality",
        "OCH"   : "locality",
        "PCL"   : "country",          # political entity
        "PCLD"  : "dependency",        # dependent political entity
        "PCLF"  : "dependency",        # freely associated state
        "PCLH"  : "country",        # historical political entity, CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "PCLI"  : "country",        # independent political entity
        "PCLIX" : "region",            # section of independent political entity
        "PCLS"  : "dependency",        # semi-independent political entity
        "PEN"   : "locality",
        "PGDA"  : "locality",
        "PK"    : "locality",
        "PKS"   : "locality",
        "PLN"   : "locality",
        "PPL"   : "locality",        # basic locality
        "PPLA"  : "locality",        # region capital      add new CAPITAL_OF list
        "PPLA2" : "locality",        # county capital      add new CAPITAL_OF list
        "PPLA3" : "locality",        # localadmin capital      add new CAPITAL_OF list
        "PPLA4" : "locality",        # localadmin capital      add new CAPITAL_OF list
        "PPLC"  : "locality",        # country capital      add new CAPITAL_OF list
        "PPLF"  : "locality",        # farm village
        "PPLG"  : "locality",        # seat of government of a political entity      add new CAPITAL_OF list
        "PPLH"  : "locality",        # historical populated place, CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "PPLL"  : "locality",        # populated locality
        "PPLQ"  : "locality",        # abandoned populated place, CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "PPLR"  : "locality",        # religious populated place
        "PPLS"  : "locality",        # populated places
        "PPLW"  : "locality",        # destroyed populated place, CESSATION DATE with today's date, and MZ:IS_CURRENT = 0
        "PPLX"  : "neighbourhood",
        "PRK"   : "locality",
        "PRN"   : "locality",
        "PRSH"  : "locality",
        "PRT"   : "locality",
        "PT"    : "locality",
        "RD"    : "locality",
        "RDGB"  : "locality",
        "RDGE"  : "locality",
        "RECG"  : "locality",
        "REST"  : "locality",
        "RESW"  : "locality",
        "RGN"   : "locality",
        "RHSE"  : "locality",
        "RLGR"  : "locality",
        "RNCH"  : "locality",
        "RR"    : "locality",
        "RSD"   : "locality",
        "RSRT"  : "locality",
        "RSTN"  : "locality",
        "RSV"   : "locality",
        "RUIN"  : "locality",
        "SAND"  : "locality",
        "SCH"   : "locality",
        "SCHC"  : "locality",
        "SCHT"  : "locality",
        "SHRN"  : "locality",
        "SHSU"  : "locality",
        "SPA"   : "locality",
        "SQR"   : "locality",
        "ST"    : "locality",
        "STDM"  : "locality",
        "STLMT" : "locality",
        "STM"   : "locality",
        "STMI"  : "locality",
        "STNB"  : "locality",
        "STNC"  : "locality",
        "STNF"  : "locality",
        "STNM"  : "locality",
        "STNR"  : "locality",
        "STRT"  : "locality",
        "SWMP"  : "locality",
        "TERR"  : "locality",
        "TMB"   : "locality",
        "TMPL"  : "locality",
        "TOWR"  : "locality",
        "TRB"   : "locality",
        "TRIG"  : "locality",
        "TWO"   : "locality",
        "UNIV"  : "locality",
        "VAL"   : "locality",
        "WAD"   : "locality",
        "WTRH"  : "locality",
        "ZN"    : "locality",
        "ZNB"   : "locality"
    }

    token = 'KEY...'
    api = mapzen.whosonfirst.api.client.OAuth2(token)

    # now the main logic
    # read from the system arguements

    # GeoJSON file to process
    dump = os.path.abspath(options.input)

    # where to put the results
    source = os.path.abspath(options.output)
    exporter = mapzen.whosonfirst.export.flatfile(source)

    placetype = options.placetype
    skip = options.skip
    verbose = options.verbose

    # TODO: rewrite this section

    # was a valid placetype specified?
    if placetype not in wof_placetype:
        raise Exception('boo: placetype fail :( choose one of:', wof_placetype)

    # the country wof:id is mandatory and must parse as an integer;
    # int(None) raises TypeError (option omitted), int('abc') raises ValueError
    try:
        country_id = int(options.country_id)
    except (TypeError, ValueError):
        raise Exception('boo: what country are you in, sir?')

    # out of 1.3 million records:
    # `iso` code is set on all but 250,000 records
    # `gn_country` is set for all but 129,000 records
    # iso and gn_country match for all but 2375 records
    # only 18 don't have either iso or gn_country set
    #
    # optparse always defines the attribute (None when the flag is omitted),
    # so reading it can never raise; test the value itself instead
    iso_country_code = options.country_code
    if not iso_country_code:
        raise Exception('boo: what ISO code does your country have, sir?')

    # --source defaults to 'meso'; only a missing (None) value is rejected here
    # because the per-feature code further down tolerates an empty namespace
    wof_src_namespace = options.wof_source
    if wof_src_namespace is None:
        raise Exception('boo: what WOF source namespace are you using, sir?')

    # Choose which input columns hold the label position and the admin-1 name.
    # The previous test (`== 'meso' or 'mz'`) was always true because the
    # non-empty string 'mz' is truthy, which made the else branch unreachable.
    if wof_src_namespace in ('meso', 'mz'):
        src_lat = 'mps_y'
        src_lng = 'mps_x'
        src_label = 'mapshaper'
        admin_1_name_prop = 'meso:admin_1'
    # assumes Quattroshapes point gazetteer (which is WRONG)
    else:
        src_lat = 'lat'
        src_lng = 'lng'
        src_label = 'qspg'
        admin_1_name_prop = 'qspg:name_adm1'

    # read in the GeoJSON file contents (text file) named on the command
    # line and parse them as a structured GeoJSON object; the with-block
    # closes the file handle even if parsing fails
    with open(dump, 'r') as fh:
        data = geojson.load(fh)

    # sometimes input have bad columns to skip
    property_key_skip = [
        "qs_id",            # empty
        "featurecla",       # empty
        "HASC",
        "NAME_EN",
        "mps_x",
        "mps_y",
        "SOURCE"
    ]

    # sometimes SHP files have stupid 10 char
    # UPPER names, make the WOF pretty
    #
    # maps an input property name to the WOF property it should be stored as;
    # several inputs may map to the same WOF key (name lists, concordances)
    property_key_remap = {
        "NAME_EN":    "name:eng_x_preferred",
        "NAME_LOCAL": "name:und_x_variant",
        "NAME_LOC"  : "name:und_x_variant",
        "NAME_ALT"  : "name:und_x_variant",
        "NAME_ALT1"  : "name:und_x_variant",
        "NAME_ALT2"  : "name:und_x_variant",
        "GAUL_ADMIN": "name:eng_x_variant",
        "GADM_ADMIN": "name:eng_x_variant",
        "GADM_HASC2": "wof:concordances:hasc:id",
        "HASC": "wof:concordances:hasc:id",
        #"SOURCE":     "src:geom",
        "mps:latitude" : "ms:latitude",
        # NOTE(review): this entry used to be a second, identical
        # "mps:latitude" key (silently collapsed by the dict literal);
        # presumed to be the longitude twin -- confirm against the input data
        "mps:longitude" : "ms:longitude",
        # quattroshapes point gazetteer re-up
        "name" : "name:eng_x_preferred",
        "gn_id" : "wof:concordances:gn_id",
        "woe_id" : "wof:concordances:woe_id",
        #"gn_id_eh" : "wof:concordances:gn_id_eh",
        #"woe_id_eh" : "wof:concordances:woe_id_eh",
        "gn_name" : "name:und_x_variant",
        "gn_ascii" : "name:und_x_variant",
        "woe_name" : "name:und_x_variant",
        "woe_nameen" : "name:eng_x_preferred",
        "qs_maybe" : "qspg_id"
    }

    # values that are integers
    keys_with_integer_values = [
        "OBJECTID",
        "qs_id",
        "gn_id",
        "woe_id",
        "gn_id_eh",
        "woe_id_eh",
        "scalerank",
        "natscale",
        "adm0cap",
        "worldcity",
        "megacity",
        "metro_core",
        "micro_core",
        "gn_pop",
        "parent_id",
        "woe_local",
        "woe_lau",
        "woe_adm2",
        "woe_adm1",
        "woe_adm0",
        "gns_id",
        "photos",
        "photos_all",
        "woemembers",
        "photos_1k",
        "photos_9k",
        "photos_sr",
        "photos_9r",
        "pop_sr",
        "temp_id",
        "qs_maybe"
    ]

    source_name_remap = {
        "AOTM": "meso"
    }

    wof_src_key = "src:geom"

    # sometimes the (SHP) keys above contain "no data"
    # which is conviently returned in a number of non-empty
    # forms. this is why we can't have nice things.
    no_data_vals = [
        "NO DATA",
        "0",
        ""
    ]

    # did we already use this feature to match
    # (if so, don't reuse it)
    matched_feature_ids = []

    first_property_key_remap_warning = True
    first_read_language_code_from_data_warning = True
    first_mapshaper_warning = True

    feature_counter = 1
    total_features = len(data['features'])

    # for each feature in GeoJSON feature collection
    for _f in data['features']:
        #sys.stdout.write( str(feature_counter) + ' of ' + str(total_features) )
        print str(feature_counter) + ' of ' + str(total_features)
        # print "%s of %s" % (feature_counter, total_features)

        feature_counter += 1

        #
        # SKIP this part since we're importing POLYGONS and we're
        # expecting explicate mz_lat, mz_lng properties
        #
        ## store the feature's geometry
        #geom = _f['geometry']
        ## store the feature's coordinates object (lat, lng)
        #coords = geom['coordinates']
        ## make that easier to work with later (for assigning label position)
        #lon, lat = coords

        # now for the feature's properties
        _p = _f['properties']

        wof_id_from_data_file_boolean = False

        # in some cases we might already have a WOF_ID, but probably not
        #
        # the legacy WOF_ID column wins when it carries a value; otherwise
        # fall back to the preferred wof:id property. (Indexing _p['wof:id']
        # directly raised KeyError whenever only WOF_ID was present, so the
        # legacy column was never actually honoured on its own.)
        raw_wofid = _p.get('WOF_ID')
        if raw_wofid is None:
            raw_wofid = _p.get('wof:id')

        try:
            # parse it as an integer so WOF is happy
            wofid = int(raw_wofid)
            wof_id_from_data_file_boolean = True
        except (TypeError, ValueError):
            # no id at all (None) or an unparseable value: treat as a new feature
            wofid = 0

        # for most the data that doens't have a WOF_ID yet
        # create a new feature that is the imported feature
        if wofid == 0:

            # set the geom and properties to match the imported feature
            feature = _f

            #"iso" : "",
            #"gn_country" : "",
            #"name_adm1" : "",
            #"name_adm0" : "",

            # add required WOF placetype
            # TODO: This smells like we should ALWAYS set this, even if feature already exists
            props = {
                # we add wof:id later
                # we add wof:name later
                'wof:placetype': placetype,
                'wof:hierarchy': [
                    { 'country_id': country_id }
                ],
                'wof:country': iso_country_code
                # ASSUMPTION: placetypes are added country, then region, then county, then, locality, then neighbourhood, etc
                # TODO: we are missing parent_id
                # The default for unknown parent_id is -1, but really we should PIP these things to find it out
            }

            # default the geometry source to the CL opt_parser value
            # (but this is overwriten below as data-driven per feature)
            if wof_src_namespace:
                props[wof_src_key] = wof_src_namespace

        # else we alreday have a WOF feature, so load that up and let's modify it
        else:
            feature = mapzen.whosonfirst.utils.load([source], wofid)
            props = feature['properties']

        # set WOF:name based on what field?
        if "name" in _p:
            wof_name_key = "name"
        elif "NAME_EN" in _p:
            wof_name_key = "NAME_EN"
        elif "NAME_LOCAL" in _p:
            wof_name_key = "NAME_LOCAL"
        elif "NAME_LOC" in _p:
            wof_name_key = "NAME_LOC"
        else:
            raise Exception, ('boo: data needs either name, NAME_EN, or NAME_LOCAL fields')


        # now selectively add data properties from import_via_mapshaper file
        # making sure to sanitize their property (key) names
        # NOTE: all other properties not in the property_key_remap list will be skipped
        try:
            for orig_key, sanitized_key in property_key_remap.items():
                #print "%s, %s" % (orig_key, sanitized_key)
                #try:
                #    print "\t%s: %s" % (orig_key, _p[ orig_key ])
                #except:
                #    continue
                try:

                    if _p[ orig_key ] == None:
                        # continue
                        props[ sanitized_key ] = ''
                    else:
                        #
                        # Special WOF-ism
                        #
                        #print "key: %s, value: %s" % (orig_key, _p[ orig_key ])
                        # store the WOF name
                        if orig_key == wof_name_key:
                            if _p[ orig_key ] in no_data_vals:
                                props[ "wof:name" ] = ""
                            else:
                                props[ "wof:name" ] = unicode(_p[ orig_key ])

                            print '1: props[ "wof:name" ]: %s' % (props[ "wof:name" ],)

                            if verbose:
                                print "  wof:name being evaluted (for import or concordance): " + props[ "wof:name" ] + " (" + _p[ orig_key ] + ")"

                            # if the data has a source specified, try to be data driven
                            if _p[ sanitized_key ] == wof_src_key:

                                # make it easier to read the later logic
                                input_src_value = _p[ orig_key ]

                                # but sometimes the data values are bad
                                if input_src_value in source_name_remap:
                                    # set the final property to the remapped value
                                    props[ sanitized_key ] = source_name_remap[ input_src_value ]
                                else:
                                    # In this case don't take the unicode of the input src as WOF sources are only ascii7
                                    props[ sanitized_key ] = input_src_value

                        print '2: props[ "wof:name" ]: %s' % (props[ "wof:name" ],)

                        #
                        # For the explicate property lookups in property_key_remap
                        #
                        #
                        # names are lists
                        if ("name:" in sanitized_key):

                            # skip this key if the data is bad, but if it's good, do the following
                            if _p[ orig_key ] not in no_data_vals:

                                # langauge codes must be 3 characters long
                                # https://en.wikipedia.org/wiki/ISO_639
                                # eg: eng (English), fra (French), deu (Geramn), zho (Taiiwan), vie (Vietnamese), rus (Russian), ukr (Ukrainian)

                                # defualt to und (undefined) langauge code
                                full_language_key = sanitized_key

                                # do we know the language code via DATA?
                                try:
                                    # shapefiles can only have column names up to 10 characters
                                    # if you know the 3 char language code of the column, specify that
                                    # like: foobarlc (where foobar is an existing column name)

                                    # what data column should we look for?
                                    if len(orig_key) > 8:
                                        language_key = orig_key[0:8] + 'LC'
                                    else:
                                        language_key = orig_key + 'LC'

                                    # can we read a 3 char language code from the data?
                                    if _p[ language_key ] is not None:
                                        if len(_p[ language_key ]) == 3:
                                            full_language_key = 'name:' + _p[ language_key ] + '_x_variant'
                                except:
                                    if first_read_language_code_from_data_warning:
                                        print "problems with determining language codes from data :("
                                        first_read_language_code_from_data_warning = False

                                #
                                # We finally have a vaue to add, with a language code!
                                #
                                # test to see if we already names in that language code


                                langs = props.get(full_language_key, [])

                                # only add unique names
                                if unicode(_p[ orig_key ]) not in langs:
                                    langs.append(unicode(_p[ orig_key ]))

                                props[full_language_key] = langs

                                # if we don't have a wof:name, store one of the alts as the name
                                print '  props.get("wof:name"): ' + props.get("wof:name")

                                if props.get("wof:name", "") == "":
                                    props["wof:name"] = unicode( _p[ orig_key ] )

                        # concordances are objects
                        elif sanitized_key.startswith("wof:concordances:"):
                            print "found a concordance"

                            if unicode(_p[ orig_key ]) not in no_data_vals:
                                print "found a concordance: %s" % (_p[ orig_key ],)
                                #concordance_namespace = sanitized_key.rsplit(":", 1)
                                # ignore the first 17 characters, they are only ever "wof:concordances:"
                                concordance_namespace = sanitized_key[17:]

                                k = concordance_namespace
                                v = unicode(_p[ orig_key ])

                                concordances = props.get("wof:concordances", {})
                                concordances[ k ] = v

                                props['wof:concordances'] = concordances

                        #everything else is straight properties
                        else:
                            props[ sanitized_key ] = unicode(_p[ orig_key ])
                except:
                    if first_property_key_remap_warning:
                        print '\t' + orig_key + ' not found in import, skipping'
                        first_property_key_remap_warning = False
        except Exception, e:
            print pprint.pformat(_p)
            raise Exception, e

        # always record the original values into a new namespace (specified in CL arguement)
        try:
            for prop_key, prop_val in _p.items():
                # only export the k,v when it's not in the black list
                if prop_key not in property_key_skip and wof_src_namespace is not 'mz':
                    props[ wof_src_namespace + ":" + prop_key.lower() ] = prop_val    #unicode( prop_val )
        except Exception, e:
            print pprint.pformat(_p)
            raise Exception, e

        # TODO: this should only be set on NEW features, not EXISTING features
        # someday MapShaper process will be built into exportify, until then...
        try:
            props['lbl:latitude']  = _p[ src_lat ]
            props['lbl:longitude'] = _p[ src_lng ]
            # TODO: is this right?
            props['src:lbl:centroid'] = src_label
        except Exception, e:
            if first_mapshaper_warning:
                print 'oops, no LABEL latitude and longitude label values found :('
                first_mapshaper_warning = False
            # print out the properties the feature DOES have
            print pprint.pformat(_p)
            # it was missing one of the properties we expected, which one?
            raise Exception, e

        if verbose:
            print props

        debug_lat = None
        debug_lng = None
        debug_area = None

        num_wof_results = 0
        candidate_ids = ''

        ## fail safe: all features need a wof:id before we import them
        if not props.get('wof:id', False):

            # TODO: We often know what region a feature is in
            #       add logic to determine the wof:id of that region, and then scope the next piece to that region_id (not country_id)
            #       but if region can't be determined, still use country_id


            admin_1_name = props.get(admin_1_name_prop, None)

            region_id = None
            region_num_wof_results = 0

            if admin_1_name:

                method = 'whosonfirst.places.search'
                args = {'names': admin_1_name.encode("utf8"), 'placetype': 'region', 'country_id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'}

                if verbose:
                    print '  search args: ' + unicode(args)

                #curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100'
                ## print pprint.pformat(rsp)

                pages = None
                page = 1
                feature_area = 0

                # if you don't have any pages yet, or we have more pages to look at
                while not pages or page <= pages:
                    args['page'] = page
                    rsp = api.execute_method(method,args)
                    if not pages:
                        pages = rsp['pages']

                        if pages == 0:
                            break

                    region_num_wof_results += len(rsp['results'])

                    if verbose:
                        print "  %d of %d" % (page, pages)
                    print "  %s REGION results found for %s" % (region_num_wof_results,props['wof:name'])

                    for row in rsp['results']:
                        ## print row
                        if row['wof:placetype'] == 'venue':
                            continue

                        if verbose:
                            print "    %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude'])

                        # pick one to set a WOF ID for, else it's still null
                        # NOTE: if you have two points with same area of 0, then it'll choose the last one
                        #
                        # TODO: add validatation that result is "near" the input (0.1 DD or 1.0 DD, etc)
                        #
                        if rsp['total'] == 1 or row['geom:area'] > feature_area:
                            region_id = row['wof:id']


                        # TODO: look at varient names, all names (and some edit distance ranking)
                        # TODO: look at the lat,lng distances between all these place
                    page+=1


            if not props['wof:name'] == '' and not skip:
                method = 'whosonfirst.places.search'

                if region_id:
                    args = {'names': props['wof:name'].encode("utf8"), 'placetype': placetype, 'region_id': region_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
                else:
                    args = {'names': props['wof:name'].encode("utf8"), 'placetype': placetype, 'country_id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'}

                if verbose:
                    print '  search args: ' + unicode(args)

                #curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100'
                ## print pprint.pformat(rsp)

                pages = None
                page = 1
                feature_area = 0
                candidates = []
                # example candidate object
                #{
                #    id = None
                #    name = None
                #    lat = None
                #    lng = None
                #    area = 0
                #}

                # if you don't have any pages yet, or we have more pages to look at
                while not pages or page <= pages:
                    args['page'] = page
                    rsp = api.execute_method(method,args)
                    if not pages:
                        pages = rsp['pages']

                        if pages == 0:
                            break

                    num_wof_results += len(rsp['results'])

                    if verbose:
                        print "  %d of %d" % (page, pages)
                        print "  %s results found for %s" % (num_wof_results,props['wof:name'])

                    for row in rsp['results']:
                        possible_canidate = False

                        ## print row
                        if row['wof:placetype'] == 'venue':
                            continue

                        if verbose:
                            print "    %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude'])

                        # TODO: pick one to set a WOF ID for, else it's still null
                        # NOTE: if you have two points with same area of 0, then it'll choose the last one
                        if rsp['total'] == 1:
                            possible_candidate = True

                        # Prefer result with the largest area (or really: polygons over points)
                        if row['geom:area'] > feature_area:
                            #feature_area = row['geom:area']
                            possible_candidate = True

                        # Prefer exact name matches
                        if row['wof:name'] == props['wof:name']: #.encode("utf8")
                            possible_candidate = True

                        if possible_candidate:
                            candidates.append( { 'id'   : row['wof:id'],
                                                 'name' : row['wof:name'],
                                                 'lat'  : row['geom:latitude'],
                                                 'lng'  : row['geom:longitude'],
                                                 'area' : row['geom:area']
                                            } )

                    page+=1

                # If we still don't have any candidates, get features of that country of that placetype to evaluate
                if len(candidates) == 0:
                    # TODO: look at varient names, all names (and some edit distance ranking), and default to exact match if multiple
                    # TODO: if no name, no all names, just get all the county children and do edit distance

                    #curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.getDescendants&access_token=06e2ccbf23ab42122963e68c887e87b4&id=85633805&placetype=county&page=1&per_page=100'

                    method = 'whosonfirst.places.getDescendants'

                    if region_id:
                        args = {'placetype': placetype, 'id': region_id, 'extras':'geom:area, geom:latitude, geom:longitude'}
                    else:
                        args = {'placetype': placetype, 'id': country_id, 'extras':'geom:area, geom:latitude, geom:longitude'}

                    if verbose:
                        print '  descendant args: ' + unicode(args)

                    #curl -X GET 'https://whosonfirst.mapzen.com/api/rest/?method=whosonfirst.places.search&access_token=d60c97c1d8b353e2e7a69fcacfe72eb7&name=Iquique&placetype=county&country_id=85633057&page=1&per_page=100'
                    ## print pprint.pformat(rsp)

                    pages = None
                    page = 1
                    feature_area = 0

                    # if you don't have any pages yet, or we have more pages to look at
                    while not pages or page <= pages:
                        args['page'] = page
                        rsp = api.execute_method(method,args)
                        if not pages:
                            pages = rsp['pages']

                            if pages == 0:
                                break

                        num_wof_results += len(rsp['results'])

                        if verbose:
                            print "  %d of %d" % (page, pages)
                            print "  %s results found for %s" % (num_wof_results,props['wof:name'])

                        for row in rsp['results']:
                            possible_canidate = False

                            ## print row
                            if row['wof:placetype'] == 'venue':
                                continue

                            if verbose:
                                print "    %d, %s (%f, %f, %f)" % (row['wof:id'], row['wof:name'], row['geom:area'], row['geom:latitude'], row['geom:longitude'])

                            # Name edit distance
                            leve_dist = editdistance.eval( row[ 'wof:name' ], props['wof:name'])
                            leve_dist_length1 = len( row[ 'wof:name' ] )
                            leve_dist_length2 = len( props['wof:name'] )
                            # converts the edit distance into a percentage based on the actual edit distance and the length of the inputs
                            leve_rank = (leve_dist * 1.0) / max( leve_dist_length1, leve_dist_length2 )
                            # DEBUG
                            #print leve_rank, leve_dist, max( leve_dist_length1, leve_dist_length2 ), row['wof:name'], props['wof:name']

                            if leve_rank < 0.3:
                                possible_canidate = True

                            #TODO
                            # If it's within 1 decimal degree of our input search, then append it to candiate list, else ignore it

                            if possible_canidate:
                                candidates.append( { 'id'   : row['wof:id'],
                                                     'name' : row['wof:name'],
                                                     'lat'  : row['geom:latitude'],
                                                     'lng'  : row['geom:longitude'],
                                                     'area' : row['geom:area']
                                                } )

                        page+=1

                # after we've reviewed all possible candidates, let's pick one
                if len(candidates) > 0:
                    #for loop to pick the best one
                    best_candidate = None

                    # TODO
                    print str(len(candidates)) + ' candidates being evaluated...'
                    #print candidates

                    # DEBUG
                    #best_candidate = candidates[0]

                    for candidate in candidates:
                        # for debug,  we want to know which other candidates it might have
                        # matched to to QA if the script did the right thing
                        # (without having to manually recreate the script)
                        if len(candidate_ids) > 0:
                            candidate_ids = candidate_ids + ',' + str(candidate[ 'id' ])
                        else:
                            candidate_ids = str(candidate[ 'id' ])

                        # DEBUG
                        # print candidate

                        #
                        # Scoring section
                        #
                        # Default the score to 0, then add scores based on different factors below
                        candidate[ 'score' ] = 0

                        # Prefer exact names, with area
                        if candidate[ 'name' ] == props['wof:name'] and candidate[ 'area' ] > 0:
                            candidate[ 'score' ] += 1

                        if candidate[ 'name' ] == props['wof:name']:
                            candidate[ 'score' ] += 1

                        # Name edit distance
                        leve_dist = editdistance.eval( candidate[ 'name' ], props['wof:name'])
                        leve_dist_length1 = len( candidate[ 'name' ] )
                        leve_dist_length2 = len( props['wof:name'] )
                        # converts the edit distance into a percentage based on the actual edit distance and the length of the inputs
                        leve_rank = (leve_dist * 1.0) / max( leve_dist_length1, leve_dist_length2 )
                        candidate['leve_dist'] = leve_rank

                        if leve_rank < 0.3 and candidate[ 'area' ] > 0:
                            candidate[ 'score' ] += 0.3

                        if leve_rank < 0.3:
                            candidate[ 'score' ] += 0.3

                        # Prefer features with area
                        if candidate[ 'area' ] > 0:
                            candidate[ 'score' ] += 1

                        # TODO: else prefer most similar names, that have area
                            # In New Zealand, MESO Waikato District is mapping to WOF Waikato District, but MESO South Waikato District is also mapping to WOF Waikato District, and shouldn't
                        # TODO: else look at the lat,lng distances between all these place
                        # if we have a winner, set best_candidate to that winner

                    #TODO
                    # sort the canidates by score, and choose the 1st one
                    candidates_sorted = sorted(candidates, key=lambda x: x['score'], reverse=True)

                    # make sure that we're not reusing the same candidate over and over
                    # todo: keep going until you've looked at all the possible canidates
                    if candidates_sorted[0] not in matched_feature_ids:

                        # set best to 1st in sorted list
                        best_candidate = candidates_sorted[0]

                    if verbose:
                        print "candidates_sorted"
                        print candidates_sorted
                        print "best_candidate"
                        print best_candidate

                    # we have a winner
                    if best_candidate:
                        props['wof:id']     = best_candidate[ 'id' ]
                        debug_lat  = best_candidate[ 'lat' ]
                        debug_lng  = best_candidate[ 'lng' ]
                        debug_area = best_candidate[ 'area' ]

            else:
                # TODO: get smarter about searching around me in the raw data on the local machine?
                if skip:
                    print "    ASSUMING NO CONCORDANCE exists, per your skip request"
                else:
                    print "    SKIPPING WOF CONCORDANCE for feature, it has no name :("

        if verbose:
            print "  %d %s" % (props.get('wof:id', -1), props.get('wof:name', None))

        # we previously set the feature's geometry based on the import
        # now also set the feature's properties based on the sanitized props
        feature['properties'] = props

        # mapzen.whosonfirst.pip.utils.append_hierarchy_and_parent(feature,data_root="...")
        # this will allow us to append hierarchies to our new mesoshapes

        # print props
        # print pprint.pformat(props)


        #print feature

        # TODO: take advantage of parent_id to load that feature and copy append all of it's hierarchy onto this new record

        print candidate_ids

        # ids are stored in a string that is comma delim
        num_canidates = candidate_ids.count(',')
        # but if there is only a single candidate there is no delim, so test for that case
        if num_canidates == 0 and len(candidate_ids) > 0:
            num_canidates = 1

        if props.get('wof:id') in matched_feature_ids:
            print '\toops, already used %s on another feature' % (props.get('wof:id'),)
            if wof_id_from_data_file_boolean:
                print '\toops, you specified a duplicate WOF ID in your input data file: %s' %s (props['wof:id'],)

            # dont' reuse something again
            props['wof:id'] = None
        else:
            # mark it so it's not reused in later features in this run
            matched_feature_ids.append( props.get('wof:id') )

        # Did we find a concordance for this feature
        if props.get('wof:id'):
            print '  found exising WOF record! %s' % (props.get('wof:id'),)

            # TODO: make this path a command line argument (repeat)
            existing_feature = mapzen.whosonfirst.utils.load(source, props.get('wof:id'))
            #print pprint.pformat(feature['properties'])

            #
            # Preserve existing geometry as an alt
            #
            #
            # filename template
            # (taken care of by exporter.export_alt_feature)
            #
            # 85922583-alt-mapzen.geojson
            #
            #
            # properties template
            # (our responsibility)
            #
            # "properties":{
            #    "src:geom":"mapzen",
            #    "wof:id":85922583
            #}

            alt_geom = {'type': 'Feature'}

            # what geom is the existing feature src from?
            src_geom_namespace = existing_feature['properties'].get('src:geom', None)

            # some data was imported incorrectly (which is sad)
            if not src_geom_namespace:
                if existing_feature['properties'].get('qs:name', None):
                    src_geom_namespace = 'quattroshapes'

            # OPTION 1
            if src_geom_namespace == 'meso':
                print "  hmmm, we've been around this corner before. let's treat 2nd viewing as new WOF feature to add"
                #housekeeping --- really this should be the newly minted WOF:id???
                feature['properties']['wof:id']     = None
                # This will definitely result in a few scrambled records
                # You can look at the debug files counting by wof_id candidate to figure out which to manually review
                exporter.export_feature(feature)
                #skip further logic in loop
                continue

            # get the list, else create an empty list
            alt_geom_list = existing_feature['properties'].get('src:geom_alt', [])

            if src_geom_namespace:
                alt_geom['properties'] = {
                    'wof:id': props.get('wof:id'),
                    'src:geom': str(src_geom_namespace).lower()
                }

                #print str(alt_geom['properties'])

                alt_geom['geometry'] = existing_feature['geometry']

                # TODO: why this not write out file?
                alt_path = exporter.export_alt_feature(alt_geom, alt=alt_geom['properties']['src:geom'])
                print "EXPORTED %s (%s)" % (alt_path, alt_geom['properties']['src:geom'])

                # document that we created an alt geom
                alt_geom_list.append( alt_geom['properties']['src:geom'] )

                #print 'alts: ' + str(alt_geom_list)

                # record that new list onto the existing feature
                existing_feature['properties']['src:geom_alt'] = alt_geom_list

                #print existing_feature

            else:
                print "  oops, not valid WOF record, missing src:geom property, skipping alt-geom creation"

            #
            # Now the primary WOF record
            #

            # import the geometry to the existing feature
            existing_feature['geometry'] = _f['geometry']

            # TODO: iterate thru all the props, adding them
            for k, v in props.items():
                # eg: wof:belongsto
                #print "key:   %s (%s)" % (k,type(k))
                #print "new value: %s (%s)" % (v,type(v))
                #try:
                #    if existing_feature['properties'][k]:
                #        print "old value: %s (%s)" % (existing_feature['properties'][k],type(existing_feature['properties'][k]))
                #except:
                #    print "old value: n/a"

                # eg: wof:belongsto or wof:hierarchy
                if type(v) == type(list()):
                    new = existing_feature['properties'].get(k, [])

                    # crazy multiple hiearchy
                    if k == 'wof:hierarchy' and len(new) > 0:
                        if type(new[0]) == type(dict()):
                            # just update the first hierarchy
                            for dict_k, dict_v in v[0].items():
                                new[0][dict_k] = dict_v
                    # else assume sanity
                    else:
                        for val_in_list in v:
                            if val_in_list not in new:
                                new.append(val_in_list)

                    existing_feature['properties'][k] = new

                    #print 'again new new : %s' % (existing_feature['properties'][k],)

                # eg: wof:concordances
                elif type(v) == type(dict()):
                    new = existing_feature['properties'].get(k, {})
                    #print 'new new new : %s' % (new,)

                    for dict_k, dict_v in v.items():
                        #print '  %s, %s' % (dict_k,dict_v)

                        new[dict_k] = dict_v

                        existing_feature['properties'][k] = new

                        #print 'again new new : %s' % (existing_feature['properties'][k],)
                # eg: wof:name
                else:
                    if k == "wof:name":
                        if v != "":
                            existing_feature['properties'][k] = v
                    else:
                        if k != "meso:wof_id" and k != "meso:wof:id":
                            existing_feature['properties'][k] = v

            #print pprint.pformat(existing_feature['properties'])

            # now export the modified record
            exporter.export_feature(existing_feature)

        # If not, export as new feature
        else:
            print '  adding new feature to WOF'
            exporter.export_feature(feature)

        # Annotate the input record with what the matching found for it.
        # NOTE: this needs to be after features are exported as we're modifying the original
        #       feature (which is also the exported feature)
        if options.debug and num_canidates > 0:
            debug_values = (
                ('debug_wof_id', candidate_ids),
                ('debug_wof_candidates', num_canidates),
                ('debug_wof_results', num_wof_results),
                ('debug_wof_region_id', region_id),
                ('debug_wof_region_results', region_num_wof_results),
                ('debug_lat', debug_lat),
                ('debug_lng', debug_lng),
                ('debug_area', debug_area),
            )
            for debug_key, debug_value in debug_values:
                _f['properties'][debug_key] = debug_value

    # Every record has been processed: when a debug path was requested, write the
    # whole data set (which now has the new 'debug_wof_id' key, value pairs) to it
    if options.debug:
        with open(os.path.abspath(options.debug), 'w') as debug_outfile:
            json.dump(data, debug_outfile)

    # OPTION 2
    # Loop thru the debug file (data) again
    # Looking for wof_id values that are not unique
    # and operate on them: choose the "best", ditch the rest, probably making new records for that input.

## Next step: figure out how to exclude by placetype.

## Also: do a dry run of this script and stash the diffs, to see how well the matching is doing.