Skip to content

Instantly share code, notes, and snippets.

@ms8r
Last active December 28, 2016 15:35
Show Gist options
  • Save ms8r/d1cd85a8c876d95f4ab8f4950a6839cd to your computer and use it in GitHub Desktop.
Save ms8r/d1cd85a8c876d95f4ab8f4950a6839cd to your computer and use it in GitHub Desktop.
Uses Google Maps API to add lat/lon info to address data provided in a CSV file
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import re
import time
from collections import namedtuple
from functools import partial
import json
import logging
from tablib import Dataset
import googlemaps
import begin
# Address components containing one or more of the following regex's will be
# excluded from the search string (tested via `re.search`). These trip up the
# Google Maps search.
EXCLUDES = ['P.O.', 'Room', 'Building', 'Freezone', 'Panpro', 'PanPro',
'Floor', 'Postfach', 'Bldg', 'Suite', 'Cnr']
OutRec = namedtuple('OutRec', ['rslice', 'res_count', 'res_types',
'lat', 'lon', 'res_json'])
def new_label(cur_labels, new_label):
while new_label in cur_labels:
new_label = '_' + new_label
return new_label
def loc_items(row, loc_cols, excl_re=None):
"""
Given `row` (a sequence) will return components of search string derived
from `loc_cols` (a sequence of column indices) dropping any string between
commas that contains the regex `excl_re` (`re.search` not `re.match`).
"""
out = ', '.join([row[c].strip().rstrip(',') for c in loc_cols if row[c]])
return [s for s in out.split(', ')
if not excl_re or not re.search(excl_re, s)]
@begin.start(auto_convert=True, short_args=False)
@begin.logging
def main(apikey: """Google API key""",
infile: """input file with address data (column layout, separator can
be specified with `--insep`)""",
loccols: """labels of columns in `infile` (string specifying
whitespace separated list of labels) to be used to construct
the search string; labels must be sorted from most to least
detailed (e.g. 'street city country')""",
outcsv: """output file (tab separated); will contain data from
infile, augmented by string used for search, number or
results returned, location type of first result, lat, lon""",
insep: """column delimiter in `infile`""" = '\t',
outjson: """if specified, full data returned by Google Maps will be
serialized to this file""" = None,
delay: """optional delay in seconds between api requests""" = 0.0):
"""
Uses Google Maps API to add lat/lon info to address data provided in a
tsv/csv file. Input file can include arbitrary columns/layout. Columns to
be used for search are specified as CLI arguments. Specified columns will
be combined into a search string and submitted to the Google Maps API. If
no result is obtained for a given string, components of the string will be
successively dropped (from more to less detailed) until Google either
returns a result or the address components are exhausted. The number of
address components used and the number of results returned are included in
the results (`rslice` and `res_count` respectively). In case of multiple
results the first result's lat/lon will be used.
For Python versions earlier than 3.3 the `funcsigs` package from PyPi is
required.
"""
with open(infile, 'r') as fp:
al_ds = Dataset().load(fp.read())
for i, rec in enumerate(al_ds):
al_ds[i] = [re.sub(r'[\t\r\n]', ' ', str(r)) if r else r for r in rec]
# compile search elements and add as column:
loc_cols = [al_ds.headers.index(lc) for lc in loccols.split()]
ploc_items = partial(loc_items, loc_cols=loc_cols,
excl_re='|'.join(EXCLUDES))
al_ds.append_col(ploc_items, header=new_label(al_ds.headers, 'loc_str'))
gmaps = googlemaps.Client(key=apikey)
out = Dataset(headers=[new_label(al_ds.headers, f)
for f in OutRec._fields])
for index, row in enumerate(al_ds):
loc = row[-1]
logging.info('%7d: %s', index, loc)
res = []
i = 0
while not res and i < len(loc):
res = gmaps.geocode(', '.join(loc[i:]))
i += 1
res_rec = OutRec(
rslice=len(loc) - i + 1,
res_count=len(res),
res_types=res[0]['types'],
lat=res[0]['geometry']['location']['lat'] if res else '',
lon=res[0]['geometry']['location']['lng'] if res else '',
res_json=res)
logging.info('%7d: res_count=%3d | rslice=%2d | types=%s', index,
res_rec.res_count, res_rec.rslice, res_rec.res_types)
out.append(res_rec)
time.sleep(delay)
with open(outcsv, 'w') as foo:
foo.write(al_ds.stack_cols(out.subset(cols=out.headers[:-1])).tsv)
if outjson:
with open(outjson, 'w') as foo:
foo.write(out.subset(cols=out.headers[-1:]).json)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment