Last active
December 28, 2016 15:35
-
-
Save ms8r/d1cd85a8c876d95f4ab8f4950a6839cd to your computer and use it in GitHub Desktop.
Uses Google Maps API to add lat/lon info to address data provided in a CSV file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
import time | |
from collections import namedtuple | |
from functools import partial | |
import json | |
import logging | |
from tablib import Dataset | |
import googlemaps | |
import begin | |
# Address components containing one or more of the following regex's will be | |
# excluded from the search string (tested via `re.search`). These trip up the | |
# Google Maps search. | |
EXCLUDES = ['P.O.', 'Room', 'Building', 'Freezone', 'Panpro', 'PanPro', | |
'Floor', 'Postfach', 'Bldg', 'Suite', 'Cnr'] | |
OutRec = namedtuple('OutRec', ['rslice', 'res_count', 'res_types', | |
'lat', 'lon', 'res_json']) | |
def new_label(cur_labels, new_label): | |
while new_label in cur_labels: | |
new_label = '_' + new_label | |
return new_label | |
def loc_items(row, loc_cols, excl_re=None): | |
""" | |
Given `row` (a sequence) will return components of search string derived | |
from `loc_cols` (a sequence of column indices) dropping any string between | |
commas that contains the regex `excl_re` (`re.search` not `re.match`). | |
""" | |
out = ', '.join([row[c].strip().rstrip(',') for c in loc_cols if row[c]]) | |
return [s for s in out.split(', ') | |
if not excl_re or not re.search(excl_re, s)] | |
@begin.start(auto_convert=True, short_args=False) | |
@begin.logging | |
def main(apikey: """Google API key""", | |
infile: """input file with address data (column layout, separator can | |
be specified with `--insep`)""", | |
loccols: """labels of columns in `infile` (string specifying | |
whitespace separated list of labels) to be used to construct | |
the search string; labels must be sorted from most to least | |
detailed (e.g. 'street city country')""", | |
outcsv: """output file (tab separated); will contain data from | |
infile, augmented by string used for search, number or | |
results returned, location type of first result, lat, lon""", | |
insep: """column delimiter in `infile`""" = '\t', | |
outjson: """if specified, full data returned by Google Maps will be | |
serialized to this file""" = None, | |
delay: """optional delay in seconds between api requests""" = 0.0): | |
""" | |
Uses Google Maps API to add lat/lon info to address data provided in a | |
tsv/csv file. Input file can include arbitrary columns/layout. Columns to | |
be used for search are specified as CLI arguments. Specified columns will | |
be combined into a search string and submitted to the Google Maps API. If | |
no result is obtained for a given string, components of the string will be | |
successively dropped (from more to less detailed) until Google either | |
returns a result or the address components are exhausted. The number of | |
address components used and the number of results returned are included in | |
the results (`rslice` and `res_count` respectively). In case of multiple | |
results the first result's lat/lon will be used. | |
For Python versions earlier than 3.3 the `funcsigs` package from PyPi is | |
required. | |
""" | |
with open(infile, 'r') as fp: | |
al_ds = Dataset().load(fp.read()) | |
for i, rec in enumerate(al_ds): | |
al_ds[i] = [re.sub(r'[\t\r\n]', ' ', str(r)) if r else r for r in rec] | |
# compile search elements and add as column: | |
loc_cols = [al_ds.headers.index(lc) for lc in loccols.split()] | |
ploc_items = partial(loc_items, loc_cols=loc_cols, | |
excl_re='|'.join(EXCLUDES)) | |
al_ds.append_col(ploc_items, header=new_label(al_ds.headers, 'loc_str')) | |
gmaps = googlemaps.Client(key=apikey) | |
out = Dataset(headers=[new_label(al_ds.headers, f) | |
for f in OutRec._fields]) | |
for index, row in enumerate(al_ds): | |
loc = row[-1] | |
logging.info('%7d: %s', index, loc) | |
res = [] | |
i = 0 | |
while not res and i < len(loc): | |
res = gmaps.geocode(', '.join(loc[i:])) | |
i += 1 | |
res_rec = OutRec( | |
rslice=len(loc) - i + 1, | |
res_count=len(res), | |
res_types=res[0]['types'], | |
lat=res[0]['geometry']['location']['lat'] if res else '', | |
lon=res[0]['geometry']['location']['lng'] if res else '', | |
res_json=res) | |
logging.info('%7d: res_count=%3d | rslice=%2d | types=%s', index, | |
res_rec.res_count, res_rec.rslice, res_rec.res_types) | |
out.append(res_rec) | |
time.sleep(delay) | |
with open(outcsv, 'w') as foo: | |
foo.write(al_ds.stack_cols(out.subset(cols=out.headers[:-1])).tsv) | |
if outjson: | |
with open(outjson, 'w') as foo: | |
foo.write(out.subset(cols=out.headers[-1:]).json) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment