Skip to content

Instantly share code, notes, and snippets.

@cimi
Created March 9, 2011 17:24
Show Gist options
  • Save cimi/862587 to your computer and use it in GitHub Desktop.
Save cimi/862587 to your computer and use it in GitHub Desktop.
Enriches geonames.org information with local city names drawn from Google
import urllib
import urllib2
import json
import pprint
import codecs
import sqlite3
import sys
import logging
import time
# range of cities to attempt geocoding
if len(sys.argv) < 3:
print "You must provide the range of ids to geocode"
exit()
RANGE_START = int(sys.argv[1])
RANGE_END = int(sys.argv[2])
# logging configuration
LOG_FILENAME = 'populate-' + str(RANGE_START) + '-' + str(RANGE_END) + '.log'
logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)
# list of languages supported by the google geocoding service
google_languages = [
"ar", "eu", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
"eu", "fa", "fi", "fil", "fr", "gl", "gu", "hi", "hr", "hu", "id",
"it", "iw", "ja", "kn", "ko", "lt", "lv", "ml", "mr", "nl", "no",
"pl", "pt", "ro", "ru", "sk", "sl", "sr", "sv", "tl", "ta", "te",
"th", "tr", "uk", "vi", "zh-CN", "zh-TW"
]
bing_languages = [
"cs", "da", "nl", "fi", "fr", "de", "it", "ja", "nb", "pt", "es", "sv", "en"
]
bing_locales = {
"cs" : "cs-CZ",
"da" : "da-DK",
"nl" : "nl-NL",
"fi" : "fi-FI",
"de" : "de-DE",
"it" : "it-IT",
"ja" : "ja-JP",
"nb" : "nb-NO",
"pt" : "pt-PT",
"es" : "es-ES",
"sv" : "sv-SE",
"en" : "en-US"
}
def init_db(cities_file, connection):
""" Reads the geonames file and puts the contents inside a database """
c = connection.cursor()
c.execute('''DROP TABLE cities;''')
c.execute('''
CREATE TABLE IF NOT EXISTS cities (
id INTEGER PRIMARY KEY AUTOINCREMENT,
geoname_id INTEGER,
geoname STRING,
ascii_name STRING,
local_name STRING,
en_name STRING,
alternate_names STRING,
cc STRING,
lat FLOAT,
lon FLOAT
); ''')
# column names for the cities file - city characteristics taken from geonames
columns = cities_file.readline().strip().split('\t')
# array for storing all the cities
cities = {}
for line in cities_file:
# populate city objects with each of the columns
tokens = line.strip().split('\t')
city = {}
for idx,column in enumerate(columns):
city[column] = tokens[idx]
if column == 'alternate_names' or column == 'name':
city[column] = city[column].decode('utf-8')
# insert into DB
sql = 'INSERT INTO cities (geoname_id, geoname, ascii_name, alternate_names, cc, lat, lon) VALUES (:geoname, :name, :ascii_name, :alternate_names, :cc, :lat, :lon)';
cur = c.execute(sql, city)
connection.commit()
sql = 'SELECT last_insert_rowid()'
res = c.execute(sql)
for row in c:
city['id'] = row[0]
cities[city['id']] = city
c.close()
return cities
def read_db(connection):
""" Reads the contents of the database file into memory and returns a dictionary of the form id => city """
cities = {}
c = connection.cursor()
c.execute('''SELECT * FROM cities;''')
columns = ["id", "geoname_id", "geoname", "ascii_name", "local_name", "en_name", "alternate_names", "cc", "lat", "lon"]
for row in c:
city = {}
for idx,x in enumerate(row):
city[columns[idx]] = x
cities[city['id']] = city
logging.debug(str(len(cities.keys())) + " cities loaded from database")
return cities
def dump_db(cities_file, connection):
""" Dumps the contents of the SQL database to disk in a plaintext file (tab separated) """
c = connection.cursor()
c.execute('''SELECT * FROM cities;''')
columns = ["id", "geoname_id", "geoname", "ascii_name", "local_name", "en_name", "alternate_names", "cc", "lat", "lon"]
cities_file.write("\t".join(columns) + "\n")
for row in c:
cities_file.write("\t".join(unicode(x).encode('utf-8') for x in row) + "\n")
def get_languages(countries_file):
""" Gets the languages associated with a country from the countries resource file """
countries = {}
for line in countries_file:
tokens = line.split('\t');
if tokens[1]:
languages = tokens[-1].strip().split(',')
default_languages = []
for language in languages:
default_languages.append(language.split('-')[0].strip("\""))
countries[tokens[1]] = default_languages
return countries
def get_local_name(city, languages, connection):
""" Determine a city's local name from the official language of it's country """
# if the city name has already been queried for, return it directly
if city['local_name'] != None and city['local_name'] != '#bing' and city['local_name'] != '#retry':
logging.debug("City was already in db: " + city['ascii_name'] + " == " + city['local_name'])
return city['local_name']
# select the default language
language = None
if city['cc'] in languages:
language = languages[city['cc']][0]
else:
language = "language not available"
if language in bing_languages:
try:
url = "http://dev.virtualearth.net/REST/v1/Locations/{1}?o=json&key=AlBXPeA6_Mj8OIvLRJvHvx7EYhf9URHvtnK4Oy30LfvVSUK0Mlm7NNef4IhcqJVi&c={0}".format(bing_locales[language], urllib.quote(city['ascii_name']))
logging.debug("Requesting " + url)
# wait one second before performing the actual request so that we don't get rejected by Google since we're not paying
time.sleep(1)
geocoding = json.loads(urllib2.urlopen(url).read())
if (geocoding['resourceSets']):
address = geocoding['resourceSets'][0]['resources'][0]['address']
city['local_name'] = address['locality'] if 'locality' in address.keys() else geocoding['resourceSets'][0]['resources'][0]['name']
logging.debug("Found " + city['local_name'])
else:
city['local_name'] = '#not_found'
logging.warn("City not found " + city['ascii_name'])
except urllib2.URLError:
# if the service rejected the error, log the occurrence
city['local_name'] = '#retry'
logging.warn("Request rejected " + url)
time.sleep(10)
except KeyError:
city['local_name'] = '#not_found'
logging.warn("City not found " + city['ascii_name'])
elif language in google_languages:
try:
url = "http://maps.googleapis.com/maps/api/geocode/json?language={0}&address={1}&sensor=false".format(language, urllib.quote(city['ascii_name']))
logging.debug("Requesting " + url)
# wait one second before performing the actual request so that we don't get rejected by Google since we're not paying
time.sleep(1)
geocoding = json.loads(urllib2.urlopen(url).read())
if geocoding['results']:
city['local_name'] = geocoding['results'][0]['address_components'][0]['long_name']
logging.debug("Found " + city['local_name'])
else:
city['local_name'] = '#not_found'
logging.warn("City not found " + city['ascii_name'])
except urllib2.URLError:
# if the service rejected the error, log the occurrence
city['local_name'] = '#retry'
logging.warn("Request rejected " + url)
time.sleep(10)
else:
# if the language is not supported by the geocoding provider, log the occurence
city['local_name'] = '#not supported'
logging.warn("Language not supported for " + city['ascii_name'] + ", " + language)
# if a local name could be determined, insert it in the db
if city['local_name']:
sql = "UPDATE cities SET local_name = :local_name WHERE id = :id"
c = connection.cursor()
c.execute(sql, city)
connection.commit()
conn = sqlite3.connect('cities.db')
# cities = init_db(open('cities15000.tsv'), conn)
languages = get_languages(open('countries.tsv'))
cities = read_db(conn)
for id,city in cities.items():
if id in range(RANGE_START, RANGE_END):
city['local_name'] = get_local_name(city, languages, conn)
dump_db(open('cities_out.tsv', 'w'), conn)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment