Instantly share code, notes, and snippets.

Embed
What would you like to do?
Compiling an extensive cities list from free geonames.org data
# This module provides a function to compile city data using the free data
# provided on http://download.geonames.org/export/dump.
# It uses the following files to compile a list of 4.4 million cities world-wide,
# including their name, state (administrative level 1), country, time zone,
# latitude, and longitude:
#
# * allCountries.txt (included in allCountries.zip)
# * countryInfo.txt
# * admin1CodesASCII.txt
#
defmodule Cities do
  @moduledoc """
  Namespace for tools that build a city list from geonames.org dump files.
  """

  defmodule Data do
    @moduledoc """
    Provides functions for compiling a raw data file.
    """

    # Output path used when the caller does not pass one explicitly.
    @default_output_file "cities.txt"

    # Value of the feature-class column (index 6) that marks a populated place
    # (city, village, ...) in the geonames dump. Only those rows are kept.
    @city_feature_class "P"

    @doc """
    Compiles the needed raw city data from the given files.

    Streams `location_file` line by line, keeps only rows whose feature class is
    `"P"`, replaces the state and country codes by the names found in
    `states_file` and `countries_file`, and writes one line per city to
    `output_file`:

        "city","state","country","timezone",latitude,longitude

    Text fields are wrapped in double quotes and embedded double quotes are
    doubled. A state or country code that has no entry in the lookup files is
    written as an empty string.

    ## Parameters

      * `location_file` - tab-separated location dump with 19 columns per row
      * `countries_file` - tab-separated file; column 1 is the country code,
        column 5 the country name; lines starting with `#` are ignored
      * `states_file` - tab-separated file; column 1 is the
        `"<country>.<state>"` code, column 2 the state name; lines starting
        with `#` are ignored
      * `output_file` - path of the file to write (defaults to `"cities.txt"`)

    Returns the `File.Stream` of the output file.

    Raises `MatchError` when a populated-place row does not have exactly 19
    columns, and `File.Error` when a file cannot be opened.
    """
    def compile(location_file, countries_file, states_file, output_file \\ @default_output_file) do
      states = compile_states(states_file)
      countries = compile_countries(countries_file)

      location_file
      |> File.stream!()
      # Split every line exactly once; the filter and the mapper share the result.
      |> Stream.map(&split_fields/1)
      |> Stream.filter(&city?/1)
      |> Stream.map(&to_attrs/1)
      |> Stream.map(&replace_state(&1, states))
      |> Stream.map(&replace_country(&1, countries))
      |> Stream.map(&to_line/1)
      |> Enum.into(File.stream!(output_file, [:write]))
    end

    # Splits one raw line into its tab-separated columns. The trailing newline
    # that `File.stream!/1` keeps on each line is dropped first so it cannot
    # end up inside the last column.
    defp split_fields(line) do
      line
      |> String.trim_trailing("\n")
      |> String.split("\t")
    end

    # True when the row's feature class is "P" (populated place). Rows with
    # fewer than seven columns yield `nil` here and are therefore skipped.
    defp city?(fields), do: Enum.at(fields, 6) == @city_feature_class

    # Picks the columns needed for the output out of a 19-column location row.
    # The match is deliberately strict: a populated-place row with a different
    # column count raises instead of producing shifted data.
    defp to_attrs(fields) do
      [
        _geonameid,
        name,
        _asciiname,
        _alternatenames,
        latitude,
        longitude,
        _feature_class,
        _feature_code,
        country_code,
        _cc2,
        # admin1 code; combined with the country code to look up the state name
        state,
        _admin2_code,
        _admin3_code,
        _admin4_code,
        _population,
        _elevation,
        _dem,
        timezone,
        _modification_date
      ] = fields

      %{
        city: name,
        state: state,
        country: country_code,
        timezone: timezone,
        latitude: latitude,
        longitude: longitude
      }
    end

    # Replaces the state code by its name; `nil` when the code is unknown.
    # Must run before `replace_country/2`, because the lookup key is built
    # from the still-unreplaced country code.
    defp replace_state(attrs, state_names) do
      %{attrs | state: Map.get(state_names, "#{attrs.country}.#{attrs.state}")}
    end

    # Builds a map of "<country>.<state>" code => state name.
    defp compile_states(file) do
      file
      |> File.stream!()
      |> Stream.filter(&data_line?/1)
      |> Map.new(fn line ->
        [key, value | _] = split_fields(line)
        {key, value}
      end)
    end

    # Replaces the country code by its name; `nil` when the code is unknown.
    defp replace_country(attrs, country_names) do
      %{attrs | country: Map.get(country_names, attrs.country)}
    end

    # Builds a map of country code (column 1) => country name (column 5).
    defp compile_countries(file) do
      file
      |> File.stream!()
      |> Stream.filter(&data_line?/1)
      |> Map.new(fn line ->
        [key, _, _, _, value | _] = split_fields(line)
        {key, value}
      end)
    end

    # Lookup files may contain "#" comment lines; blank lines are skipped as
    # well because they would otherwise fail the column match.
    defp data_line?(line) do
      not String.starts_with?(line, "#") and String.trim(line) != ""
    end

    # Renders one city as a CSV line terminated by a newline. Latitude and
    # longitude are written unquoted.
    defp to_line(attrs) do
      line =
        Enum.join(
          [
            quote_field(attrs.city),
            quote_field(attrs.state),
            quote_field(attrs.country),
            quote_field(attrs.timezone),
            attrs.latitude,
            attrs.longitude
          ],
          ","
        )

      "#{line}\n"
    end

    # Wraps a value in double quotes, doubling embedded quotes so that names
    # containing `"` stay parseable. `nil` (failed lookup) becomes `""`.
    defp quote_field(value) do
      escaped =
        value
        |> to_string()
        |> String.replace("\"", "\"\"")

      "\"#{escaped}\""
    end
  end
end
# Example usage: compile the city list from the three geonames.org files,
# which are expected to be present in the current working directory.
location_file = "allCountries.txt"
countries_file = "countryInfo.txt"
states_file = "admin1CodesASCII.txt"
Cities.Data.compile(location_file, countries_file, states_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment