Created
July 14, 2015 08:46
-
-
Save kumekay/7b4fc3a6d697c5faa8b4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": { | |
"collapsed": true | |
}, | |
"source": [ | |
"\n", | |
"## Parse cities and countries from geonames.org" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"import sys\n", | |
"\n", | |
"csv.field_size_limit(sys.maxsize)\n", | |
"\n", | |
"wdir = '/Users/ku/data/'" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## The table 'alternate names' :\n", | |
"\n", | |
"Source: http://download.geonames.org/export/dump/alternateNames.zip\n", | |
"\n", | |
"0. alternateNameId : the id of this alternate name, int\n", | |
"1. geonameid : geonameId referring to id in table 'geoname', int\n", | |
"2. isolanguage : iso 639 language code 2- or 3-characters; 4-characters 'post' for postal codes and 'iata','icao' and faac for airport codes, fr_1793 for French Revolution names, abbr for abbreviation, link for a website, varchar(7)\n", | |
"3. alternate name : alternate name or name variant, varchar(200)\n", | |
"4. isPreferredName : '1', if this alternate name is an official/preferred name\n", | |
"5. isShortName : '1', if this is a short name like 'California' for 'State of California'\n", | |
"6. isColloquial : '1', if this alternate name is a colloquial or slang term\n", | |
"7. isHistoric : '1', if this alternate name is historic and was used in the past\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"names_ru = {}\n", | |
"with open(wdir + \"alternateNames.txt\", 'rb') as csvfile: \n", | |
" idreader = csv.reader(csvfile, delimiter='\\t')\n", | |
" for r in idreader:\n", | |
" if r[2] == \"ru\":\n", | |
" names_ru[r[1]]=r[3]\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## The main 'geoname' table has the following fields :\n", | |
"\n", | |
"Source: http://download.geonames.org/export/dump/allCountries.zip\n", | |
"\n", | |
"0. geonameid | integer id of record in geonames database |\n", | |
"1. name | name of geographical point (utf8) varchar(200)\n", | |
"2. asciiname | name of geographical point in plain ascii characters, varchar(200)\n", | |
"3. alternatenames | alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)\n", | |
"4. latitude | latitude in decimal degrees (wgs84)\n", | |
"5. longitude | longitude in decimal degrees (wgs84)\n", | |
"6. feature class | see http://www.geonames.org/export/codes.html, char(1)\n", | |
"7. feature code | see http://www.geonames.org/export/codes.html, varchar(10)\n", | |
"8. country code | ISO-3166 2-letter country code, 2 characters\n", | |
"9. cc2 | alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters\n", | |
"10. admin1 code | fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)\n", | |
"11. admin2 code | code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) \n", | |
"12. admin3 code | code for third level administrative division, varchar(20)\n", | |
"13. admin4 code | code for fourth level administrative division, varchar(20)\n", | |
"14. population | bigint (8 byte int) \n", | |
"15. elevation | in meters, integer\n", | |
"16. dem | digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.\n", | |
"17. timezone | the timezone id (see file timeZone.txt) varchar(40)\n", | |
"18. modification date | date of last modification in yyyy-MM-dd format" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 30, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Localized: 238621 Not localized: 3471962 Without country: 0\n", | |
"Last row ['8534443', 'Nhlangano', None, '-19.85776', '27.82236', 'zw', '']\n" | |
] | |
} | |
], | |
"source": [ | |
"not_localized = 0\n", | |
"localized = 0\n", | |
"without_country = 0\n", | |
"ro = []\n", | |
"with open(wdir + \"allCountries.txt\", 'rb') as csvfile: \n", | |
" with open(wdir + \"out_cities.csv\", 'wb') as csvoutfile:\n", | |
" pwriter = csv.writer(csvoutfile, dialect='excel')\n", | |
" idreader = csv.reader(csvfile, delimiter='\\t')\n", | |
" pwriter.writerow(['id','name_en', 'name_ru', 'latitude', 'longitude', 'code', 'population'])\n", | |
" for r in idreader:\n", | |
" name_ru = None\n", | |
" if r[6] != 'P':\n", | |
" continue\n", | |
" if r[8] == '':\n", | |
" without_country += 1\n", | |
" continue\n", | |
" if r[0] in names_ru:\n", | |
" localized += 1\n", | |
" names_ru[r[0]]\n", | |
" else:\n", | |
" not_localized += 1\n", | |
" \n", | |
" ro = [r[0], r[1], name_ru, r[4], r[5], r[8].lower(), r[15]]\n", | |
" pwriter.writerow(ro)\n", | |
"\n", | |
"print \"Localized:\", localized, \"Not localized:\", not_localized, \"Without country:\", without_country\n", | |
"print \"Last row\", ro" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Countries in multiple languages\n", | |
"Sample urls to get csv of countries: \n", | |
"\n", | |
"- http://api.geonames.org/countryInfoCSV?lang=en&username=demo\n", | |
"- http://api.geonames.org/countryInfoCSV?lang=ru&username=demo\n", | |
"\n", | |
"### Format:\n", | |
"- iso alpha2\t\n", | |
"- iso alpha3\t\n", | |
"- iso numeric\t\n", | |
"- fips code\t\n", | |
"- name\t\n", | |
"- capital\t\n", | |
"- areaInSqKm\t\n", | |
"- population\t\n", | |
"- continent\t\n", | |
"- languages\t\n", | |
"- currency\t\n", | |
"- geonameId" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 53, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"countries_en = []\n", | |
"countries_ru_h = {}" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 54, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# English countries\n", | |
"with open(wdir + \"countryInfoCSVen.txt\", 'rb') as encountriescsv: \n", | |
" enreader = csv.DictReader(encountriescsv, delimiter='\\t')\n", | |
" for r in enreader:\n", | |
" r['iso alpha2'] = r['iso alpha2'].lower()\n", | |
" countries_en.append(r)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 55, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"# Russian countries\n", | |
"with open(wdir + \"countryInfoCSVru.txt\", 'rb') as rucountriescsv: \n", | |
" rureader = csv.DictReader(rucountriescsv, delimiter='\\t')\n", | |
" for r in rureader:\n", | |
" countries_ru_h[r['iso alpha2'].lower()] = r['name']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 59, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"English country sample: {'geonameId': '3041565', 'name': 'Andorra', 'areaInSqKm': '468.0', 'iso numeric': '020', 'languages': 'ca', 'currency': 'EUR', 'capital': 'Andorra la Vella', 'iso alpha3': 'AND', 'iso alpha2': 'ad', 'continent': 'EU', 'fips code': 'AN', 'population': '84000'}\n", | |
"Russian country sample with 'ru': Россия\n" | |
] | |
} | |
], | |
"source": [ | |
"print \"English country sample:\", countries_en[0]\n", | |
"print \"Russian country sample with 'ru':\", countries_ru_h['ru']" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 69, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
" Country sample: {'name_ru': '\\xd0\\x97\\xd0\\xb8\\xd0\\xbc\\xd0\\xb1\\xd0\\xb0\\xd0\\xb1\\xd0\\xb2\\xd0\\xb5', 'name_en': 'Zimbabwe', 'code': 'zw', 'id': '878675', 'population': '11651858'}\n", | |
"Total countres: 246\n", | |
"Bad countries: 6 List: ['', 'Antarctica', 'Bouvet Island', '', 'Heard Island and McDonald Islands', 'U.S. Minor Outlying Islands']\n" | |
] | |
} | |
], | |
"source": [ | |
"out_country = {}\n", | |
"countries_count = 0\n", | |
"bad_countries = []\n", | |
"with open(wdir + \"out_countries.csv\", 'wb') as csvoutfile:\n", | |
" fieldnames = ['id', 'code', 'name_en','name_ru', 'population']\n", | |
" pwriter = csv.DictWriter(csvoutfile, dialect='excel', fieldnames=fieldnames)\n", | |
" pwriter.writeheader()\n", | |
" for c in countries_en:\n", | |
" if c['name'] == '' or c['population'] == '0':\n", | |
" bad_countries.append(c)\n", | |
" continue\n", | |
" out_country['id'] = c['geonameId']\n", | |
" out_country['code'] = c['iso alpha2']\n", | |
" out_country['name_en'] = c['name']\n", | |
" out_country['name_ru'] = countries_ru_h.get(c['iso alpha2'])\n", | |
" out_country['population'] = c['population']\n", | |
" countries_count += 1\n", | |
" pwriter.writerow(out_country)\n", | |
"\n", | |
"print \"Country sample:\", out_country\n", | |
"print \"Total countres:\", countries_count \n", | |
"print \"Bad countries:\", len(bad_countries), 'List:', map(lambda c: c['name'], bad_countries)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment