kumekay/geonames.ipynb

## geonames.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "\n",
    "## Parse cities and countries from geonames.org"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "import sys\n",
    "\n",
    "csv.field_size_limit(sys.maxsize)\n",
    "\n",
    "wdir = '/Users/ku/data/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The table 'alternate names' :\n",
    "\n",
    "Source: http://download.geonames.org/export/dump/alternateNames.zip\n",
    "\n",
    "0. alternateNameId   : the id of this alternate name, int\n",
    "1. geonameid         : geonameId referring to id in table 'geoname', int\n",
    "2. isolanguage       : iso 639 language code 2- or 3-characters; 4-characters 'post' for postal codes and 'iata','icao' and faac for airport codes, fr_1793 for French Revolution names,  abbr for abbreviation, link for a website, varchar(7)\n",
    "3. alternate name    : alternate name or name variant, varchar(200)\n",
    "4. isPreferredName   : '1', if this alternate name is an official/preferred name\n",
    "5. isShortName       : '1', if this is a short name like 'California' for 'State of California'\n",
    "6. isColloquial      : '1', if this alternate name is a colloquial or slang term\n",
    "7. isHistoric        : '1', if this alternate name is historic and was used in the past\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "names_ru = {}\n",
    "with open(wdir + \"alternateNames.txt\", 'rb') as csvfile:    \n",
    "    idreader = csv.reader(csvfile, delimiter='\\t')\n",
    "    for r in idreader:\n",
    "        if r[2] == \"ru\":\n",
    "            names_ru[r[1]]=r[3]\n",
    "        "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The main 'geoname' table has the following fields :\n",
    "\n",
    "Source: http://download.geonames.org/export/dump/allCountries.zip\n",
    "\n",
    "0. geonameid         | integer id of record in geonames database |\n",
    "1. name              | name of geographical point (utf8) varchar(200)\n",
    "2. asciiname         | name of geographical point in plain ascii characters, varchar(200)\n",
    "3. alternatenames    | alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)\n",
    "4. latitude          | latitude in decimal degrees (wgs84)\n",
    "5. longitude         | longitude in decimal degrees (wgs84)\n",
    "6. feature class     | see http://www.geonames.org/export/codes.html, char(1)\n",
    "7. feature code      | see http://www.geonames.org/export/codes.html, varchar(10)\n",
    "8. country code      | ISO-3166 2-letter country code, 2 characters\n",
    "9. cc2               | alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters\n",
    "10. admin1 code       | fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)\n",
    "11. admin2 code       | code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) \n",
    "12. admin3 code       | code for third level administrative division, varchar(20)\n",
    "13. admin4 code       | code for fourth level administrative division, varchar(20)\n",
    "14. population        | bigint (8 byte int) \n",
    "15. elevation         | in meters, integer\n",
    "16. dem               | digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.\n",
    "17. timezone          | the timezone id (see file timeZone.txt) varchar(40)\n",
    "18. modification date | date of last modification in yyyy-MM-dd format"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Localized: 238621 Not localized: 3471962 Without country: 0\n",
      "Last row ['8534443', 'Nhlangano', None, '-19.85776', '27.82236', 'zw', '']\n"
     ]
    }
   ],
   "source": [
    "not_localized = 0\n",
    "localized = 0\n",
    "without_country = 0\n",
    "ro = []\n",
    "with open(wdir + \"allCountries.txt\", 'rb') as csvfile:    \n",
    "    with open(wdir + \"out_cities.csv\", 'wb') as csvoutfile:\n",
    "        pwriter = csv.writer(csvoutfile, dialect='excel')\n",
    "        idreader = csv.reader(csvfile, delimiter='\\t')\n",
    "        pwriter.writerow(['id','name_en', 'name_ru', 'latitude', 'longitude', 'code', 'population'])\n",
    "        for r in idreader:\n",
    "            name_ru = None\n",
    "            if r[6] != 'P':\n",
    "                continue\n",
    "            if r[8] == '':\n",
    "                without_country += 1\n",
    "                continue\n",
    "            if r[0] in names_ru:\n",
    "                localized += 1\n",
    "                names_ru[r[0]]\n",
    "            else:\n",
    "                not_localized += 1\n",
    "                \n",
    "            ro = [r[0], r[1], name_ru, r[4], r[5], r[8].lower(), r[15]]\n",
    "            pwriter.writerow(ro)\n",
    "\n",
    "print \"Localized:\", localized, \"Not localized:\", not_localized, \"Without country:\", without_country\n",
    "print \"Last row\", ro"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Countries in multiple languages\n",
    "Sample urls to get csv of countries: \n",
    "\n",
    "- http://api.geonames.org/countryInfoCSV?lang=en&username=demo\n",
    "- http://api.geonames.org/countryInfoCSV?lang=ru&username=demo\n",
    "\n",
    "### Format:\n",
    "- iso alpha2\t\n",
    "- iso alpha3\t\n",
    "- iso numeric\t\n",
    "- fips code\t\n",
    "- name\t\n",
    "- capital\t\n",
    "- areaInSqKm\t\n",
    "- population\t\n",
    "- continent\t\n",
    "- languages\t\n",
    "- currency\t\n",
    "- geonameId"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "countries_en = []\n",
    "countries_ru_h = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# English countries\n",
    "with open(wdir + \"countryInfoCSVen.txt\", 'rb') as encountriescsv:    \n",
    "    enreader = csv.DictReader(encountriescsv, delimiter='\\t')\n",
    "    for r in enreader:\n",
    "        r['iso alpha2'] = r['iso alpha2'].lower()\n",
    "        countries_en.append(r)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Russian countries\n",
    "with open(wdir + \"countryInfoCSVru.txt\", 'rb') as rucountriescsv:    \n",
    "    rureader = csv.DictReader(rucountriescsv, delimiter='\\t')\n",
    "    for r in rureader:\n",
    "        countries_ru_h[r['iso alpha2'].lower()] = r['name']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "English country sample: {'geonameId': '3041565', 'name': 'Andorra', 'areaInSqKm': '468.0', 'iso numeric': '020', 'languages': 'ca', 'currency': 'EUR', 'capital': 'Andorra la Vella', 'iso alpha3': 'AND', 'iso alpha2': 'ad', 'continent': 'EU', 'fips code': 'AN', 'population': '84000'}\n",
      "Russian country sample with 'ru': Россия\n"
     ]
    }
   ],
   "source": [
    "print \"English country sample:\", countries_en[0]\n",
    "print \"Russian country sample with 'ru':\", countries_ru_h['ru']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Country sample: {'name_ru': '\\xd0\\x97\\xd0\\xb8\\xd0\\xbc\\xd0\\xb1\\xd0\\xb0\\xd0\\xb1\\xd0\\xb2\\xd0\\xb5', 'name_en': 'Zimbabwe', 'code': 'zw', 'id': '878675', 'population': '11651858'}\n",
      "Total countres: 246\n",
      "Bad countries: 6 List: ['', 'Antarctica', 'Bouvet Island', '', 'Heard Island and McDonald Islands', 'U.S. Minor Outlying Islands']\n"
     ]
    }
   ],
   "source": [
    "out_country = {}\n",
    "countries_count = 0\n",
    "bad_countries = []\n",
    "with open(wdir + \"out_countries.csv\", 'wb') as csvoutfile:\n",
    "        fieldnames = ['id', 'code', 'name_en','name_ru', 'population']\n",
    "        pwriter = csv.DictWriter(csvoutfile, dialect='excel', fieldnames=fieldnames)\n",
    "        pwriter.writeheader()\n",
    "        for c in countries_en:\n",
    "                if c['name'] == '' or c['population'] == '0':\n",
    "                    bad_countries.append(c)\n",
    "                    continue\n",
    "                out_country['id'] = c['geonameId']\n",
    "                out_country['code'] = c['iso alpha2']\n",
    "                out_country['name_en'] = c['name']\n",
    "                out_country['name_ru'] = countries_ru_h.get(c['iso alpha2'])\n",
    "                out_country['population'] = c['population']\n",
    "                countries_count += 1\n",
    "                pwriter.writerow(out_country)\n",
    "\n",
    "print \"Country sample:\", out_country\n",
    "print \"Total countres:\", countries_count \n",
    "print \"Bad countries:\", len(bad_countries), 'List:', map(lambda c: c['name'], bad_countries)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"collapsed": true
	},
	"source": [
	"\n",
	"## Parse cities and countries from geonames.org"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 11,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import csv\n",
	"import sys\n",
	"\n",
	"csv.field_size_limit(sys.maxsize)\n",
	"\n",
	"wdir = '/Users/ku/data/'"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## The table 'alternate names' :\n",
	"\n",
	"Source: http://download.geonames.org/export/dump/alternateNames.zip\n",
	"\n",
	"0. alternateNameId : the id of this alternate name, int\n",
	"1. geonameid : geonameId referring to id in table 'geoname', int\n",
	"2. isolanguage : iso 639 language code 2- or 3-characters; 4-characters 'post' for postal codes and 'iata','icao' and faac for airport codes, fr_1793 for French Revolution names, abbr for abbreviation, link for a website, varchar(7)\n",
	"3. alternate name : alternate name or name variant, varchar(200)\n",
	"4. isPreferredName : '1', if this alternate name is an official/preferred name\n",
	"5. isShortName : '1', if this is a short name like 'California' for 'State of California'\n",
	"6. isColloquial : '1', if this alternate name is a colloquial or slang term\n",
	"7. isHistoric : '1', if this alternate name is historic and was used in the past\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 21,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"names_ru = {}\n",
	"with open(wdir + \"alternateNames.txt\", 'rb') as csvfile: \n",
	" idreader = csv.reader(csvfile, delimiter='\\t')\n",
	" for r in idreader:\n",
	" if r[2] == \"ru\":\n",
	" names_ru[r[1]]=r[3]\n",
	" "
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## The main 'geoname' table has the following fields :\n",
	"\n",
	"Source: http://download.geonames.org/export/dump/allCountries.zip\n",
	"\n",
	"0. geonameid \| integer id of record in geonames database \|\n",
	"1. name \| name of geographical point (utf8) varchar(200)\n",
	"2. asciiname \| name of geographical point in plain ascii characters, varchar(200)\n",
	"3. alternatenames \| alternatenames, comma separated, ascii names automatically transliterated, convenience attribute from alternatename table, varchar(10000)\n",
	"4. latitude \| latitude in decimal degrees (wgs84)\n",
	"5. longitude \| longitude in decimal degrees (wgs84)\n",
	"6. feature class \| see http://www.geonames.org/export/codes.html, char(1)\n",
	"7. feature code \| see http://www.geonames.org/export/codes.html, varchar(10)\n",
	"8. country code \| ISO-3166 2-letter country code, 2 characters\n",
	"9. cc2 \| alternate country codes, comma separated, ISO-3166 2-letter country code, 200 characters\n",
	"10. admin1 code \| fipscode (subject to change to iso code), see exceptions below, see file admin1Codes.txt for display names of this code; varchar(20)\n",
	"11. admin2 code \| code for the second administrative division, a county in the US, see file admin2Codes.txt; varchar(80) \n",
	"12. admin3 code \| code for third level administrative division, varchar(20)\n",
	"13. admin4 code \| code for fourth level administrative division, varchar(20)\n",
	"14. population \| bigint (8 byte int) \n",
	"15. elevation \| in meters, integer\n",
	"16. dem \| digital elevation model, srtm3 or gtopo30, average elevation of 3''x3'' (ca 90mx90m) or 30''x30'' (ca 900mx900m) area in meters, integer. srtm processed by cgiar/ciat.\n",
	"17. timezone \| the timezone id (see file timeZone.txt) varchar(40)\n",
	"18. modification date \| date of last modification in yyyy-MM-dd format"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 30,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Localized: 238621 Not localized: 3471962 Without country: 0\n",
	"Last row ['8534443', 'Nhlangano', None, '-19.85776', '27.82236', 'zw', '']\n"
	]
	}
	],
	"source": [
	"not_localized = 0\n",
	"localized = 0\n",
	"without_country = 0\n",
	"ro = []\n",
	"with open(wdir + \"allCountries.txt\", 'rb') as csvfile: \n",
	" with open(wdir + \"out_cities.csv\", 'wb') as csvoutfile:\n",
	" pwriter = csv.writer(csvoutfile, dialect='excel')\n",
	" idreader = csv.reader(csvfile, delimiter='\\t')\n",
	" pwriter.writerow(['id','name_en', 'name_ru', 'latitude', 'longitude', 'code', 'population'])\n",
	" for r in idreader:\n",
	" name_ru = None\n",
	" if r[6] != 'P':\n",
	" continue\n",
	" if r[8] == '':\n",
	" without_country += 1\n",
	" continue\n",
	" if r[0] in names_ru:\n",
	" localized += 1\n",
	" names_ru[r[0]]\n",
	" else:\n",
	" not_localized += 1\n",
	" \n",
	" ro = [r[0], r[1], name_ru, r[4], r[5], r[8].lower(), r[15]]\n",
	" pwriter.writerow(ro)\n",
	"\n",
	"print \"Localized:\", localized, \"Not localized:\", not_localized, \"Without country:\", without_country\n",
	"print \"Last row\", ro"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Countries in multiple languages\n",
	"Sample urls to get csv of countries: \n",
	"\n",
	"- http://api.geonames.org/countryInfoCSV?lang=en&username=demo\n",
	"- http://api.geonames.org/countryInfoCSV?lang=ru&username=demo\n",
	"\n",
	"### Format:\n",
	"- iso alpha2\t\n",
	"- iso alpha3\t\n",
	"- iso numeric\t\n",
	"- fips code\t\n",
	"- name\t\n",
	"- capital\t\n",
	"- areaInSqKm\t\n",
	"- population\t\n",
	"- continent\t\n",
	"- languages\t\n",
	"- currency\t\n",
	"- geonameId"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 53,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"countries_en = []\n",
	"countries_ru_h = {}"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 54,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# English countries\n",
	"with open(wdir + \"countryInfoCSVen.txt\", 'rb') as encountriescsv: \n",
	" enreader = csv.DictReader(encountriescsv, delimiter='\\t')\n",
	" for r in enreader:\n",
	" r['iso alpha2'] = r['iso alpha2'].lower()\n",
	" countries_en.append(r)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 55,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Russian countries\n",
	"with open(wdir + \"countryInfoCSVru.txt\", 'rb') as rucountriescsv: \n",
	" rureader = csv.DictReader(rucountriescsv, delimiter='\\t')\n",
	" for r in rureader:\n",
	" countries_ru_h[r['iso alpha2'].lower()] = r['name']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 59,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"English country sample: {'geonameId': '3041565', 'name': 'Andorra', 'areaInSqKm': '468.0', 'iso numeric': '020', 'languages': 'ca', 'currency': 'EUR', 'capital': 'Andorra la Vella', 'iso alpha3': 'AND', 'iso alpha2': 'ad', 'continent': 'EU', 'fips code': 'AN', 'population': '84000'}\n",
	"Russian country sample with 'ru': Россия\n"
	]
	}
	],
	"source": [
	"print \"English country sample:\", countries_en[0]\n",
	"print \"Russian country sample with 'ru':\", countries_ru_h['ru']"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 69,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" Country sample: {'name_ru': '\\xd0\\x97\\xd0\\xb8\\xd0\\xbc\\xd0\\xb1\\xd0\\xb0\\xd0\\xb1\\xd0\\xb2\\xd0\\xb5', 'name_en': 'Zimbabwe', 'code': 'zw', 'id': '878675', 'population': '11651858'}\n",
	"Total countres: 246\n",
	"Bad countries: 6 List: ['', 'Antarctica', 'Bouvet Island', '', 'Heard Island and McDonald Islands', 'U.S. Minor Outlying Islands']\n"
	]
	}
	],
	"source": [
	"out_country = {}\n",
	"countries_count = 0\n",
	"bad_countries = []\n",
	"with open(wdir + \"out_countries.csv\", 'wb') as csvoutfile:\n",
	" fieldnames = ['id', 'code', 'name_en','name_ru', 'population']\n",
	" pwriter = csv.DictWriter(csvoutfile, dialect='excel', fieldnames=fieldnames)\n",
	" pwriter.writeheader()\n",
	" for c in countries_en:\n",
	" if c['name'] == '' or c['population'] == '0':\n",
	" bad_countries.append(c)\n",
	" continue\n",
	" out_country['id'] = c['geonameId']\n",
	" out_country['code'] = c['iso alpha2']\n",
	" out_country['name_en'] = c['name']\n",
	" out_country['name_ru'] = countries_ru_h.get(c['iso alpha2'])\n",
	" out_country['population'] = c['population']\n",
	" countries_count += 1\n",
	" pwriter.writerow(out_country)\n",
	"\n",
	"print \"Country sample:\", out_country\n",
	"print \"Total countres:\", countries_count \n",
	"print \"Bad countries:\", len(bad_countries), 'List:', map(lambda c: c['name'], bad_countries)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.10"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}