Created
January 22, 2013 07:27
-
-
Save fiorix/4592774 to your computer and use it in GitHub Desktop.
A list of all cities in the world.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding: utf-8 | |
# | |
# The World Gazetteer provides a downloadable file that contains a list
# of all cities, towns, administrative divisions and agglomerations with
# their population, their English name and their parent country.
# | |
# Article: http://answers.google.com/answers/threadview/id/774429.html | |
# Download: http://www.world-gazetteer.com/dataen.zip | |
import codecs | |
# can't just iterate over the fd as there are many lines with | |
# carriage returns in the middle of the line and things break. | |
def rows(fd):
    """Yield ``(n, fields)`` for every newline-terminated record read from *fd*.

    Records are split on ``"\n"`` only.  The data contains stray carriage
    returns in the middle of records, so iterating over the file object
    (which may also break lines on ``"\r"``) would cut records apart.
    Every ``"\r"`` is removed from a record before it is split on tabs.

    *fd* is a text-mode file-like object with a ``read(size)`` method.
    Chunks that are not already the interpreter's native ``str`` type
    (``unicode`` on Python 2) are encoded to UTF-8, so on Python 2 the
    fields are UTF-8 byte strings and on Python 3 they are text.

    *n* is the 1-based record number.  A final record that is not
    terminated by a newline is yielded as well, unless it is empty once
    carriage returns are removed.
    """
    n = 1
    rest = ""
    while True:
        chunk = fd.read(1024)
        if not chunk:
            break
        if not isinstance(chunk, str):
            chunk = chunk.encode("utf-8")
        # Always carry the unfinished tail forward, including when this
        # chunk holds no newline at all; otherwise records longer than one
        # read would silently lose data.
        rest += chunk
        pieces = rest.split("\n")
        rest = pieces.pop()  # text after the last newline (possibly empty)
        for piece in pieces:
            yield n, piece.replace("\r", "").split("\t")
            n += 1
    # Flush a trailing record that has no terminating newline.
    tail = rest.replace("\r", "")
    if tail:
        yield n, tail.split("\t")
if __name__ == "__main__":
    # Print the sorted, de-duplicated "Country,Region,City" entries found
    # in dataen.txt (the file extracted from the World Gazetteer download).
    cities = set()
    # The with-block closes the data file even if a record fails to parse.
    with codecs.open("dataen.txt", "r", "utf-8") as fd:
        for _, row in rows(fd):
            # Blank or truncated records lack the columns indexed below;
            # skip them instead of dying with an IndexError.
            if len(row) < 10:
                continue
            columns = [row[8], row[9], row[1]]  # Country,Region,City
            # Keep only entries where all three fields are non-empty.
            if all(columns):
                cities.add(",".join(columns))
    for line in sorted(cities):
        # Parenthesised single-argument form behaves the same on Python 2
        # and is also valid Python 3.
        print(line)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The URL http://www.world-gazetteer.com/dataen.zip died