Skip to content

Instantly share code, notes, and snippets.

@harmy
Last active September 6, 2018 03:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harmy/7ecd6cfdfae939db7e2781e8aa302559 to your computer and use it in GitHub Desktop.
Save harmy/7ecd6cfdfae939db7e2781e8aa302559 to your computer and use it in GitHub Desktop.
bulk load geoname.csv into ElasticSearch
#!/usr/bin/env python3
import sys
import time
import csv
import json
from collections import namedtuple
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
# Target cluster (host:port), index name and document type for the import.
ES_ENDPOINT = 'vpc-xxxxxxxxxxxx.us-east-1.es.amazonaws.com:80'
ES_INDEX = 'geonames'
ES_TYPE = 'doc'

# Mapping body keyed by the document type: the "location" property is
# declared as a geo_point.
_LOCATION_PROPERTY = {"type": "geo_point"}
ES_MAPPING = {
    ES_TYPE: {
        "properties": {
            "location": _LOCATION_PROPERTY,
        },
    },
}
def _to_int(value):
    """Convert a CSV text field to ``int``; return ``None`` when it is blank."""
    text = value.strip()
    return int(text) if text else None


def get_geoname_data(filename):
    """Yield one bulk-index action per valid row of a geoname CSV file.

    The file is read as UTF-8 with the ``csv`` module's default dialect.
    Rows whose column count differs from the expected record layout are
    skipped silently.

    Args:
        filename: Path of the CSV file to read.

    Yields:
        dict: An action with ``_index``, ``_type``, ``_id`` and ``_source``
        keys. In ``_source``, ``location`` is the string
        ``"<latitude>,<longitude>"``, and ``dem``, ``elevation`` and
        ``population`` are ints, or ``None`` when the field is blank.

    Raises:
        ValueError: If a numeric field is non-blank but not a valid integer.
        OSError: If the file cannot be opened.
    """
    GeonameRecord = namedtuple(
        'GeonameRecord',
        'id name ascii_name alternate_names latitude longitude '
        'feature_class feature_code country_code cc2 '
        'admin1_code admin2_code admin3_code admin4_code '
        'population elevation dem timezone modified_at',
    )
    expected_columns = len(GeonameRecord._fields)

    # newline='' is what the csv module asks for so that newlines embedded
    # in quoted fields are interpreted correctly.
    with open(filename, "r", encoding="utf-8", newline="") as geoname_records:
        for geoname_record in csv.reader(geoname_records):
            if len(geoname_record) != expected_columns:
                continue  # not a valid row
            geoname = GeonameRecord(*geoname_record)
            yield {
                '_index': ES_INDEX,
                '_type': ES_TYPE,
                '_id': geoname.id,
                '_source': {
                    "admin1_code": geoname.admin1_code,
                    "ascii_name": geoname.ascii_name,
                    "country_code": geoname.country_code,
                    # Blank numeric fields become None instead of aborting
                    # the whole load with a ValueError.
                    "dem": _to_int(geoname.dem),
                    "elevation": _to_int(geoname.elevation),
                    "feature_class": geoname.feature_class,
                    "feature_code": geoname.feature_code,
                    "id": geoname.id,
                    "location": ','.join([geoname.latitude, geoname.longitude]),
                    "modified_at": geoname.modified_at,
                    "name": geoname.name,
                    "population": _to_int(geoname.population),
                    "timezone": geoname.timezone,
                },
            }
if __name__ == '__main__':
    # Recreate the index from scratch, apply the mapping, then bulk-load
    # every valid row of ./geoname.csv.
    es = Elasticsearch(hosts=[ES_ENDPOINT], timeout=5000)
    # ignore=[404]: on a first run the index does not exist yet, and an
    # unconditional delete would raise NotFoundError before anything loads.
    es.indices.delete(index=ES_INDEX, ignore=[404])
    es.indices.create(index=ES_INDEX)
    es.indices.put_mapping(index=ES_INDEX, doc_type=ES_TYPE, body=ES_MAPPING)
    # bulk() returns (number of successful actions, errors); only the count
    # is reported.
    count, _ = bulk(es, get_geoname_data('./geoname.csv'))
    print('Successfully load {0} items into ES.'.format(count))
3039077 Sispony Sispony Sispony 42.53368 1.51613 P PPL AD 04 0 0 1315 Europe/Andorra 2018-09-04 08:24:47.126072+00
3039154 El Tarter El Tarter Ehl Tarter,Эл Тартер 42.57952 1.65362 P PPL AD 02 1052 0 1721 Europe/Andorra 2018-09-04 08:24:47.129741+00
3039163 Sant Julià de Lòria Sant Julia de Loria San Julia,San Julià,Sant Julia de Loria,Sant Julià de Lòria,Sant-Zhulija-de-Lorija,sheng hu li ya-de luo li ya,Сант-Жулия-де-Лория,サン・ジュリア・デ・ロリア教区,圣胡利娅-德洛里亚,圣胡利娅-德洛里亚 42.46372 1.49129 P PPLA AD 06 8022 0 921 Europe/Andorra 2018-09-04 08:24:47.131937+00
3039181 Santa Coloma Santa Coloma Santa Coloma 42.49454 1.49897 P PPL AD 07 0 0 978 Europe/Andorra 2018-09-04 08:24:47.13408+00
3039604 Pas de la Casa Pas de la Casa Pas de la Kasa,Пас де ла Каса 42.54277 1.73361 P PPL AD 03 2363 2050 2106 Europe/Andorra 2018-09-04 08:24:47.136234+00
3039678 Ordino Ordino Ordino,ao er di nuo,orudino jiao qu,Ордино,オルディノ教区,奥尔迪诺 42.55623 1.53319 P PPLA AD 05 3066 0 1296 Europe/Andorra 2018-09-04 08:24:47.138404+00
3039862 Meritxell Meritxell Sanctuaire de Meritxeli,Sanctuaire de Meritxell,Santuari de Meritxell 42.55403 1.59087 P PPL AD AD 02 0 0 1479 Europe/Andorra 2018-09-04 08:24:47.140615+00
3040051 les Escaldes les Escaldes Ehskal'des-Ehndzhordani,Escaldes,Escaldes-Engordany,Les Escaldes,esukarudesu=engorudani jiao qu,lai sai si ka er de-en ge er da,Эскальдес-Энджордани,エスカルデス=エンゴルダニ教区,萊塞斯卡爾德-恩戈爾達,萊塞斯卡爾德-恩戈爾達 42.50729 1.53414 P PPLA AD 08 15853 0 1033 Europe/Andorra 2018-09-04 08:24:47.142784+00
3040067 Les Bons Les Bons Els Bons 42.53873 1.58649 P PPL AD AD 03 0 0 1299 Europe/Andorra 2018-09-04 08:24:47.145036+00
3040132 la Massana la Massana La Macana,La Massana,La Maçana,La-Massana,la Massana,ma sa na,Ла-Массана,ラ・マサナ教区,马萨纳 42.54499 1.51483 P PPLA AD 04 7211 0 1245 Europe/Andorra 2018-09-04 08:24:47.147197+00
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment