Last active
January 13, 2022 17:13
-
-
Save archaeogeek/dd413568fbf7aabedc8ff101d6a4dc5b to your computer and use it in GitHub Desktop.
Processing OS Open Names with just Linux CLI tools
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract the zipped CSV download to ~/osopennames, then work from there.
#
# merge_with_header HEADER_FILE DATA_FILE...
#   Writes HEADER_FILE once, followed by every DATA_FILE in argument order,
#   to stdout. (Prepending the header inside a per-file loop would repeat the
#   header row in front of every data file.)
merge_with_header() {
  local header_file=$1
  shift
  cat -- "$header_file" "$@"
}

# Merge all CSVs into one, with the header first, then view the top 5 or so
# lines to make sure it's what you expect.
# The steps are chained with && so nothing is written if the directory is
# missing or the merge fails.
cd ~/osopennames &&
  merge_with_header DOC/OS_Open_Names_Header.csv DATA/*.csv > mergedwithheader.csv &&
  head -n 5 mergedwithheader.csv
# Reshape the merged OS Open Names CSV to match the geonames mapping:
#   - take only the columns we want (input fields 1,3,5,7,8,9,10,22,25,28,30)
#   - add the missing static values
#   - add the correct column headings
#
# to_geonames [CHANGEDATE]
#   stdin:      merged CSV, comma-separated
#   stdout:     CSV in the geonames column layout
#   CHANGEDATE: value written to the last column (default: 01-04-21)
#
# Everything happens in one awk program reading stdin: `cut` always emits
# fields in input order regardless of how the -f list is written, and a
# trailing `sed ... file` stage would read the file instead of the pipe.
#
# A header row (ID in column 1, NAME1 in column 3) is replaced by the geonames
# headings the first time it is seen and dropped if it shows up again.
#
# NOTE: fields are split on every comma with no CSV quote handling, so a
# quoted value that itself contains a comma would shift the columns.
to_geonames() {
  awk -F, -v OFS=, -v changedate="${1:-01-04-21}" '
    $1 == "ID" && $3 == "NAME1" {
      if (!header_written++) {
        print "ID,NAME1,NAME2,ASCIINAME,TYPE,LOCAL_TYPE,GEOMETRY_X,GEOMETRY_Y,COUNTRY_CODE2,COUNTRY_CODE3,CC2,DISTRICT_BOROUGH,COUNTY_UNITARY,REGION,COUNTRY,POP,ELEV,DEM,TIMEZONE,CHANGEDATE"
      }
      next
    }
    {
      # Empty strings are the columns the source data has no value for.
      print $1, $3, $5, "", $7, $8, $9, $10, "GBR", "", "GB", \
            $22, $25, $28, $30, "", "", "", "GMT", changedate
    }
  '
}

to_geonames < mergedwithheader.csv > geonames.csv
# Create the "geonames" index with its mapping, using
# https://github.com/openeventdata/es-geonames/blob/master/geonames_mapping.json
curl \
  --user user:pass \
  --request PUT \
  --header 'Content-Type: application/json' \
  --data @geonames_mapping.json \
  'https://yourelasticsearchurl/geonames'

# Load the CSV with elasticsearch_loader: https://github.com/Moshe/elasticsearch_loader
# (create a virtualenv and pip install elasticsearch_loader; if you get errors
# about elasticsearch compatibility, pip install elasticsearch==7.13.1)
elasticsearch_loader \
  --es-host https://yoursecureurl.bonsaisearch.net:443 \
  --verify-certs \
  --http-auth user:pass \
  --timeout 60 \
  --index geonames \
  csv geonames.csv
# Compare the number of documents in the index with the number of data rows
# in the CSV.
# /geonames/_count is an Elasticsearch endpoint, not a shell command, so it is
# requested with curl (same placeholder URL and credentials as the index
# creation step).
curl --user user:pass 'https://yourelasticsearchurl/geonames/_count'
# Skip the first line of geonames.csv: it holds the column headings, which are
# presumably not loaded as a document (confirm against your loader's output).
tail -n +2 geonames.csv | wc -l
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment