# Last active January 18, 2022 16:35
# Prepping osopennames for ML
# Build a spaCy patterns file (JSONL) from the OS Open Names CSV files.
#
# Prerequisite: extract the zipped csv files to ~/osopennames, then run this
# from the directory that contains osopennames (i.e. ~).
#
# What we're aiming for is one JSON object per place name, e.g.:
# {"label": "B-Geo", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}
#
# Files written inside osopennames/:
#   mergedwithoutheader.csv  - every DATA/*.csv concatenated
#   mergednameonly.csv       - just the NAME1 column (3rd field), quotes removed
#   mergednameonly.jsonl     - the patterns file (jsonl is what spacy is looking for)

#######################################
# Convert place names to pattern lines.
# Inputs:   stdin - one place name per line
# Outputs:  stdout - one JSON object per non-blank input line, with
#           a "LOWER" token per whitespace-separated word and an "id" made of
#           the lower-cased words joined with hyphens
# Returns:  exit status of the awk stage
#######################################
names_to_jsonl() {
  # Drop double quotes and double up backslashes first so that the text
  # interpolated into the JSON strings below cannot break the quoting.
  sed -e 's/"//g' -e 's/\\/\\\\/g' |
    awk -v label="B-Geo" '
      NF == 0 { next }    # blank line: there is no name to emit
      {
        tokens = ""
        id = ""
        # Default field splitting also collapses repeated/leading/trailing
        # whitespace, so every field is exactly one word of the name.
        for (i = 1; i <= NF; i++) {
          word = tolower($i)
          tokens = tokens (i > 1 ? ", " : "") "{\"LOWER\": \"" word "\"}"
          id = id (i > 1 ? "-" : "") word
        }
        printf "{\"label\": \"%s\", \"pattern\": [%s], \"id\": \"%s\"}\n", label, tokens, id
      }
    '
}

#######################################
# Run the whole preparation inside ./osopennames.
# Outputs:  first lines of the merged csv on stdout (sanity check)
# Returns:  0 on success, 1 as soon as any step fails
#######################################
main() {
  # Stop here if the directory is missing; otherwise the redirections below
  # would write their output files into the wrong directory.
  cd osopennames || return 1

  # merge all csvs into one; no header row is added by this step
  cat -- DATA/*.csv > mergedwithoutheader.csv || return 1

  # view top 5 or so lines to make sure it's what you expect
  head -n 5 mergedwithoutheader.csv

  # remove any " and strip it down to just the NAME1 column.
  # NOTE: this is a plain comma split, not a CSV parser, so a comma inside one
  # of the first three fields would shift the columns.
  tr -d '"' < mergedwithoutheader.csv |
    cut -d, -f3 > mergednameonly.csv || return 1

  # lower-case, tokenise and convert to json lines in a single pass
  names_to_jsonl < mergednameonly.csv > mergednameonly.jsonl || return 1
}

main "$@"
