Last active
January 18, 2022 16:35
-
-
Save archaeogeek/d7a3c20a12147bab38e1c0e75f56c1f1 to your computer and use it in GitHub Desktop.
Prepping osopennames for ML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prepare OS Open Names place names as a spaCy pattern file (JSONL).
#
# Usage: bash <this-script> [WORKDIR]
#   WORKDIR  directory holding the extracted download; defaults to
#            "osopennames". Its DATA/ subdirectory must contain the CSVs.
#
# Before running: extract the zipped csv download to ~/osopennames and run
# this from the directory that contains it (or pass the path as WORKDIR).
#
# Files written inside WORKDIR:
#   mergedwithoutheader.csv       every DATA/*.csv concatenated (no header row)
#   mergednameonly.csv            third column only, double quotes removed
#   mergednameonlylowercase.csv   the same names, lower-cased
#   mergednameonly.jsonl          one pattern object per name
#
# Each output line looks like:
#   {"label": "B-Geo", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}
#
# NOTE(review): the label is "B-Geo" because that is what the original
# commands emitted; the original example comment showed "GPE". Confirm which
# label the downstream model expects.
# NOTE(review): assumes the name (NAME1) is the third comma-separated column
# and that no name contains a comma, a backslash or a control character --
# no CSV-aware parsing or JSON escaping is done. TODO confirm against the data.

# No `set -e`: every step that matters is checked explicitly so that a
# failure returns from main with a message instead of aborting silently.
set -u
set -o pipefail

#######################################
# Convert lower-cased place names into pattern lines.
# Inputs:  stdin  - one name per line
# Outputs: stdout - one JSON object per non-empty name
# The name is split on runs of spaces/tabs into one {"LOWER": ...} token
# each; the id is the same name with those runs replaced by a hyphen.
#######################################
names_to_jsonl() {
  awk '
    {
      # Trim surrounding blanks and any carriage return from CRLF input.
      gsub(/^[ \t]+|[ \t\r]+$/, "")
      if ($0 == "") next

      count = split($0, token, /[ \t]+/)

      id = $0
      gsub(/[ \t]+/, "-", id)

      pattern = ""
      sep = ""
      for (i = 1; i <= count; i++) {
        pattern = pattern sep "{\"LOWER\": \"" token[i] "\"}"
        sep = ", "
      }

      printf "{\"label\": \"B-Geo\", \"pattern\": [%s], \"id\": \"%s\"}\n", pattern, id
    }
  '
}

#######################################
# Run the whole preparation pipeline.
# Arguments: $1 - working directory (optional, default "osopennames")
# Outputs:   first 5 merged lines on stdout as a sanity check;
#            diagnostics on stderr; result files in the working directory
# Returns:   0 on success, 1 if the directory or its CSVs are missing or a
#            step fails
#######################################
main() {
  local workdir=${1:-osopennames}

  cd -- "$workdir" || {
    printf 'error: cannot cd to %s\n' "$workdir" >&2
    return 1
  }

  # Without nullglob an unmatched glob stays literal, so -e detects "no files".
  local -a csv_files=(DATA/*.csv)
  if [[ ! -e "${csv_files[0]}" ]]; then
    printf 'error: no CSV files found in %s/DATA\n' "$workdir" >&2
    return 1
  fi

  # Merge all csvs into one. `awk 1` (rather than cat) guarantees each file
  # ends with a newline, so the last record of one file is never glued to
  # the first record of the next.
  awk 1 "${csv_files[@]}" > mergedwithoutheader.csv || return 1

  # View the top 5 lines to make sure it is what you expect.
  head -5 mergedwithoutheader.csv

  # Remove every double quote (previously a manual text-editor step) and
  # strip the data down to just the name column. -s drops lines that
  # contain no comma at all.
  tr -d '"' < mergedwithoutheader.csv \
    | cut -s -d, -f3 > mergednameonly.csv || return 1

  # Convert to lower case.
  tr '[:upper:]' '[:lower:]' < mergednameonly.csv \
    > mergednameonlylowercase.csv || return 1

  # Convert to JSON lines; the .jsonl extension is what spaCy is looking for.
  names_to_jsonl < mergednameonlylowercase.csv > mergednameonly.jsonl \
    || return 1
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment