Created
February 10, 2017 17:19
-
-
Save pcurylo/2e044a3dfa6ee8b076147546081bef78 to your computer and use it in GitHub Desktop.
Country Codes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Acquires and cleans country codes: downloads the worldatlas.com country-code
# page, extracts its HTML table and prints it on stdout as tab-separated text,
# one table row per line.
#
# Usage: $0 [URL]
#   URL  page to fetch (default: the worldatlas.com country-code page).
#
# Requires curl, GNU grep and GNU sed. The sed expressions rely on the GNU
# extensions \s, \b, \?, \|, the I (ignore case) flag and \n / \t in
# replacement text.
#
# Exit status: 0 on success; non-zero when the download fails or the page
# contains no <table>...</table>.

# No `set -e`: failures are detected through pipefail and reported in main.
set -u -o pipefail

readonly DEFAULT_URL='http://www.worldatlas.com/aatlas/ctycodes.htm'

#######################################
# Converts the HTML table(s) of a page into tab-separated rows.
# Everything from the first "<table" to the last "</table>" is treated as one
# table. Inline markup inside cells (anything other than the table, thead,
# tbody, tfoot, tr, td and th tags) is passed through unchanged.
# Arguments: none
# Inputs:    HTML document on stdin
# Outputs:   one line per table row on stdout, cells separated by tabs
# Returns:   0 on success, non-zero if no table was found
#######################################
html_table_to_tsv() {
  # LC_ALL=C: byte-wise, locale-independent matching in every stage.
  #
  # Stage 1: remove all newlines and carriage returns so the whole page is a
  #          single line and the table can be matched in one go. Deleting
  #          them directly (rather than swapping them for a placeholder
  #          character and back) keeps literal "|" characters in the data.
  LC_ALL=C tr -d '\r\n' |
    # Stage 2: extract the table. -i matches <TABLE> as well as <table>,
    #          like the case-insensitive sed expressions below; -a forces
    #          text mode.
    LC_ALL=C grep -aio '<table.*</table>' |
    # Stage 3: works on the whole table as one line.
    LC_ALL=C sed '
      # trim whitespace that directly follows a tag, inside an element
      s/\(<[^>]*>\)\s*\([^>]*\)\s*\(<\/[^>]*>\)/\1\2\3/Ig
      # trim whitespace on both sides of every table-structure tag, so cell
      # text is clean and the last cell end tag of a row sits at end of line
      s/\s*\(<\/\?\(TABLE\|THEAD\|TBODY\|TFOOT\|TR\|T[DH]\)\b[^>]*>\)\s*/\1/Ig
      # linebreak the rows
      s/<\/TR\b[^>]*>/\n/Ig
      # remove the remaining structure tags, except for th/td; the leading
      # "<" is mandatory so plain text containing "tr" or "table" is kept
      s/<\/\?\(TABLE\|THEAD\|TBODY\|TFOOT\|TR\)\b[^>]*>//Ig
    ' |
    # Stage 4: works row by row, so "$" means the end of a row here.
    LC_ALL=C sed '
      # remove th/td start tags, and a th/td end tag at the end of the line
      s/<T[DH][^>]*>\|<\/\?T[DH][^>]*>$//Ig
      # break columns by tab char
      s/<\/T[DH][^>]*>/\t/Ig
      # drop empty lines, e.g. what is left after the last row
      /^$/d
    '
}

#######################################
# Downloads the page and prints its table as TSV.
# Globals:   DEFAULT_URL (read)
# Arguments: $1 - URL to fetch (optional, defaults to DEFAULT_URL)
# Outputs:   TSV on stdout, diagnostics on stderr
# Returns:   0 on success, 1 if the download or the table extraction failed
#######################################
main() {
  local url=${1:-$DEFAULT_URL}

  # get data
  #   --fail                 HTTP errors fail instead of feeding an error page
  #                          to the parser
  #   --silent --show-error  no progress meter, but keep error messages
  #   --location             follow redirects
  #   timeouts               do not hang forever on a dead connection
  if ! curl --fail --silent --show-error --location \
    --connect-timeout 5 --max-time 60 --url "$url" |
    html_table_to_tsv; then
    printf '%s: could not download %s or found no table in it\n' \
      "${0##*/}" "$url" >&2
    return 1
  fi
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment