Created
March 23, 2015 19:57
-
-
Save futurulus/38307d98992e7fdeec0d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# Reformat as [-]token[-]|||POS_TAGS
#
# Usage: <this script> [INPUT_FILE]
#
# Reads INPUT_FILE (or standard input when no argument is given).  Records
# are split on the middle dot "·".  For every line starting with "t":
#   field 2 = POS tags joined with "+"
#   field 3 = "t" when the token has a prefix (emit a leading "-")
#   field 4 = "t" when the token has a suffix (emit a trailing "-")
#   field 5 = Buckwalter transliteration, segments joined with "+"
# Every line starting with "TREE" emits a line break in the output.
#
# Requirements: awk must be gawk (the |& coprocess operator is gawk-only) and
# the Stanford NLP Buckwalter class must be reachable on the Java CLASSPATH.
# NOTE(review): the coprocess round trip assumes the converter writes one
# output line per input line and flushes it promptly; otherwise getline
# blocks -- confirm against the converter actually deployed.

# Fall back to stdin when no file is given, matching the old `cat $1` behavior,
# while keeping paths that contain spaces intact.
input=${1:-/dev/stdin}

awk '
BEGIN {
    FS = "·";
    ORS = " ";
    buckwalter = "java edu.stanford.nlp.international.arabic.Buckwalter";
}

/^t/ {
    split($2, pos, "+");
    has_prefix = ($3 == "t");
    has_suffix = ($4 == "t");

    # Convert the Buckwalter entry to UTF-8, sans vowels
    stripped = $5;
    gsub(/[`~aeFiKNou]/, "", stripped);
    # "{" is alif wasla in the standard Buckwalter table; fold it into the
    # ordinary alif "A".
    gsub(/[{]/, "A", stripped);

    # printf (not print) so that ORS is not appended after the newline and
    # does not end up as a leading space on the next line sent to the converter.
    printf "%s\n", stripped |& buckwalter;
    if ((buckwalter |& getline utf8) <= 0) {
        printf "buckwalter conversion failed for: %s\n", stripped > "/dev/stderr";
        exit 1;
    }

    # Keep the element count: "for (i in array)" visits indices in an
    # unspecified order, which would scramble both the segment order and
    # the choice of the last non-empty segment.
    segment_count = split(utf8, segments, "+");

    # Figure out when to stop (after removing all empty/vowel-only segments)
    # and fill in missing POS tags with UNKNOWN
    last_segment = 0;
    for (i = 1; i <= segment_count; i++) {
        if (segments[i] != "") last_segment = i;
        if (!(i in pos)) pos[i] = "UNKNOWN";
    }

    # Print out all the non-empty segments with tags, in order
    for (i = 1; i <= segment_count; i++) {
        if (segments[i] == "") continue;
        lead  = (i != 1 || has_prefix) ? "-" : "";
        trail = (i != last_segment || has_suffix) ? "-" : "";
        print lead segments[i] trail "|||" pos[i];
    }
}

/^TREE/ { print "\n"; }
' < "$input" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
يتألمون كثيرا من أوجاعهم