Created
April 30, 2012 04:08
-
-
Save neubig/2555399 to your computer and use it in GitHub Desktop.
A program to change KyTea's Japanese POS tags to english tags
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Convert the Japanese POS tags in KyTea's full-annotation output to English
# abbreviations.
#
# Input is read from STDIN and the converted text is written to STDOUT, one
# sentence per line. Each line is a space-separated list of tokens, and each
# token is a "/"-separated list of fields whose second field is the POS tag:
#
#     word/POS[/more fields...]
#
# A backslash escapes the character that follows it, so "\ " and "\/" are
# literal characters inside a field and never act as separators.
use strict;
use warnings;
use utf8;
use Getopt::Long;
use List::Util qw(sum min max shuffle);

binmode STDIN,  ":utf8";
binmode STDOUT, ":utf8";
binmode STDERR, ":utf8";

# Japanese POS tag => English abbreviation.
# NOTE(review): KyTea output may contain tags that are not listed here (for
# example "ローマ字文"); such input is rejected with "Bad POS tag" until an
# entry is added for it.
my %mapping = (
    "名詞"       => "N",    # Noun
    "代名詞"     => "PRP",  # Pronoun
    "連体詞"     => "DT",   # Adjectival determiner
    "動詞"       => "V",    # Verb
    "形容詞"     => "ADJ",  # Adjective
    "形状詞"     => "ADJV", # Adjectival verb
    "副詞"       => "ADV",  # Adverb
    "助詞"       => "PRT",  # Particle
    "助動詞"     => "AUXV", # Auxiliary verb
    "補助記号"   => ".",    # Punctuation
    "記号"       => "SYM",  # Symbol
    "接尾辞"     => "SUF",  # Suffix
    "接頭辞"     => "PRE",  # Prefix
    "語尾"       => "TAIL", # Word tail (conjugation)
    "接続詞"     => "CC",   # Conjunction
    "URL"        => "URL",  # URL
    "英単語"     => "ENG",  # English word
    "言いよどみ" => "FIL",  # Filler
    "web誤脱"    => "MSP",  # Misspelling
    "感動詞"     => "INT",  # Interjection
    "新規未知語" => "UNK",  # Unclassified unknown word
);

# Split $text on every unescaped occurrence of the single character $sep.
# A backslash together with the character after it is treated as one unit and
# copied through unchanged, so escaped separators stay inside their field.
# Always returns at least one (possibly empty) element, and keeps empty
# leading, inner and trailing fields.
sub split_unescaped {
    my ($text, $sep) = @_;
    my @parts = ('');
    # Walk the string one unit at a time: either "\X" or a single character.
    while ($text =~ m{ \G ( \\. | . ) }gxs) {
        my $unit = $1;
        if ($unit eq $sep) {
            push @parts, '';
        }
        else {
            $parts[-1] .= $unit;
        }
    }
    return @parts;
}

# Replace the POS tag (second "/"-separated field) of one token with its
# English abbreviation and return the re-joined token.
# Dies if the token has no tag field or the tag is not in %mapping.
sub convert_token {
    my ($token) = @_;
    my @col = split_unescaped($token, '/');
    die "Bad POS tag in '$token'"
        if !defined $col[1] || !exists $mapping{ $col[1] };
    $col[1] = $mapping{ $col[1] };
    return join('/', @col);
}

# Convert every token of one (already chomped) line and return the result as
# a single space-separated string. Empty tokens produced by consecutive
# spaces are skipped; an empty line yields an empty string.
sub convert_line {
    my ($line) = @_;
    my @tokens = grep { length } split_unescaped($line, ' ');
    return join(' ', map { convert_token($_) } @tokens);
}

# Filter STDIN to STDOUT line by line.
sub main {
    while (my $line = <STDIN>) {
        chomp $line;
        print convert_line($line), "\n";
    }
    return;
}

# Run the filter only when executed as a script, so the subs above can be
# loaded from another file without consuming STDIN.
main() unless caller;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script doesn't work anymore, probably because KyTea's output format has changed.
normal.txt:
From fairest creatures we desire increase,
normal.raw:
From/名詞/ふろむ \ fairest/名詞/UNK \ /補助記号/UNK creatures/名詞/CREATURES \ we\ /ローマ字文/UNK desire/名詞/でざいあー \ /補助記号/UNK increase/名詞/INCREASE ,/補助記号/、