Skip to content

Instantly share code, notes, and snippets.

@ceekz
Created July 29, 2013 05:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ceekz/6102297 to your computer and use it in GitHub Desktop.
Save ceekz/6102297 to your computer and use it in GitHub Desktop.
use strict;
use warnings;
use utf8;    # source contains UTF-8 text (Japanese part-of-speech labels)

use Text::MeCab;

# Shared MeCab tagger used by get_terms().  The node_format option is what
# $node->format($mecab) renders for each morpheme node; per MeCab's
# output-format documentation "%M" is the morpheme surface including any
# whitespace that preceded it in the input.
our $mecab = Text::MeCab->new({ node_format => "%M" });
# Extract compound-noun terms from a string using the shared $mecab tagger.
#
# Walks the morpheme nodes returned by $mecab->parse($str) and joins each run
# of consecutive noun morphemes into a single term.  Nouns whose sub-category
# is "non-independent" or "pronoun" terminate a run, and a "suffix" noun may
# extend a run but never start one.  ASCII characters other than letters and
# digits are trimmed from both ends of every term, and terms of one character
# or less are discarded.
#
# Parameters:
#   $str - text to analyse; handed to $mecab->parse unchanged.
# Returns:
#   list of terms (decoded character strings) in order of appearance.
sub get_terms {
    my ($str) = @_;

    # Part-of-speech labels spelled as code-point escapes so the comparisons
    # against the decoded feature fields work whether or not the enclosing
    # file enables `use utf8`.
    my $pos_noun            = "\x{540D}\x{8A5E}";            # 名詞   (noun)
    my $sub_non_independent = "\x{975E}\x{81EA}\x{7ACB}";    # 非自立 (non-independent)
    my $sub_pronoun         = "\x{4EE3}\x{540D}\x{8A5E}";    # 代名詞 (pronoun)
    my $sub_suffix          = "\x{63A5}\x{5C3E}";            # 接尾   (suffix)

    my @terms;
    my @surfaces;    # surfaces of the noun run currently being collected

    # Turn the pending run of surfaces into a term (if any) and reset the run.
    my $flush_run = sub {
        return unless @surfaces;

        my $term = join '', @surfaces;
        @surfaces = ();
        utf8::decode($term);

        # Trim everything in 0x00-0x7F that is not an ASCII letter or digit
        # (control characters, spaces, punctuation) from both ends.
        $term =~ s/^[\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]+//;
        $term =~ s/[\x00-\x2F\x3A-\x40\x5B-\x60\x7B-\x7F]+$//;

        push @terms, $term if length($term) > 1;
        return;
    };

    for (my $node = $mecab->parse($str); $node; $node = $node->next) {
        # Only the first two comma-separated feature fields are needed.
        my ($pos, $subtype) = split /,/, $node->feature;
        $pos     = '' unless defined $pos;
        $subtype = '' unless defined $subtype;
        utf8::decode($pos);
        utf8::decode($subtype);

        my $continues_run = $pos eq $pos_noun
            && $subtype ne $sub_non_independent
            && $subtype ne $sub_pronoun;

        if (!$continues_run) {
            $flush_run->();
            next;
        }

        # A suffix can extend a run but must not begin one.
        next if !@surfaces && $subtype eq $sub_suffix;

        push @surfaces, $node->format($mecab);
    }

    # Emit a run that is still pending when the node list ends, so the result
    # does not depend on a trailing non-noun node being present.
    $flush_run->();

    return @terms;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment