Skip to content

Instantly share code, notes, and snippets.

@fuba
Created March 4, 2015 03:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fuba/135f3459e9104d40f4c4 to your computer and use it in GitHub Desktop.
Save fuba/135f3459e9104d40f4c4 to your computer and use it in GitHub Desktop.
package NLPDocument::MeCabNode;
use strict;
use warnings;
use utf8;
use Encode;
use YAML::Syck;
use Lingua::JA::Numbers;
use base qw( Class::Accessor::Fast );
# Generate plain get/set accessors (via Class::Accessor::Fast) for the node
# attributes stored directly in the blessed hash. `surface` and `chars` are not
# listed here; this file defines those two methods by hand.
__PACKAGE__->mk_accessors(
    qw(
        id pos pos_detail form type fund kana pron is_number statistics
        line length
    )
);
# Constructor.
#
# Arguments:
#   $pkg - class name to bless into.
#   $opt - hash reference; the first of these keys that is true decides how
#          the node is built:
#            cache      => hash reference used as the object as-is
#            mecab_node => object providing feature(), surface() and id()
#                          (presumably a MeCab node -- confirm against callers)
#            morp       => hash reference used as the object as-is
#          sentence_id (optional, default 0) is used as the prefix of the
#          node id when building from mecab_node.
#
# Returns the blessed node, or nothing (bare return) when the MeCab node is a
# BOS/EOS marker. With none of the three keys an empty blessed hash is returned.
#
# Note: the hash given as `cache` or `morp` is blessed in place, not copied.
sub new {
    my ($pkg, $opt) = @_;
    $opt ||= {};
    my $hash = {};

    if ($opt->{cache}) {
        $hash = $opt->{cache};
    }
    elsif ($opt->{mecab_node}) {
        my $p = $opt->{mecab_node};

        # decode() hands back undef for undef input, so normalise both values
        # before they are interpolated or pattern-matched.
        my $feature = decode('utf-8', $p->feature);
        $feature = '' unless (defined $feature);
        my $surface = decode('utf-8', $p->surface);
        $surface = '' unless (defined $surface);

        $hash = parse_mecab_feature($feature);
        $hash->{line} = "$surface\t$feature\n";

        # Sentence boundary markers are not represented as nodes.
        return if (defined $hash->{pos} && $hash->{pos} eq 'BOS/EOS');

        # An empty feature string yields no pos_detail at all; keep it an
        # array reference so later lookups never dereference undef.
        $hash->{pos_detail} ||= [];

        # No pronunciation from the dictionary: spell out all-digit surfaces
        # in katakana.
        if (!$hash->{pron} && $surface =~ /^\d+$/) {
            $hash->{pron} = num2ja($surface, {style => 'katakana'});
        }

        # Still no pronunciation: a surface that is pure katakana once the
        # middle dots are removed serves as its own pronunciation.
        my $pseudo_pron_for_empty_pron = $surface;
        $pseudo_pron_for_empty_pron =~ s/・//g;
        if (!$hash->{pron} && $pseudo_pron_for_empty_pron =~ /^\p{InKatakana}+$/) {
            $hash->{pron} = $pseudo_pron_for_empty_pron;
        }

        # Use local defaults instead of writing into the caller's option hash,
        # and make sure the undef-safe id is the one that gets concatenated.
        my $sentence_id = (defined $opt->{sentence_id}) ? $opt->{sentence_id} : 0;
        my $pid = (defined $p->id) ? $p->id : 0;
        $hash->{id} = $sentence_id.'-'.$pid;

        $hash->{surface} = $surface;

        # Numerals: blank the first sub-category and flag the node instead.
        if (defined $hash->{pos_detail}->[0] && $hash->{pos_detail}->[0] eq '数') {
            $hash->{pos_detail}->[0] = '';
            $hash->{is_number} = 1;
        }

        $hash->{length} = length($hash->{surface});
        my @chars = ($hash->{surface} =~ m/(.)/g);
        $hash->{chars} = \@chars;
    }
    elsif ($opt->{morp}) {
        $hash = $opt->{morp};
    }

    return bless $hash, $pkg;
}
# Returns the node's surface string.
#
# Options (passed as a key/value list after the invocant):
#   normalize_num - when true and the node is flagged as a number, the
#                   placeholder '<num>' is returned instead of the surface.
sub surface {
    my ($self, %opt) = @_;

    # Without normalisation the number flag is never consulted.
    return $self->{surface} unless $opt{normalize_num};

    return $self->is_number ? '<num>' : $self->{surface};
}
# Returns the surface split into single characters (a list; in scalar context
# the number of characters).
#
# The constructor stores this list only when it builds the node from a MeCab
# node. For nodes created from a `cache` or `morp` hash without a `chars`
# entry, the list is derived from `surface` with the same pattern the
# constructor uses, instead of dying on an undefined array reference.
sub chars {
    my $self = shift;

    return @{ $self->{chars} } if ($self->{chars});

    my $surface = (defined $self->{surface}) ? $self->{surface} : '';
    my @chars = ($surface =~ m/(.)/g);
    return @chars;
}
# Splits a MeCab feature string (comma separated) into a hash reference.
#
# Field positions, as listed in the original comments:
#   0 part of speech
#   1 part-of-speech subcategory 1
#   2 part-of-speech subcategory 2
#   3 part-of-speech subcategory 3
#   4 conjugation type
#   5 conjugation form
#   6 base form
#   7 reading
#   8 pronunciation
#
# Returns an empty hash reference for a false argument (undef, '' or '0').
# Otherwise all seven keys are always present; fields missing from the input
# are undef. Quoted fields are not interpreted: every comma separates.
sub parse_mecab_feature {
    my ($feature_csv) = @_;

    if (!$feature_csv) {
        return {};
    }

    # Split into an array first so trailing empty fields are dropped exactly
    # as a plain split does.
    my @fields = split /\,/, $feature_csv;
    my (
        $pos,  $detail1, $detail2, $detail3, $form,
        $type, $fund,    $kana,    $pron,
    ) = @fields;

    my %parsed = (
        pos        => $pos,
        pos_detail => [ $detail1, $detail2, $detail3 ],
        form       => $form,
        type       => $type,
        fund       => $fund,
        kana       => $kana,
        pron       => $pron,
    );
    return \%parsed;
}
# Decides whether this node counts as a noun. Returns 1 or 0.
#
# Arguments:
#   $option - optional hash reference. When it is given and its `is_head`
#             entry is false, the sub-category filter below is skipped.
#             With no $option, or with a true `is_head`, the filter runs.
#
# Rules, in order:
#   1. The part of speech must be '名詞' (noun).
#   2. A surface containing any non-word character (\W) disqualifies it.
#   3. A node flagged as a number always qualifies, overriding 1 and 2.
#   4. Unless skipped, a node whose part-of-speech details contain any of
#      the listed sub-categories is rejected.
sub is_noun {
    my ($self, $option) = @_;

    my $pos     = $self->pos;
    my $surface = $self->surface;

    my $is_noun = (defined $pos && $pos eq '名詞') ? 1 : 0;
    $is_noun = 0 if (defined $surface && $surface =~ /\W/);
    $is_noun = 1 if ($self->is_number);

    # Caller explicitly said this node is not a head: no sub-category filter.
    return $is_noun if ($option && !$option->{is_head});

    return 0 unless ($is_noun);

    # Sub-categories that disqualify a noun: adverbial, dependent, suffix,
    # conjunction-like, pronoun, nai-adjective stem, dependent-verb-like,
    # special.
    for my $avoid (qw/
        副詞可能 非自立 接尾 接続詞的 代名詞 ナイ形容詞語幹
        動詞非自立的 特殊
    /) {
        return 0 if ($self->in_pos_detail($avoid));
    }

    return 1;
}
# Tells whether $kw is one of this node's part-of-speech sub-categories.
#
# Arguments:
#   $kw - sub-category string to look for.
#
# Returns 1 when an entry of pos_detail equals $kw, otherwise nothing (bare
# return). A missing pos_detail list, undefined entries in it (features with
# fewer than four fields leave them undef) and an undefined $kw are treated
# as "no match" instead of raising warnings or dereferencing undef.
sub in_pos_detail {
    my ($self, $kw) = @_;
    return unless (defined $kw);

    my $details = $self->pos_detail;
    return unless ($details);

    for my $detail (@{$details}) {
        return 1 if (defined $detail && $detail eq $kw);
    }
    return;
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment