Created
September 11, 2018 19:24
-
-
Save sekia/dc55c2ae253370cd09c3350d0acf7818 to your computer and use it in GitHub Desktop.
Example classification program for Chinese text, using Algorithm::LibLinear.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
use strict; | |
use warnings; | |
use utf8; | |
use feature qw/say state/; | |
use Algorithm::LibLinear; | |
use Lingua::ZH::Jieba; # Word segmentor for Chinese language. | |
use Text::Ngrams; # Language-independent n-gram generator. | |
# These training/test data are taken from tgrocery's sample/classify.py.
# Each training example is a |[ label, sentence ]| pair: the label is a
# free-form category string ('education' / 'sports') and the sentence is
# raw Chinese text that will be segmented and turned into ngram features.
my @training_data = (
    [ 'education', '名师指导托福语法技巧:名词的复数形式' ],
    [ 'education', '中国高考成绩海外认可 是“狼来了”吗?' ],
    [ 'sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼' ],
    [ 'sports', '四川丹棱举行全国长距登山挑战赛 近万人参与' ],
);
# Unlabeled sentences to classify after training.
my @test_data = (
    '考生必读:新托福写作考试评分标准',
);
# Where the trained LIBLINEAR model is persisted on disk.
my $model_filename = 'model.liblinear';
# Splits a Chinese sentence into words and returns an array reference
# of the resulting tokens.
sub segment_text {
    my ($sentence) = @_;
    # The segmentor is expensive to build, so construct it once and reuse it
    # across calls.
    state $segmentor = Lingua::ZH::Jieba->new;
    # cut_all => 1 matches tgrocery's default segmentation mode.
    return $segmentor->cut($sentence, +{ cut_all => 1 });
}
# Conversion from text to a sparse vector of {uni,bi}gram frequencies.
# Each {uni,bi}gram is mapped to a unique integer ID; the resulting sparse
# vector consists of key-value pairs |<ngram ID> => <ngram frequency>|.
#
# NOTE: the ngram-to-ID map is |state|, so it persists for the lifetime of
# the process and the same ngram always maps to the same ID. Ngrams first
# seen at prediction time get fresh IDs the trained model has never seen;
# LIBLINEAR simply ignores such unknown feature indices.
sub generate_feature {
    my ($sentence) = @_;
    state %ngram_indices;
    my $bigram = Text::Ngrams->new(type => 'word', windowsize => 2);
    my $words = segment_text($sentence);
    $bigram->feed_tokens(@$words);
    my %ngram_frequencies = (
        $bigram->get_ngrams(n => 1),  # Unigram frequencies.
        $bigram->get_ngrams(n => 2),  # Bigram frequencies.
    );
    my %feature;
    for my $ngram (keys %ngram_frequencies) {
        my $feature_index = $ngram_indices{$ngram};
        unless (defined $feature_index) {
            # LIBLINEAR feature indices must be positive integers, so number
            # ngrams starting from 1. The original |//= keys %ngram_indices|
            # form depended on whether the lvalue element was autovivified
            # before |keys| ran; this explicit check makes the numbering
            # unambiguous.
            $feature_index = $ngram_indices{$ngram} = 1 + keys %ngram_indices;
        }
        $feature{$feature_index} = $ngram_frequencies{$ngram};
    }
    return \%feature;
}
# Converts the training data into LIBLINEAR's sparse vector format.
# Each answer label is mapped to a unique integer ID, just like the
# feature ngrams are.
my %label_ids;
my @liblinear_training_data;
for my $example (@training_data) {
    my ($label_text, $sentence) = @$example;
    # Assign a fresh ID the first time a label text is seen.
    my $label_id = $label_ids{$label_text} //= keys %label_ids;
    push @liblinear_training_data, +{
        label   => $label_id,
        feature => generate_feature($sentence),
    };
}
my $data_set = Algorithm::LibLinear::DataSet->new(
    data_set => \@liblinear_training_data,
);
# Set up a LIBLINEAR trainer. The learning parameters (|bias|, |cost| and
# |epsilon|) will likely need tuning for good precision; consult
# `perldoc Algorithm::LibLinear` for the available options.
my $trainer = Algorithm::LibLinear->new(solver => 'MCSVM_CS');

# Fit a model to the data set prepared above.
my $model = $trainer->train(data_set => $data_set);

# A trained LIBLINEAR model is persisted with the |save| method.
# Note that neither the label-id map nor the ngram-id map is included in
# the file; save them yourself if you need to restore the whole system
# state later. A saved model can be restored with:
#   my $restored_model =
#       Algorithm::LibLinear::Model->load(filename => $model_filename);
$model->save(filename => $model_filename);
binmode STDOUT => ':utf8';

# Invert the label map so predicted integer IDs can be turned back into
# their text labels.
my %labels = reverse %label_ids;
for my $sentence (@test_data) {
    my $feature = generate_feature($sentence);
    my $predicted_id = $model->predict(feature => $feature);
    say "Sentence: $sentence";
    say 'Predicted Label: ' . $labels{$predicted_id};

    # Also report the raw decision value for every class the model knows.
    my @class_ids = @{ $model->class_labels };
    my @decision_values = @{ $model->predict_values(feature => $feature) };
    say 'Confidence:';
    for my $idx (0 .. $#class_ids) {
        my $class_label = $labels{ $class_ids[$idx] };
        say "\t$class_label: $decision_values[$idx]";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment