@sekia
Created September 11, 2018 19:24
Example classification program for Chinese text, using Algorithm::LibLinear.
#!/usr/bin/env perl
use strict;
use warnings;
use utf8;
use feature qw/say state/;
use Algorithm::LibLinear;
use Lingua::ZH::Jieba; # Word segmenter for the Chinese language.
use Text::Ngrams; # Language-independent n-gram generator.
# These training/test data are taken from tgrocery's sample/classify.py.
my @training_data = (
    [ 'education', '名师指导托福语法技巧:名词的复数形式' ],
    [ 'education', '中国高考成绩海外认可 是“狼来了”吗?' ],
    [ 'sports', '图文:法网孟菲尔斯苦战进16强 孟菲尔斯怒吼' ],
    [ 'sports', '四川丹棱举行全国长距登山挑战赛 近万人参与' ],
);
my @test_data = (
    '考生必读:新托福写作考试评分标准',
);
my $model_filename = 'model.liblinear';
sub segment_text {
    my ($sentence) = @_;
    state $segmentor = Lingua::ZH::Jieba->new;
    # cut_all => 1 is the same configuration as tgrocery's default.
    $segmentor->cut($sentence, +{ cut_all => 1 });
}
# Converts a text into a sparse vector of {uni,bi}gram frequencies.
# Each {uni,bi}gram is mapped to a unique integer ID.
# The resulting sparse vector consists of key-value pairs of the form
# |<ngram ID> => <ngram frequency>|.
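# For example, if the segmented words were qw/A B C/, the n-grams would be the
# unigrams A, B, C plus the bigrams "A B" and "B C", and the returned hashref
# might look like +{ 0 => 1, 1 => 1, 2 => 1, 3 => 1, 4 => 1 } (the actual
# integer IDs depend on the order in which n-grams are first seen).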
sub generate_feature {
    my ($sentence) = @_;
    state %ngram_indices;
    my $bigram = Text::Ngrams->new(type => 'word', windowsize => 2);
    my $words = segment_text($sentence);
    $bigram->feed_tokens(@$words);
    my %ngram_frequencies = (
        $bigram->get_ngrams(n => 1),  # Unigram frequencies.
        $bigram->get_ngrams(n => 2),  # Bigram frequencies.
    );
    my %feature;
    for my $ngram (keys %ngram_frequencies) {
        my $feature_index = $ngram_indices{$ngram} //= keys %ngram_indices;
        $feature{$feature_index} = $ngram_frequencies{$ngram};
    }
    return \%feature;
}
# Converts the training data into LIBLINEAR's sparse vector format.
# Each answer label is mapped to a unique integer ID, just as the feature n-grams are.
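# With the sample training data above, the map ends up as, e.g.,
# |%label_ids = (education => 0, sports => 1)|.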
my %label_ids;
my @liblinear_training_data;
for (@training_data) {
    my ($label_text, $sentence) = @$_;
    my $label_id = $label_ids{$label_text} //= keys %label_ids;
    my $feature = generate_feature($sentence);
    push @liblinear_training_data, +{
        feature => $feature,
        label => $label_id,
    };
}
my $data_set = Algorithm::LibLinear::DataSet->new(
    data_set => \@liblinear_training_data,
);
# Instantiates a LIBLINEAR trainer.
# You will probably need to tune the learning parameters (|bias|, |cost| and
# |epsilon|) for good precision. Consult `perldoc Algorithm::LibLinear`.
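# A minimal sketch of tuning and evaluating parameters (the values below are
# arbitrary placeholders; check the perldoc for the exact constructor options
# supported by your version):
#
#   my $tuned_learner = Algorithm::LibLinear->new(
#       solver  => 'MCSVM_CS',
#       cost    => 1.0,  # Penalty parameter C.
#       epsilon => 0.1,  # Termination tolerance.
#   );
#   # k-fold cross validation over the training set reports held-out accuracy.
#   my $accuracy = $tuned_learner->cross_validation(
#       data_set  => $data_set,
#       num_folds => 2,  # This toy data set has only 4 examples.
#   );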
my $learner = Algorithm::LibLinear->new(solver => 'MCSVM_CS');
# Trains with given data set.
my $model = $learner->train(data_set => $data_set);
# A trained LIBLINEAR model can be saved with the |save| method.
# Note that the label-ID map and the n-gram-ID map are *not* included in the
# file; you'll need to save them yourself to restore the whole system state.
$model->save(filename => $model_filename);
# my $restored_model = Algorithm::LibLinear::Model->load(filename => $model_filename);
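# A minimal sketch of persisting those maps with the core Storable module
# (the file name is arbitrary; note that %ngram_indices is a |state| variable
# inside generate_feature(), so it would have to be moved to file scope, or
# otherwise exposed, before it can be saved like this):
#
#   use Storable qw/nstore retrieve/;
#   nstore(+{ labels => \%label_ids, ngrams => \%ngram_indices }, 'maps.storable');
#   # my $maps = retrieve('maps.storable');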
binmode STDOUT => ':utf8';
my %labels = reverse %label_ids; # ID to text label mapping.
for my $test_sentence (@test_data) {
    my $feature = generate_feature($test_sentence);
    my $label_id = $model->predict(feature => $feature);
    say "Sentence: $test_sentence";
    say "Predicted Label: $labels{$label_id}";
    my @label_ids = @{ $model->class_labels };
    my @predict_values = @{ $model->predict_values(feature => $feature) };
    say 'Confidence:';
    for my $i (0 .. $#label_ids) {
        my $label = $labels{$label_ids[$i]};
        my $confidence = $predict_values[$i];
        say "\t$label: $confidence";
    }
}
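# For each test sentence the script prints output of the form:
#
#   Sentence: <test sentence>
#   Predicted Label: <education or sports>
#   Confidence:
#       <label>: <decision value>
#       <label>: <decision value>
#
# and writes the trained model to model.liblinear in the current directory.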