Skip to content

Instantly share code, notes, and snippets.

@theerapat-y
Created November 22, 2012 10:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save theerapat-y/10d2222d8bffe8d7631d to your computer and use it in GitHub Desktop.
Save theerapat-y/10d2222d8bffe8d7631d to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use strict;
use Test::More;
use Novus::Xapian;
use Novus::Xapian::VSM;
use Data::Dumper;
# Converter object under test; the subtests in this script all call it.
my $vsm = Novus::Xapian::VSM->new();

# Calling text_to_vsm with no arguments at all must yield undef.
subtest 'No text parse' => sub {
    my $got = $vsm->text_to_vsm();
    is_deeply( $got, undef, 'Return undef if there is no text parse' );
};
# English input with an explicit language: stopwords ("in", "is", "the",
# "and", "of") must be dropped and every remaining word must appear under
# its stemmed form together with its number of occurrences.
subtest 'English text' => sub {
    my $text = 'In Norway, Oslo is the capital and largest city of Norway, and the third largest city of Scandinavia.';
    my $lang = 'english';

    # "capital" is not a stopword, so its stem 'capit' has to be part of
    # the vector. It sorts first among the stemmed terms, which makes it
    # the canary for an iteration that loses the head of the term list.
    my $expected = {
        'capit'       => 1,
        'citi'        => 2,
        'largest'     => 2,
        'norway'      => 2,
        'oslo'        => 1,
        'scandinavia' => 1,
        'third'       => 1,
    };

    my $got = $vsm->text_to_vsm( $text, $lang );
    is_deeply( $got, $expected, 'Output for english text is correct' );
};
# Norwegian input without a language argument: the call passes only the
# text, so the converter has to pick its language on its own.
subtest 'Norwegian text' => sub {
    my $sentence = 'Truet med å stevne misfornøyd PC-kunde - PC-leverandøren Asus likte svært dårlig kundens misfornøyde leserbrev.';

    # Stemmed term => number of occurrences in $sentence.
    my %expected = (
        'asus'       => 1,
        'dår'        => 1,
        'kund'       => 2,
        'leserbrev'  => 1,
        'leverandør' => 1,
        'likt'       => 1,
        'misfornøyd' => 2,
        'pc'         => 2,
        'stevn'      => 1,
        'svært'      => 1,
        'truet'      => 1,
    );

    my $got = $vsm->text_to_vsm($sentence);
    is_deeply( $got, \%expected, 'Output for norwegian text is correct' );
};

done_testing();
categorizer:
...
analyzer:
...
stopword_english: "a,about,above,after,again,against,all,am,an,and,any,are,aren't,as,at,be,because,been,before,being,below,between,both,but,by,can't,cannot,could,couldn't,did,didn't,do,does,doesn't,doing,don't,down,during,each,few,for,from,further,had,hadn't,has,hasn't,have,haven't,having,he,he'd,he'll,he's,her,here,here's,hers,herself,him,himself,his,how,how's,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,itself,let's,me,more,most,mustn't,my,myself,no,nor,not,of,off,on,once,only,or,other,ought,our,ours,ourselves,out,over,own,same,shan't,she,she'd,she'll,she's,should,shouldn't,so,some,such,than,that,that's,the,their,theirs,them,themselves,then,there,there's,these,they,they'd,they'll,they're,they've,this,those,through,to,too,under,until,up,very,was,wasn't,we,we'd,we'll,we're,we've,were,weren't,what,what's,when,when's,where,where's,which,while,who,who's,whom,why,why's,with,won't,would,wouldn't,you,you'd,you'll,you're,you've,your,yours,yourself,yourselves"
stopword_norwegian: "alle,andre,arbeid,av,begge,bort,bra,bruke,da,denne,der,deres,det,din,disse,du,eller,en,ene,eneste,enhver,enn,er,et,folk,for,fordi,forsøke,fra,få,før,først,gjorde,gjøre,god,gå,ha,hadde,han,hans,hennes,her,hva,hvem,hver,hvilken,hvis,hvor,hvordan,hvorfor,i,ikke,inn,innen,kan,kunne,lage,lang,lik,like,makt,mange,med,meg,meget,men,mens,mer,mest,min,mye,må,måte,navn,nei,ny,nå,når,og,også,om,opp,oss,over,part,punkt,på,rett,riktig,samme,sant,si,siden,sist,skulle,slik,slutt,som,start,stille,så,tid,til,tilbake,tilstand,under,ut,uten,var,ved,verdi,vi,vil,ville,vite,vår,være,vært,å,jeg,hun,at,blir,bli"
t/06.vsm_xapian.t ..
ok 1 - Return undef if there is no text parse
1..1
ok 1 - No text parse
$VAR1 = 'Zciti';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zlargest';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Znorway';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zoslo';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zscandinavia';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zthird';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'and';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'capital';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'city';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'in';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'is';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'largest';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'norway';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'of';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'oslo';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'scandinavia';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'the';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'third';
$VAR1 = '+++++++++++++++++++++++++++';
ok 1 - Output for english text is correct
1..1
ok 2 - English text
$VAR1 = 'Zdår';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zkund';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zleserbrev';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zleverandør';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zlikt';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zmisfornøyd';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zpc';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zstevn';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zsvært';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Ztruet';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'asus';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'dårlig';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'kunde';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'kundens';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'leserbrev';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'leverandøren';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'likte';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'med';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'misfornøyd';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'misfornøyde';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'pc';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'stevne';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'svært';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'truet';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'å';
$VAR1 = '+++++++++++++++++++++++++++';
not ok 1 - Output for norwegian text is correct
1..1
# Failed test 'Output for norwegian text is correct'
# at t/06.vsm_xapian.t line 49.
# Structures begin differing at:
# $got->{asus} = Does not exist
# $expected->{asus} = '1'
# Looks like you failed 1 test of 1.
not ok 3 - Norwegian text
1..3
# Failed test 'Norwegian text'
# at t/06.vsm_xapian.t line 50.
# Looks like you failed 1 test of 3.
Dubious, test returned 1 (wstat 256, 0x100)
Failed 1/3 subtests
Test Summary Report
-------------------
t/06.vsm_xapian.t (Wstat: 256 Tests: 3 Failed: 1)
Failed test: 3
Non-zero exit status: 1
Files=1, Tests=3, 1 wallclock secs ( 0.03 usr 0.00 sys + 0.54 cusr 0.07 csys = 0.64 CPU)
Result: FAIL
package Novus::Xapian::VSM;
use strict;
use warnings;
use Moose;
use Novus::Config;
use Data::Dumper;
use Search::Xapian (':all');
use utf8;
# File-scoped term generator. It lives outside any object, so all instances
# of this package share the same one.
my $indexer;

# Moose construction hook: installs a fresh Xapian term generator in the
# file-scoped $indexer each time an object of this class is built.
sub BUILD {
    $indexer = Search::Xapian::TermGenerator->new;
}
=head1 text_to_vsm

    my $vsm = $obj->text_to_vsm($text, $lang);

Index C<$text> with Xapian and return a hash reference that maps each
retained term to its within-document frequency (wdf).

=over 4

=item C<$text>

The text to convert. When it is undefined the method returns C<undef>.

=item C<$lang>

Language name used both to build the stemmer and to look up the stopword
list through C<read_stopword>. Defaults to C<'norwegian'>.

=back

Only two kinds of terms end up in the result: stemmed terms, which the
term generator stores with a leading C<Z> (the C<Z> is removed from the
key), and terms that start with a digit (kept unchanged).

=cut

sub text_to_vsm {
    my ($self, $text, $lang) = @_;

    # Explicit undef rather than a bare return: callers in list context
    # keep receiving exactly one element, as before.
    return undef if !defined $text;

    # Norwegian is the default language.
    $lang = 'norwegian' if !defined $lang;

    # The stopword list is one comma-separated string. A language without
    # a configured list gets an empty stopper instead of a warning from
    # splitting an undefined value.
    my $stopper_string = $self->read_stopword($lang);
    my @stopwords = defined $stopper_string ? split(/,/, $stopper_string) : ();

    my $stopper = Search::Xapian::SimpleStopper->new(@stopwords);
    my $stemmer = Search::Xapian::Stem->new($lang);
    my $doc     = Search::Xapian::Document->new();

    $indexer->set_stemmer($stemmer);
    $indexer->set_stopper($stopper);
    $indexer->set_document($doc);
    $indexer->index_text($text);

    # Walk the term list from begin to end. Comparing against the end
    # iterator visits every term exactly once, including the first one,
    # which a counter-driven "$term = $terms++" loop never looked at.
    my %vsm;
    my $end = $doc->termlist_end;
    for ( my $it = $doc->termlist_begin ; $it != $end ; ++$it ) {
        my $name = $it->get_termname;

        # Keep stemmed terms (minus their 'Z' marker) and terms starting
        # with a digit; everything else is the unstemmed duplicate.
        next if $name !~ m/^Z(.*)|(^\d.*)/;
        my $key = defined $1 ? $1 : $2;

        $vsm{$key} = $it->get_wdf;
    }

    return \%vsm;
}
=head1 read_stopword

    my $list = $obj->read_stopword($lang);

Return the value stored under C<stopword_E<lt>langE<gt>> in the
C<categorizer> / C<analyzer> section of the configuration obtained from
C<Novus::Config>. Nothing is validated here: whatever the configuration
holds for that key (possibly nothing) is handed back as is.

=cut

sub read_stopword {
    my ($self, $lang) = @_;

    my $analyzer_conf = Novus::Config->new()->config->{'categorizer'}{'analyzer'};

    return $analyzer_conf->{ 'stopword_' . $lang };
}
1;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment