-
-
Save theerapat-y/10d2222d8bffe8d7631d to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl -w
# Tests for Novus::Xapian::VSM::text_to_vsm: undefined input, English
# text with an explicit language, and Norwegian text via the default
# language.
#
# NOTE(review): there is deliberately no `use utf8` here, so the
# Norwegian literals below stay raw bytes. Presumably that matches what
# Search::Xapian hands back from get_termname -- confirm before adding it.
use strict;
use Test::More;
use Novus::Xapian;
use Novus::Xapian::VSM;
use Data::Dumper;

my $vsm = Novus::Xapian::VSM->new();

subtest 'No text parse' => sub {
    my $result = $vsm->text_to_vsm();
    is( $result, undef, 'Return undef if there is no text parse' );
};

subtest 'English text' => sub {
    my $text = 'In Norway, Oslo is the capital and largest city of Norway, and the third largest city of Scandinavia.';
    my $lang = 'english';

    # Every word that is not an English stop word must show up under its
    # stem. "capital" is not a stop word, so its stem belongs here too.
    my $output = {
        'capit'       => 1,
        'citi'        => 2,
        'scandinavia' => 1,
        'norway'      => 2,
        'third'       => 1,
        'oslo'        => 1,
        'largest'     => 2,
    };

    my $result = $vsm->text_to_vsm( $text, $lang );
    is_deeply( $result, $output, 'Output for english text is correct' );
};

subtest 'Norwegian text' => sub {
    my $text = 'Truet med å stevne misfornøyd PC-kunde - PC-leverandøren Asus likte svært dårlig kundens misfornøyde leserbrev.';

    my $output = {
        'asus'       => 1,
        'leserbrev'  => 1,
        'leverandør' => 1,
        'svært'      => 1,
        'pc'         => 2,
        'kund'       => 2,
        'stevn'      => 1,
        'dår'        => 1,
        'truet'      => 1,
        'misfornøyd' => 2,
        'likt'       => 1,
    };

    # No language argument: text_to_vsm falls back to Norwegian.
    my $result = $vsm->text_to_vsm($text);
    is_deeply( $result, $output, 'Output for norwegian text is correct' );
};

done_testing();
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
categorizer: | |
... | |
analyzer: | |
... | |
stopword_english: "a,about,above,after,again,against,all,am,an,and,any,are,aren't,as,at,be,because,been,before,being,below,between,both,but,by,can't,cannot,could,couldn't,did,didn't,do,does,doesn't,doing,don't,down,during,each,few,for,from,further,had,hadn't,has,hasn't,have,haven't,having,he,he'd,he'll,he's,her,here,here's,hers,herself,him,himself,his,how,how's,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,itself,let's,me,more,most,mustn't,my,myself,no,nor,not,of,off,on,once,only,or,other,ought,our,ours,ourselves,out,over,own,same,shan't,she,she'd,she'll,she's,should,shouldn't,so,some,such,than,that,that's,the,their,theirs,them,themselves,then,there,there's,these,they,they'd,they'll,they're,they've,this,those,through,to,too,under,until,up,very,was,wasn't,we,we'd,we'll,we're,we've,were,weren't,what,what's,when,when's,where,where's,which,while,who,who's,whom,why,why's,with,won't,would,wouldn't,you,you'd,you'll,you're,you've,your,yours,yourself,yourselves" | |
stopword_norwegian: "alle,andre,arbeid,av,begge,bort,bra,bruke,da,denne,der,deres,det,din,disse,du,eller,en,ene,eneste,enhver,enn,er,et,folk,for,fordi,forsøke,fra,få,før,først,gjorde,gjøre,god,gå,ha,hadde,han,hans,hennes,her,hva,hvem,hver,hvilken,hvis,hvor,hvordan,hvorfor,i,ikke,inn,innen,kan,kunne,lage,lang,lik,like,makt,mange,med,meg,meget,men,mens,mer,mest,min,mye,må,måte,navn,nei,ny,nå,når,og,også,om,opp,oss,over,part,punkt,på,rett,riktig,samme,sant,si,siden,sist,skulle,slik,slutt,som,start,stille,så,tid,til,tilbake,tilstand,under,ut,uten,var,ved,verdi,vi,vil,ville,vite,vår,være,vært,å,jeg,hun,at,blir,bli"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
t/06.vsm_xapian.t .. | |
ok 1 - Return undef if there is no text parse | |
1..1 | |
ok 1 - No text parse | |
$VAR1 = 'Zciti'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zlargest'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Znorway'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zoslo'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zscandinavia'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zthird'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'and'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'capital'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'city'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'in'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'is'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'largest'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'norway'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'of'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'oslo'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'scandinavia'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'the'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'third'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
ok 1 - Output for english text is correct | |
1..1 | |
ok 2 - English text | |
$VAR1 = 'Zdår'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zkund'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zleserbrev'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zleverandør'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zlikt'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zmisfornøyd'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zpc'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zstevn'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Zsvært'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'Ztruet'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'asus'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'dårlig'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'kunde'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'kundens'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'leserbrev'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'leverandøren'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'likte'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'med'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'misfornøyd'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'misfornøyde'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'pc'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'stevne'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'svært'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'truet'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
$VAR1 = 'å'; | |
$VAR1 = '+++++++++++++++++++++++++++'; | |
not ok 1 - Output for norwegian text is correct | |
1..1 | |
# Failed test 'Output for norwegian text is correct' | |
# at t/06.vsm_xapian.t line 49. | |
# Structures begin differing at: | |
# $got->{asus} = Does not exist | |
# $expected->{asus} = '1' | |
# Looks like you failed 1 test of 1. | |
not ok 3 - Norwegian text | |
1..3 | |
# Failed test 'Norwegian text' | |
# at t/06.vsm_xapian.t line 50. | |
# Looks like you failed 1 test of 3. | |
Dubious, test returned 1 (wstat 256, 0x100) | |
Failed 1/3 subtests | |
Test Summary Report | |
------------------- | |
t/06.vsm_xapian.t (Wstat: 256 Tests: 3 Failed: 1) | |
Failed test: 3 | |
Non-zero exit status: 1 | |
Files=1, Tests=3, 1 wallclock secs ( 0.03 usr 0.00 sys + 0.54 cusr 0.07 csys = 0.64 CPU) | |
Result: FAIL |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package Novus::Xapian::VSM;

use strict;
use warnings;
use utf8;

use Moose;
use Novus::Config;
use Data::Dumper;
use Search::Xapian (':all');

=head1 NAME

Novus::Xapian::VSM - map a text to its stemmed terms and their counts

=head1 METHODS

=cut

# One term generator shared by all instances in this process; BUILD
# replaces it every time an object is constructed.
my $indexer;

sub BUILD {
    $indexer = Search::Xapian::TermGenerator->new();
    return;
}

=head2 text_to_vsm

    my $vsm = $obj->text_to_vsm($text, $lang);

Indexes C<$text> into a throw-away Xapian document, using the stemmer
for C<$lang> and the stop words configured for C<$lang>, and returns a
hash reference mapping each stemmed term (and each term that starts with
a digit) to its within-document frequency.

C<$lang> defaults to C<norwegian>. Returns C<undef> when C<$text> is not
defined.

=cut

sub text_to_vsm {
    my ($self, $text, $lang) = @_;

    # Kept as an explicit undef (not a bare return) so callers that use
    # the result in list context keep seeing exactly one value.
    return undef if !defined $text;

    # Norwegian is the default language.
    $lang = 'norwegian' if !defined $lang;

    # A language without a configured stop word list simply gets an
    # empty stopper instead of a warning from split on undef.
    my $stopper_string = $self->read_stopword($lang);
    my @stopwords =
        defined $stopper_string ? split( /,/, $stopper_string ) : ();

    my $stopper = Search::Xapian::SimpleStopper->new(@stopwords);
    my $stemmer = Search::Xapian::Stem->new($lang);
    my $doc     = Search::Xapian::Document->new();

    $indexer->set_stemmer($stemmer);
    $indexer->set_stopper($stopper);
    $indexer->set_document($doc);
    $indexer->index_text($text);

    # Walk the term list from begin to end. The term is read BEFORE the
    # iterator is advanced: the previous "$term = $terms++" form handed
    # back the already-advanced iterator, so the first term of every
    # document was silently dropped.
    my %vsm;
    my $term = $doc->termlist_begin;
    my $end  = $doc->termlist_end;
    while ( $term != $end ) {
        my $name = $term->get_termname;

        if ( $name =~ m/\AZ(.*)\z/s ) {
            # "Z"-prefixed terms are the stemmed forms; store them
            # without the prefix.
            $vsm{$1} = $term->get_wdf;
        }
        elsif ( $name =~ m/\A\d/ ) {
            # Terms starting with a digit are kept verbatim.
            $vsm{$name} = $term->get_wdf;
        }

        ++$term;
    }

    return \%vsm;
}

=head2 read_stopword

    my $csv = $obj->read_stopword($lang);

Returns the comma-separated stop word string stored in the configuration
under C<< categorizer -> analyzer -> stopword_$lang >>, or C<undef> when
no such entry exists.

=cut

sub read_stopword {
    my ($self, $lang) = @_;

    my $analyzer =
        Novus::Config->new()->config->{'categorizer'}->{'analyzer'};

    return $analyzer->{ 'stopword_' . $lang };
}

1;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment