theerapat-y/06.vsm_xapian.t Secret

## 06.vsm_xapian.t
#!/usr/bin/perl -w

use strict;
use Test::More;
use Novus::Xapian;
use Novus::Xapian::VSM;
use Data::Dumper;

# Single converter instance shared by every subtest in this file.
my $vsm = Novus::Xapian::VSM->new;

# Calling text_to_vsm without any text is expected to give back undef.
subtest 'No text parse' => sub {
	my $got = $vsm->text_to_vsm;
	is_deeply( $got, undef, 'Return undef if there is no text parse' );
};


# English input: the configured stop words ("in", "is", "the", "and", "of")
# are dropped; every remaining word is expected under its stem, mapped to the
# number of times it occurs in the text.
subtest 'English text' => sub {
	my $text = 'In Norway, Oslo  is the capital and largest city of Norway, and the third largest city of Scandinavia.';
	my $lang = 'english';
	my $output = {
		# "capital" is not in stopword_english, so its stem belongs in the
		# result. It is the alphabetically first stem of this sentence, which
		# makes it the entry that catches a converter dropping the first term.
		'capit' => 1,
		'citi' => 2,
		'scandinavia' => 1,
		'norway' => 2,
		'third' => 1,
		'oslo' => 1,
		'largest' => 2
	};
	my $result = $vsm->text_to_vsm($text,$lang);
	is_deeply($result,$output,'Output for english text is correct');
};


# Norwegian input. No language is passed, so this relies on the language that
# text_to_vsm falls back to when none is given.
subtest 'Norwegian text' => sub {
	my $text = 'Truet med å stevne misfornøyd PC-kunde - PC-leverandøren Asus likte svært dårlig kundens misfornøyde leserbrev.';

	# Expected stem => number of occurrences.
	my %expected = (
		'asus'       => 1,
		'dår'        => 1,
		'kund'       => 2,
		'leserbrev'  => 1,
		'leverandør' => 1,
		'likt'       => 1,
		'misfornøyd' => 2,
		'pc'         => 2,
		'stevn'      => 1,
		'svært'      => 1,
		'truet'      => 1,
	);

	my $got = $vsm->text_to_vsm($text);
	is_deeply( $got, \%expected, 'Output for norwegian text is correct' );
};

done_testing();

## novus.yml
categorizer:
    ...
    analyzer:
        ...
        stopword_english: "a,about,above,after,again,against,all,am,an,and,any,are,aren't,as,at,be,because,been,before,being,below,between,both,but,by,can't,cannot,could,couldn't,did,didn't,do,does,doesn't,doing,don't,down,during,each,few,for,from,further,had,hadn't,has,hasn't,have,haven't,having,he,he'd,he'll,he's,her,here,here's,hers,herself,him,himself,his,how,how's,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,itself,let's,me,more,most,mustn't,my,myself,no,nor,not,of,off,on,once,only,or,other,ought,our,ours,ourselves,out,over,own,same,shan't,she,she'd,she'll,she's,should,shouldn't,so,some,such,than,that,that's,the,their,theirs,them,themselves,then,there,there's,these,they,they'd,they'll,they're,they've,this,those,through,to,too,under,until,up,very,was,wasn't,we,we'd,we'll,we're,we've,were,weren't,what,what's,when,when's,where,where's,which,while,who,who's,whom,why,why's,with,won't,would,wouldn't,you,you'd,you'll,you're,you've,your,yours,yourself,yourselves"
        stopword_norwegian: "alle,andre,arbeid,av,begge,bort,bra,bruke,da,denne,der,deres,det,din,disse,du,eller,en,ene,eneste,enhver,enn,er,et,folk,for,fordi,forsøke,fra,få,før,først,gjorde,gjøre,god,gå,ha,hadde,han,hans,hennes,her,hva,hvem,hver,hvilken,hvis,hvor,hvordan,hvorfor,i,ikke,inn,innen,kan,kunne,lage,lang,lik,like,makt,mange,med,meg,meget,men,mens,mer,mest,min,mye,må,måte,navn,nei,ny,nå,når,og,også,om,opp,oss,over,part,punkt,på,rett,riktig,samme,sant,si,siden,sist,skulle,slik,slutt,som,start,stille,så,tid,til,tilbake,tilstand,under,ut,uten,var,ved,verdi,vi,vil,ville,vite,vår,være,vært,å,jeg,hun,at,blir,bli"

## test result
t/06.vsm_xapian.t ..
    ok 1 - Return undef if there is no text parse
    1..1
ok 1 - No text parse
$VAR1 = 'Zciti';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zlargest';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Znorway';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zoslo';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zscandinavia';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zthird';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'and';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'capital';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'city';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'in';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'is';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'largest';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'norway';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'of';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'oslo';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'scandinavia';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'the';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'third';
$VAR1 = '+++++++++++++++++++++++++++';
    ok 1 - Output for english text is correct
    1..1
ok 2 - English text
$VAR1 = 'Zdår';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zkund';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zleserbrev';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zleverandør';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zlikt';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zmisfornøyd';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zpc';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zstevn';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Zsvært';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'Ztruet';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'asus';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'dårlig';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'kunde';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'kundens';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'leserbrev';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'leverandøren';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'likte';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'med';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'misfornøyd';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'misfornøyde';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'pc';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'stevne';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'svært';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'truet';
$VAR1 = '+++++++++++++++++++++++++++';
$VAR1 = 'å';
$VAR1 = '+++++++++++++++++++++++++++';
    not ok 1 - Output for norwegian text is correct
    1..1

    #   Failed test 'Output for norwegian text is correct'
    #   at t/06.vsm_xapian.t line 49.
    #     Structures begin differing at:
    #          $got->{asus} = Does not exist
    #     $expected->{asus} = '1'
    # Looks like you failed 1 test of 1.
not ok 3 - Norwegian text
1..3

#   Failed test 'Norwegian text'
#   at t/06.vsm_xapian.t line 50.
# Looks like you failed 1 test of 3.
Dubious, test returned 1 (wstat 256, 0x100)
Failed 1/3 subtests

Test Summary Report
-------------------
t/06.vsm_xapian.t (Wstat: 256 Tests: 3 Failed: 1)
  Failed test:  3
  Non-zero exit status: 1
Files=1, Tests=3,  1 wallclock secs ( 0.03 usr  0.00 sys +  0.54 cusr  0.07 csys =  0.64 CPU)
Result: FAIL

## VSM.pm
package Novus::Xapian::VSM;

use strict;
use warnings;
use Moose;
use Novus::Config;
use Data::Dumper;
use Search::Xapian (':all');
use utf8;


# File-scoped term generator used by text_to_vsm. It is a single variable for
# the whole package, so every constructed object replaces the generator that
# all objects see.
my $indexer;

# Moose construction hook: installs a fresh Search::Xapian::TermGenerator.
sub BUILD {
    my ($self) = @_;
    $indexer = Search::Xapian::TermGenerator->new;
}


=head1 text_to_vsm

    my $vsm = $obj->text_to_vsm($text, $lang);

Convert C<$text> into a vector space model: a hash reference that maps each
stemmed term (Xapian's C<Z>-prefixed terms, with the prefix removed) and each
term starting with a digit to its within-document frequency.

C<$lang> selects both the Xapian stemmer and the C<stopword_$lang> list that
is read through C<read_stopword>; it defaults to C<norwegian>.

Returns C<undef> when C<$text> is not defined.

=cut

sub text_to_vsm {
    my ($self, $text, $lang) = @_;

    # Explicit undef (not a bare return) so that callers in list context keep
    # receiving exactly one value, as before.
    return undef if !defined $text;

    # Set default language to norwegian
    $lang = 'norwegian' if !defined $lang;

    # Retrieve the stopwords. A language without a configured list gets an
    # empty stopper instead of an "uninitialized value" warning from split.
    my $stopper_string = $self->read_stopword($lang);
    my @stopwords = defined $stopper_string ? split(',', $stopper_string) : ();

    my $stopper = Search::Xapian::SimpleStopper->new(@stopwords);
    my $stemmer = Search::Xapian::Stem->new($lang);

    my $doc = Search::Xapian::Document->new();
    $indexer->set_stemmer($stemmer);
    $indexer->set_stopper($stopper);
    $indexer->set_document($doc);

    # index context
    $indexer->index_text($text);

    # Walk the term list from its begin iterator up to its end iterator and
    # read the current term BEFORE advancing. The earlier "$term = $terms++"
    # loop looked at the iterator only after it had moved on, so the first
    # term of every document was never seen.
    my $vsm      = {};
    my $term_end = $doc->termlist_end;
    for (my $term = $doc->termlist_begin; $term != $term_end; ++$term) {
        my $termname = $term->get_termname;

        # use only number and indexed terms to gen VSM
        if ($termname =~ m/^Z(.*)|^(\d.*)/) {
            my $key = defined $1 ? $1 : $2;
            $vsm->{$key} = $term->get_wdf;
        }
    }

    return $vsm;
}


=head1 read_stopword

    my $stopwords = $obj->read_stopword($lang);

Look up the C<stopword_$lang> entry in the C<categorizer> / C<analyzer>
section of the Novus configuration and return its value. C<text_to_vsm>
splits the returned string on commas.

=cut

sub read_stopword {
    my ($self, $lang) = @_;

    my $analyzer = Novus::Config->new->config->{categorizer}{analyzer};

    return $analyzer->{"stopword_$lang"};
}

1;
	#!/usr/bin/perl -w

	use strict;
	use Test::More;
	use Novus::Xapian;
	use Novus::Xapian::VSM;
	use Data::Dumper;

	# Single converter instance shared by every subtest in this file.
	my $vsm = Novus::Xapian::VSM->new;

	# Calling text_to_vsm without any text is expected to give back undef.
	subtest 'No text parse' => sub {
		my $got = $vsm->text_to_vsm;
		is_deeply( $got, undef, 'Return undef if there is no text parse' );
	};


	# English input: the configured stop words ("in", "is", "the", "and", "of")
	# are dropped; every remaining word is expected under its stem, mapped to
	# the number of times it occurs in the text.
	subtest 'English text' => sub {
		my $text = 'In Norway, Oslo is the capital and largest city of Norway, and the third largest city of Scandinavia.';
		my $lang = 'english';
		my $output = {
			# "capital" is not in stopword_english, so its stem belongs in
			# the result. It is the alphabetically first stem of this
			# sentence, which makes it the entry that catches a converter
			# dropping the first term.
			'capit' => 1,
			'citi' => 2,
			'scandinavia' => 1,
			'norway' => 2,
			'third' => 1,
			'oslo' => 1,
			'largest' => 2
		};
		my $result = $vsm->text_to_vsm($text,$lang);
		is_deeply($result,$output,'Output for english text is correct');
	};


	# Norwegian input. No language is passed, so this relies on the language
	# that text_to_vsm falls back to when none is given.
	subtest 'Norwegian text' => sub {
		my $text = 'Truet med å stevne misfornøyd PC-kunde - PC-leverandøren Asus likte svært dårlig kundens misfornøyde leserbrev.';

		# Expected stem => number of occurrences.
		my %expected = (
			'asus'       => 1,
			'dår'        => 1,
			'kund'       => 2,
			'leserbrev'  => 1,
			'leverandør' => 1,
			'likt'       => 1,
			'misfornøyd' => 2,
			'pc'         => 2,
			'stevn'      => 1,
			'svært'      => 1,
			'truet'      => 1,
		);

		my $got = $vsm->text_to_vsm($text);
		is_deeply( $got, \%expected, 'Output for norwegian text is correct' );
	};

	done_testing();
	categorizer:
	...
	analyzer:
	...
	stopword_english: "a,about,above,after,again,against,all,am,an,and,any,are,aren't,as,at,be,because,been,before,being,below,between,both,but,by,can't,cannot,could,couldn't,did,didn't,do,does,doesn't,doing,don't,down,during,each,few,for,from,further,had,hadn't,has,hasn't,have,haven't,having,he,he'd,he'll,he's,her,here,here's,hers,herself,him,himself,his,how,how's,i,i'd,i'll,i'm,i've,if,in,into,is,isn't,it,it's,its,itself,let's,me,more,most,mustn't,my,myself,no,nor,not,of,off,on,once,only,or,other,ought,our,ours,ourselves,out,over,own,same,shan't,she,she'd,she'll,she's,should,shouldn't,so,some,such,than,that,that's,the,their,theirs,them,themselves,then,there,there's,these,they,they'd,they'll,they're,they've,this,those,through,to,too,under,until,up,very,was,wasn't,we,we'd,we'll,we're,we've,were,weren't,what,what's,when,when's,where,where's,which,while,who,who's,whom,why,why's,with,won't,would,wouldn't,you,you'd,you'll,you're,you've,your,yours,yourself,yourselves"
	stopword_norwegian: "alle,andre,arbeid,av,begge,bort,bra,bruke,da,denne,der,deres,det,din,disse,du,eller,en,ene,eneste,enhver,enn,er,et,folk,for,fordi,forsøke,fra,få,før,først,gjorde,gjøre,god,gå,ha,hadde,han,hans,hennes,her,hva,hvem,hver,hvilken,hvis,hvor,hvordan,hvorfor,i,ikke,inn,innen,kan,kunne,lage,lang,lik,like,makt,mange,med,meg,meget,men,mens,mer,mest,min,mye,må,måte,navn,nei,ny,nå,når,og,også,om,opp,oss,over,part,punkt,på,rett,riktig,samme,sant,si,siden,sist,skulle,slik,slutt,som,start,stille,så,tid,til,tilbake,tilstand,under,ut,uten,var,ved,verdi,vi,vil,ville,vite,vår,være,vært,å,jeg,hun,at,blir,bli"
	t/06.vsm_xapian.t ..
	ok 1 - Return undef if there is no text parse
	1..1
	ok 1 - No text parse
	$VAR1 = 'Zciti';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zlargest';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Znorway';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zoslo';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zscandinavia';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zthird';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'and';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'capital';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'city';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'in';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'is';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'largest';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'norway';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'of';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'oslo';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'scandinavia';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'the';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'third';
	$VAR1 = '+++++++++++++++++++++++++++';
	ok 1 - Output for english text is correct
	1..1
	ok 2 - English text
	$VAR1 = 'Zdår';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zkund';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zleserbrev';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zleverandør';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zlikt';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zmisfornøyd';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zpc';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zstevn';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Zsvært';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'Ztruet';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'asus';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'dårlig';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'kunde';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'kundens';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'leserbrev';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'leverandøren';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'likte';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'med';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'misfornøyd';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'misfornøyde';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'pc';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'stevne';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'svært';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'truet';
	$VAR1 = '+++++++++++++++++++++++++++';
	$VAR1 = 'å';
	$VAR1 = '+++++++++++++++++++++++++++';
	not ok 1 - Output for norwegian text is correct
	1..1

	# Failed test 'Output for norwegian text is correct'
	# at t/06.vsm_xapian.t line 49.
	# Structures begin differing at:
	# $got->{asus} = Does not exist
	# $expected->{asus} = '1'
	# Looks like you failed 1 test of 1.
	not ok 3 - Norwegian text
	1..3

	# Failed test 'Norwegian text'
	# at t/06.vsm_xapian.t line 50.
	# Looks like you failed 1 test of 3.
	Dubious, test returned 1 (wstat 256, 0x100)
	Failed 1/3 subtests

	Test Summary Report
	-------------------
	t/06.vsm_xapian.t (Wstat: 256 Tests: 3 Failed: 1)
	Failed test: 3
	Non-zero exit status: 1
	Files=1, Tests=3, 1 wallclock secs ( 0.03 usr 0.00 sys + 0.54 cusr 0.07 csys = 0.64 CPU)
	Result: FAIL
	package Novus::Xapian::VSM;

	use strict;
	use warnings;
	use Moose;
	use Novus::Config;
	use Data::Dumper;
	use Search::Xapian (':all');
	use utf8;


	# File-scoped term generator used by text_to_vsm. It is a single variable
	# for the whole package, so every constructed object replaces the
	# generator that all objects see.
	my $indexer;

	# Moose construction hook: installs a fresh Search::Xapian::TermGenerator.
	sub BUILD {
	    my ($self) = @_;
	    $indexer = Search::Xapian::TermGenerator->new;
	}


	=head1 text_to_vsm

	Convert text to a VSM (a hash of term => frequency), depending on the language.

	=cut

	# Convert $text into a vector space model: a hash reference that maps each
	# stemmed term (Xapian's "Z"-prefixed terms, with the prefix removed) and
	# each term starting with a digit to its within-document frequency.
	#
	# $lang selects both the Xapian stemmer and the "stopword_$lang" list read
	# through read_stopword; it defaults to 'norwegian'.
	#
	# Returns undef when $text is not defined.
	sub text_to_vsm {
	    my ($self, $text, $lang) = @_;

	    # Explicit undef (not a bare return) so that callers in list context
	    # keep receiving exactly one value, as before.
	    return undef if !defined $text;

	    # Set default language to norwegian
	    $lang = 'norwegian' if !defined $lang;

	    # Retrieve the stopwords. A language without a configured list gets an
	    # empty stopper instead of an "uninitialized value" warning from split.
	    my $stopper_string = $self->read_stopword($lang);
	    my @stopwords = defined $stopper_string ? split(',', $stopper_string) : ();

	    my $stopper = Search::Xapian::SimpleStopper->new(@stopwords);
	    my $stemmer = Search::Xapian::Stem->new($lang);

	    my $doc = Search::Xapian::Document->new();
	    $indexer->set_stemmer($stemmer);
	    $indexer->set_stopper($stopper);
	    $indexer->set_document($doc);

	    # index context
	    $indexer->index_text($text);

	    # Walk the term list from its begin iterator up to its end iterator and
	    # read the current term BEFORE advancing. The earlier "$term = $terms++"
	    # loop looked at the iterator only after it had moved on, so the first
	    # term of every document was never seen.
	    my $vsm      = {};
	    my $term_end = $doc->termlist_end;
	    for (my $term = $doc->termlist_begin; $term != $term_end; ++$term) {
	        my $termname = $term->get_termname;

	        # use only number and indexed terms to gen VSM
	        if ($termname =~ m/^Z(.*)|^(\d.*)/) {
	            my $key = defined $1 ? $1 : $2;
	            $vsm->{$key} = $term->get_wdf;
	        }
	    }

	    return $vsm;
	}


	=head1 read_stopword

	Read the stopword list configured for the given language

	=cut

	# Look up the "stopword_$lang" entry in the categorizer / analyzer section
	# of the Novus configuration and return its value.
	sub read_stopword {
	    my ($self, $lang) = @_;

	    my $analyzer = Novus::Config->new->config->{categorizer}{analyzer};

	    return $analyzer->{"stopword_$lang"};
	}

	1;