Skip to content

Instantly share code, notes, and snippets.

@karpet
Created February 21, 2017 04:51
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save karpet/d8fe12085246b8419f9e4ab44930c1cc to your computer and use it in GitHub Desktop.
Save karpet/d8fe12085246b8419f9e4ab44930c1cc to your computer and use it in GitHub Desktop.
Example: Apache Lucy with a Chinese analyzer
{
    # Thin Perl wrapper around the Python "jieba" word segmenter, bridged
    # through Inline::Python.
    package Jieba;
    use v5.10;
    use strict;
    use warnings;

    # jieba_tokenize($text)
    #
    # Hands $text to the Python helper defined in the heredoc below and
    # returns its result in the caller's context. The caller in this file
    # (ChineseAnalyzer::transform_text) reads each returned element as a
    # [ word, start_offset, end_offset ] triple.
    sub jieba_tokenize {
        my ($text) = @_;
        return jieba_tokenize_python($text);
    }

    # TODO:
    #result = jieba.tokenize(u'永和服装饰品有限公司', mode='search')

    # The heredoc body is Python source, so its indentation is significant:
    # top-level statements must start in column 0 and the function body must
    # be indented beneath its "def" line.
    use Inline Python => <<'END_OF_PYTHON_CODE';
from jieba import tokenize

def jieba_tokenize_python(text):
    seg_list = tokenize(text, mode='search')
    return list(seg_list)
END_OF_PYTHON_CODE

    1;
}
# Lucy analyzer that delegates word segmentation to Jieba::jieba_tokenize.
package ChineseAnalyzer;
use v5.10;
use strict;
use warnings;
use Encode qw(decode_utf8);
use base qw( Lucy::Analysis::Analyzer );

# Constructor: defers entirely to the parent class constructor, passing no
# extra arguments.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new;
    return $self;
}

# transform($inversion)
#
# Returns the given inversion unchanged.
sub transform {
    my ( $self, $inversion ) = @_;
    return $inversion;
}

# _as_characters($text)
#
# Returns $text as a character string suitable for the tokenizer.
# A string that already contains a code point above 0xFF cannot be a
# sequence of UTF-8 octets, so it is returned untouched instead of being
# passed to decode_utf8 (which rejects or mishandles such input, depending
# on the Encode version). Every other string, including undef, is handled
# by decode_utf8 exactly as before.
sub _as_characters {
    my ($text) = @_;
    return $text if !defined $text;
    return $text if $text =~ /[^\x00-\xFF]/;
    return decode_utf8($text);
}

# transform_text($text)
#
# Tokenizes $text with Jieba and returns a new Lucy::Analysis::Inversion
# holding one Lucy::Analysis::Token per tokenizer result. Each result is
# read as [ text, start_offset, end_offset ].
sub transform_text {
    my ( $self, $text ) = @_;
    my $inversion = Lucy::Analysis::Inversion->new;
    my @tokens    = Jieba::jieba_tokenize( _as_characters($text) );

    for my $token (@tokens) {
        $inversion->append(
            Lucy::Analysis::Token->new(
                text         => $token->[0],
                start_offset => $token->[1],
                end_offset   => $token->[2],
            )
        );
    }

    return $inversion;
}

1;
#!/usr/bin/env perl
# Builds a Lucy index containing a single document whose 'body' field is
# analyzed by ChineseAnalyzer.
#
# Usage: <script> path/to/index
use strict;
use warnings;
use Lucy::Plan::Schema;
use Lucy::Plan::FullTextType;
use Lucy::Index::Indexer;
use ChineseAnalyzer;

# The index directory is the only command-line argument. The trailing
# newline keeps Perl from appending "at ... line N" to the usage hint.
my $path_to_index = shift(@ARGV)
    or die "Usage: $0 path/to/index\n";

# Create Schema.
my $schema   = Lucy::Plan::Schema->new;
my $chinese  = ChineseAnalyzer->new();
my $raw_type = Lucy::Plan::FullTextType->new( analyzer => $chinese, );
$schema->spec_field( name => 'body', type => $raw_type );

# Create an Indexer object.
my $indexer = Lucy::Index::Indexer->new(
    index  => $path_to_index,
    schema => $schema,
    create => 1,

    # truncate => 1,
);

# NOTE(review): no "use utf8" is in effect here, so this literal is a string
# of UTF-8 octets rather than characters; ChineseAnalyzer decodes the text
# before tokenizing. Confirm the stored field value is what you expect.
my $doc = { body => '全自动安装' };
$indexer->add_doc($doc);
$indexer->commit;

print "Finished.\n";
@karpet
Copy link
Author

karpet commented Feb 21, 2017

Requires the jieba Python package to be installed: https://github.com/fxsjy/jieba

@swuecho
Copy link

swuecho commented Feb 21, 2017

#!/usr/local/bin/perl

# Minimal Lucy analyzer that ignores its input and always emits two fixed
# tokens; it has no external tokenizer dependency.
package MyAnalyzer {
    # The "package NAME { ... }" block form needs Perl 5.14, so declare that
    # rather than 5.10.
    use v5.14;
    use strict;
    use warnings;
    use base qw( Lucy::Analysis::Analyzer );

    # Constructor: defers entirely to the parent class constructor.
    sub new {
        my $class = shift;
        my $self  = $class->SUPER::new;
        return $self;
    }

    # transform($inversion)
    #
    # Returns the given inversion unchanged.
    sub transform {
        my ( $self, $inversion ) = @_;
        return $inversion;
    }

    # transform_text($text)
    #
    # Returns a new Lucy::Analysis::Inversion holding the hard-coded tokens
    # 'a' (offsets 0-1) and 'b' (offsets 1-2). $text is accepted but not
    # inspected.
    sub transform_text {
        my ( $self, $text ) = @_;
        my $inversion = Lucy::Analysis::Inversion->new;

        # Each entry is [ text, start_offset, end_offset ].
        my @tokens = ( [ 'a', 0, 1 ], [ 'b', 1, 2 ] );

        for my $token (@tokens) {
            $inversion->append(
                Lucy::Analysis::Token->new(
                    text         => $token->[0],
                    start_offset => $token->[1],
                    end_offset   => $token->[2],
                )
            );
        }

        return $inversion;
    }

    1;

}

# Driver script: builds a one-document Lucy index at a fixed path using
# MyAnalyzer for the 'body' field.
package main;
use strict;
use warnings;

# NOTE(review): DBI and catfile are imported but not referenced anywhere in
# the visible script; kept in case code outside this view relies on them.
use DBI;
use File::Spec::Functions qw( catfile );

use Lucy::Plan::Schema;
use Lucy::Plan::FullTextType;
use Lucy::Index::Indexer;

# Location of the index directory (hard-coded for this example).
my $path_to_index = '/tmp/test.index';

# Create Schema.
my $schema = Lucy::Plan::Schema->new;

my $my_analyzer = MyAnalyzer->new();

my $raw_type = Lucy::Plan::FullTextType->new(
    analyzer => $my_analyzer,
);

$schema->spec_field( name => 'body', type => $raw_type );

# Debugging aid: pretty-print the schema dump.
use DDP; p $schema->dump;

# Create an Indexer object.
my $indexer = Lucy::Index::Indexer->new(
    index  => $path_to_index,
    schema => $schema,
    create => 1,

    #    truncate => 1,
);

# Index a single document and flush it to disk.
my $doc = { body => 'test' };
$indexer->add_doc($doc);

$indexer->commit;

print "Finished.\n";

@swuecho
Copy link

swuecho commented Feb 21, 2017

This version removes the jieba dependency: the analyzer returns hard-coded tokens instead of calling the tokenizer.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment