Skip to content

Instantly share code, notes, and snippets.

@karronoli
Last active April 20, 2017 06:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save karronoli/e1fe69baead725e3813bdddec2b27a52 to your computer and use it in GitHub Desktop.
Save karronoli/e1fe69baead725e3813bdddec2b27a52 to your computer and use it in GitHub Desktop.
Import MySQL records into an Elasticsearch index with a Japanese mapping (example).
requires 'DBI', '1.636';
requires 'DBD::mysql', '4.042';
requires 'Search::Elasticsearch', '2.03';
=head1 NAME
Import MySQL records into an Elasticsearch index with a Japanese mapping (example).
=cut
use strict;
use warnings;
use utf8;
use Data::Dumper;
use Encode;
use Unicode::Normalize;
use DBI;
use DBD::mysql;
use Search::Elasticsearch;
# Connection settings and Elasticsearch parameters used by the import below.
# All values are placeholders to be adapted to the local environment.
use constant {
    DATABASE => 'mydatabase',
    DB_HOST  => 'localhost',
    # MySQL's default TCP port is 3306; the earlier value 3006 looked like a
    # digit typo that went unnoticed.
    # NOTE(review): presumably because connections to 'localhost' use the Unix
    # socket and ignore the port -- confirm.
    DB_PORT  => 3306,
    DB_USER  => 'myuser',
    DB_PASS  => 'mypassword',
    DB_TABLE => 'mytable',

    # sprintf template; %s is replaced by DB_TABLE.
    SQL_TEMPLATE => 'select id, name, created_at from %s',

    ES_INDEX => 'myindex',

    # Part-of-speech tags handed to the kuromoji_part_of_speech token filter
    # as its stop list (conjunctions, particles, auxiliary verbs, symbols,
    # fillers, ...).
    ES_STOPTAGS => [
        "接続詞",
        "助詞",
        "助詞-格助詞",
        "助詞-格助詞-一般",
        "助詞-格助詞-引用",
        "助詞-格助詞-連語",
        "助詞-接続助詞",
        "助詞-係助詞",
        "助詞-副助詞",
        "助詞-間投助詞",
        "助詞-並立助詞",
        "助詞-終助詞",
        "助詞-副助詞/並立助詞/終助詞",
        "助詞-連体化",
        "助詞-副詞化",
        "助詞-特殊",
        "助動詞",
        "記号",
        "記号-一般",
        "記号-読点",
        "記号-句点",
        "記号-空白",
        "記号-括弧開",
        "記号-括弧閉",
        "その他-間投",
        "フィラー",
        "非言語音",
    ],
};
# Load every (id, name, created_at) row of DB_TABLE into @row before anything
# is changed on the Elasticsearch side.
#
# RaiseError makes prepare/execute/fetch failures fatal.  Without it a fetch
# error in the middle of the result set is indistinguishable from "no more
# rows" (fetchrow_array returns an empty list in both cases) and the script
# would go on to index a truncated data set.
my $dbh = DBI->connect(
    sprintf('DBI:mysql:%s:%s:%d', DATABASE, DB_HOST, DB_PORT),
    DB_USER, DB_PASS,
    {RaiseError => 1, PrintError => 0},
) or die "cannot connect to MySQL: $DBI::errstr";

my $sth = $dbh->prepare(sprintf(SQL_TEMPLATE, DB_TABLE));
$sth->execute;

# Each element of @row is an array ref: [id, name, created_at].
my @row;
while (my @tmp = $sth->fetchrow_array) {
    # The handle is opened without mysql_enable_utf8, so column values are
    # treated as raw octets here and decoded into Perl character strings.
    # NOTE(review): assumes the connection character set is UTF-8 -- confirm.
    # decode_utf8 passes undef (SQL NULL) through unchanged.
    push @row, [map { decode_utf8($_) } @tmp];
}

$sth->finish;
$dbh->disconnect;
# elasticsearch-plugin install analysis-kuromoji
# elasticsearch-plugin install org.codelibs:elasticsearch-analysis-kuromoji-neologd:5.2.1
# Field mappings for the imported documents.
#
# The mapping is registered under the DB_TABLE type because that is the type
# the bulk helper indexes documents with; registering it under a different
# type name would leave the indexed type without this explicit mapping.
#
# NOTE(review): created_at has no explicit `format`.  If the MySQL column is a
# DATETIME/TIMESTAMP ("YYYY-MM-DD HH:MM:SS") the default Elasticsearch date
# format may reject it -- confirm against the actual column type.
my %mappings = (
    # DB_TABLE() needs the parentheses: a bare word on the left of `=>` would
    # be quoted as the literal string 'DB_TABLE'.
    DB_TABLE() => {
        properties => {
            # Analysed with the custom 'japanese' analyzer from the index settings.
            name       => {type => 'text', analyzer => 'japanese'},
            created_at => {type => 'date'},
        },
    },
);
# Index-level analysis settings: a custom analyzer named 'japanese' made of
# the kuromoji tokenizer followed by a part-of-speech stop filter.
my %analysis = (
    # Token filter configured with the part-of-speech tags in ES_STOPTAGS.
    filter => {
        pos_filter => {
            type     => 'kuromoji_part_of_speech',
            stoptags => ES_STOPTAGS,
        },
    },

    # Tokenizer definition referenced by name from the analyzer below.
    tokenizer => {
        kuromoji => {
            type => 'kuromoji_tokenizer',
            mode => 'search',
        },
    },

    # The analyzer the `name` field mapping refers to.
    analyzer => {
        japanese => {
            type      => 'custom',
            tokenizer => 'kuromoji',
            filter    => ['pos_filter'],
        },
    },
);

my %settings = (index => {analysis => \%analysis});
# Recreate the target index from scratch with the mappings and analysis
# settings defined above.  Any existing index of the same name is dropped.
my $es = Search::Elasticsearch->new();

# ignore => 404: a missing index (for example on the very first run) is
# expected and not an error.  Every other failure still throws, instead of
# being downgraded to a warning and surfacing later as a confusing
# create-index error.
$es->indices->delete(index => ES_INDEX, ignore => 404);

$es->indices->create(
    index => ES_INDEX,
    body  => {
        mappings => \%mappings,
        settings => \%settings,
    },
);
# Send every fetched row to Elasticsearch through the bulk helper, using the
# MySQL id as the document id and DB_TABLE as the document type.
my $bulk = $es->bulk_helper(index => ES_INDEX, type => DB_TABLE);

for my $row (@row) {
    # Each row is [id, name, created_at], in the order of the SELECT list.
    my ($id, $name, $created_at) = @{$row};

    # NFKC folds compatibility variants (full-width/half-width forms etc.)
    # before the text is analysed.  A NULL name is left undefined so it is
    # indexed as null instead of triggering an "uninitialized value" warning
    # and being stored as an empty string.
    $name = NFKC($name) if defined $name;

    $bulk->index({
        id     => $id,
        source => {
            name       => $name,
            created_at => $created_at,
        },
    });
}

# Push whatever is still buffered in the helper.
$bulk->flush;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment