Skip to content

Instantly share code, notes, and snippets.

@syohex
Created January 16, 2011 01:48
Show Gist options
  • Save syohex/781464 to your computer and use it in GitHub Desktop.
Save syohex/781464 to your computer and use it in GitHub Desktop.
pdic2sdic.pl
#!/usr/bin/env perl
use strict;
use warnings;
use 5.010;
use utf8;
use Encode;
# Main conversion loop.
#
# Reads Shift_JIS text from STDIN one line at a time and prints UTF-8 to
# STDOUT.  Each usable line has the form "HEAD:CONTENT", optionally
# prefixed with a '■' marker.  Lines whose head contains a pair of
# whitespace characters are "typed" lines; consecutive typed lines that
# share the same leading text are emitted on one output line, joined by
# "&lf;" markers.  Every other line starts a new output line.
#
# Note that the newline is printed *before* each new record, so the
# output begins with an empty line and has no trailing newline.

# Text before the first ';' of the previous typed line, or '' when the
# previous record was not a typed line.
my $last_word = '';

LINE:
while (my $line = <STDIN>) {
    $line =~ s{\r?\n?$}{};

    my $decoded = decode('shift_jis', $line);

    # Escape markup characters.  '&' must go first so that the entities
    # produced for '<' and '>' are not escaped a second time.
    $decoded =~ s{&}{&amp;}g;
    $decoded =~ s{<}{&lt;}g;
    $decoded =~ s{>}{&gt;}g;

    # Drop the leading entry marker (anchored, so it can match only once).
    $decoded =~ s{^■}{};

    # Every pair of whitespace characters becomes an "&lf;" marker; the
    # substitution count tells us whether this is a typed line.
    my $is_typed = $decoded =~ s{\s\s}{&lf;}gxms;

    # Split into head and content before anything is printed, so that
    # unusable lines can be skipped without leaving a stray newline or
    # a bogus empty "<K></K>" record in the output.
    my ($head, $content) = $decoded =~ m{([^:]+):(.+)$}xms;
    if (!defined $head) {
        warn "pdic2sdic.pl: skipping line $. (no 'head:content' pair)\n"
            if $decoded =~ m{\S};
        next LINE;
    }

    my $need_newline = 1;
    if ($is_typed) {
        # The "&lf;" inserted above guarantees that a ';' is present.
        my ($current) = $decoded =~ m{^(.+?);};
        $need_newline = (defined $current && $current eq $last_word) ? 0 : 1;
        $last_word = $current // '';
    }
    else {
        # An untyped record ends any run of typed lines.  Without this
        # reset, a later typed line for the earlier word would be glued
        # onto this record with no newline in between.
        $last_word = '';
    }
    print "\n" if $need_newline;

    $head    =~ s{\s+$}{};
    $content =~ s{■}{&lf; ■}g;    # EIWA separator
    $content =~ s{●}{&lf; ●}g;    # WAEI separator

    print encode_utf8(make_key($head, $need_newline) . $content);
}
# make_key($head, $is_new_word)
#
# Build the markup that precedes the content of one output record.
#
#   $head         headword part of the line (text before the ':'); it may
#                 contain an "&lf;" marker separating the word from a
#                 type annotation.
#   $is_new_word  true when the caller has started a new output line.
#
# Returns one of:
#   "<K>word</K>type"        first typed line of a word
#   "&lf; type"              following typed lines of the same word
#   "<H>head</H><K>key</K>"  untyped line whose key differs from the head
#   "<K>key</K>"             untyped line whose key equals the head
#
# The key is the head lower-cased with every run of whitespace collapsed
# to a single space.  The function keeps state between calls to know
# whether the previous call emitted a typed headword.
sub make_key {
    my ($head, $is_new_word) = @_;

    # 1 while the previous call produced a typed ("&lf;") headword.
    state $is_continue = 0;
    $is_continue = 0 if $is_new_word;

    my $key = lc $head;

    # /g is required: without it only the first whitespace run is
    # normalized and later tabs or wide spaces leak into the key.
    $key =~ s{\s+}{ }g;

    if ($key =~ m{^([^&]+)&lf;(.*)}xms) {
        my ($word, $type) = ($1, $2);
        my $ret = $is_continue ? "&lf; $type" : "<K>$word</K>$type";
        $is_continue = 1;
        return $ret;
    }

    $is_continue = 0;

    # Keep the original spelling in <H> only when normalization changed it.
    my $headword = $head ne $key ? "<H>$head</H>" : '';
    return "$headword<K>$key</K>";
}
__END__
=head1 NAME

pdic2sdic.pl - Convert an Eijiro file from PDIC format to SDIC format

=head1 VERSION

EIJIRO version 127

=head1 SEE ALSO

L<http://d.hatena.ne.jp/eiel/20090111#1231681381>

=cut
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment