Skip to content

Instantly share code, notes, and snippets.

@osamu2001
Created June 11, 2009 15:03
Show Gist options
  • Save osamu2001/127959 to your computer and use it in GitHub Desktop.
Save osamu2001/127959 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use Encode;
# Converts a bilingual text stream on STDIN into a TMX document on STDOUT.
#
# Input format, as recognised by the loop below:
#   * a line starting with "//" followed by whitespace is Japanese text;
#   * any other non-empty line is English text;
#   * an empty line ends the current paragraph.
# A paragraph is emitted only when both languages have text.

# Text gathered so far for the paragraph currently being read.
my $ens = "";
my $jas = "";

# The "&" call form is deliberate for the header/footer subs: they are
# defined further down with an empty prototype, and a plain call placed
# before the definition triggers Perl's "called too early to check
# prototype" warning.
&print_tmx_header();
while (my $raw = <STDIN>) {
    my $line = decode("utf8", $raw);
    chomp $line;
    # With CRLF input chomp leaves a "\r" behind, which would make every
    # blank separator line look non-empty and suppress all output.
    $line =~ s/\r\z//;
    $line = HTMLSanitize($line);

    if ($line =~ m{^//[\s\t]+(.+)}) {
        $jas = $jas . " " . $1;
    }
    elsif (length($line)) {
        $ens = $ens . " " . $line;
    }
    else {
        _emit_paragraph($ens, $jas);
        $ens = "";
        $jas = "";
    }
}
# Input that does not end with a blank line still has a pending paragraph.
_emit_paragraph($ens, $jas);
&print_tmx_footer();

# _emit_paragraph($en_text, $ja_text)
#
# Prints one translation unit per aligned sentence pair of the paragraph
# (see _align_segments). Prints nothing when either side has no text.
sub _emit_paragraph {
    my ($en_text, $ja_text) = @_;
    for my $pair (_align_segments($en_text, $ja_text)) {
        print_tuv($pair->[0], $pair->[1]);
    }
    return;
}

# _align_segments($en_text, $ja_text)
#
# Splits both paragraphs into sentences and pairs them up.
# Returns a list of [english, japanese] array refs:
#   * one pair per sentence when both sides have the same sentence count;
#   * otherwise a single pair holding both whole paragraphs (sentence
#     breaks remain as newlines inside the text);
#   * an empty list when either side contains no non-whitespace text.
# Every returned string has leading and trailing whitespace removed.
sub _align_segments {
    my ($en_text, $ja_text) = @_;
    return if $en_text !~ /\S/ || $ja_text !~ /\S/;

    # Mark sentence boundaries with newlines and normalise spacing.
    $en_text =~ s/\.\s+/\.\n/g;
    $en_text =~ s/[ ]+/ /g;
    $ja_text =~ s/。\s*/。\n/g;
    $ja_text =~ s/、\s*/、/g;
    $ja_text =~ s/。\n$/。/g;

    my @en_segments = map { _trim($_) } split /\n/, $en_text;
    my @ja_segments = map { _trim($_) } split /\n/, $ja_text;

    # Compare and iterate over the sentence lists themselves. Counting
    # newline separators instead yields one less than the number of
    # sentences, which would drop the last pair.
    if (@en_segments == @ja_segments) {
        return map { [ $en_segments[$_], $ja_segments[$_] ] }
            0 .. $#en_segments;
    }
    return [ _trim($en_text), _trim($ja_text) ];
}

# _trim($text)
#
# Returns a copy of $text without leading or trailing whitespace. Needed
# because lines are accumulated with a leading " " separator.
sub _trim {
    my ($text) = @_;
    $text =~ s/\A\s+//;
    $text =~ s/\s+\z//;
    return $text;
}
# print_tuv($ens, $jas)
#
# Prints one translation unit to the currently selected output handle:
# the English opening markup, the first argument encoded with
# encode("utf8", ...), the English closing markup, and then the same
# sequence for the Japanese side using the second argument.
sub print_tuv{
    my ($source_text, $target_text) = @_;

    # One row per language: opener, segment text, closer.
    my @sides = (
        [ \&print_tuv_en_b, $source_text, \&print_tuv_en_e ],
        [ \&print_tuv_ja_b, $target_text, \&print_tuv_ja_e ],
    );

    for my $side (@sides) {
        my ($open, $text, $close) = @{$side};
        $open->();
        print encode("utf8", $text);
        $close->();
    }
}
# Prints the start of a translation unit: the <tu> tag and the English
# <tuv lang="EN-US"> tag, each on its own line, followed by an opening
# <seg> tag with no trailing newline.
sub print_tuv_en_b() {
    print join("\n", '<tu>', '<tuv lang="EN-US">', '');
    print '<seg>';
}
# Prints the closing </seg> and </tuv> tags of the English variant, each
# terminated by a newline.
sub print_tuv_en_e() {
    print join("\n", '</seg>', '</tuv>', '');
}
# Prints the Japanese <tuv lang="JA"> tag on its own line, followed by an
# opening <seg> tag with no trailing newline.
sub print_tuv_ja_b() {
    print '<tuv lang="JA">' . "\n";
    print '<seg>';
}
# Prints the closing </seg> and </tuv> tags of the Japanese variant and
# the </tu> tag that ends the translation unit, each terminated by a
# newline.
sub print_tuv_ja_e() {
    print join("\n", '</seg>', '</tuv>', '</tu>', '');
}
# Prints the tags that close the TMX document: </body> and </tmx>, each
# terminated by a newline.
sub print_tmx_footer() {
    print join("\n", '</body>', '</tmx>', '');
}
# Prints the fixed preamble of the TMX document: the XML declaration, the
# tmx11.dtd DOCTYPE, the <tmx version="1.1"> root tag, the <header>
# element with its attributes, and the opening <body> tag. Every line is
# terminated by a newline.
sub print_tmx_header() {
    my @preamble = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<!DOCTYPE tmx SYSTEM "tmx11.dtd">',
        '<tmx version="1.1">',
        '<header',
        'creationtool="OmegaT"',
        'creationtoolversion="2.0.2_1"',
        'segtype="sentence"',
        'o-tmf="OmegaT TMX"',
        'adminlang="EN-US"',
        'srclang="EN-US"',
        'datatype="plaintext"',
        '>',
        '</header>',
        '<body>',
    );
    print join('', map { $_ . "\n" } @preamble);
}
# HTMLSanitize($str)
#
# Returns a copy of $str in which the characters & < > " are replaced by
# the entities &amp; &lt; &gt; &quot;. The argument is not modified.
sub HTMLSanitize {
    my ($text) = @_;

    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );

    # A single pass never revisits the "&" of an entity it has just
    # written, so no character is escaped twice.
    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment