Skip to content

Instantly share code, notes, and snippets.

@clojens
Created November 22, 2014 02:14
Show Gist options
  • Save clojens/8fc00c0357e06f0ec00f to your computer and use it in GitHub Desktop.
Save clojens/8fc00c0357e06f0ec00f to your computer and use it in GitHub Desktop.
#!/usr/bin/perl -w
use English; # things like MATCH
# --------------------------------------------------------------
# txt2phoNL - usage: perl txt2phoNL <dutch-text.txt >phonemes.pho
# the generated phoneme file is suitable for use with MBROLA,
# but you have to use the -e option in MBROLA to skip over
# spurious unpronounceable phoneme pairs (e.g. caused by English
# words in your Dutch text file!).
# Hint: Use pipes, e.g. "ls | txt2phoNL | mbrola -e - - | play"
# This is GPLed software (open source freeware) by
# Eric Auer <eric@coli.uni-sb.REMOVEthisIFyouAREnoSPAMMER.de>, the license is the GNU GPL
# version 2 or later, also available as copying.txt in this
# directory, http://www.coli.uni-saarland.de/~eric/stuff/soft (3/2002)
# Please give me some feedback: As I am no native speaker
# of Dutch, this txt2phoNL definitely need some improvement!
# --------------------------------------------------------------
# new version 14 feb 2002:
# - sanitize away illegal phone pairs in a last step,
# includes devoicing of consonants before a break.
# - intermediate repn uses one char per phoneme.
# - simpler rewrite mechanism eats all matched chars
# and produces only phones - so the text string is constant.
# BUT: restart from " " if the rule input ended in " " !
# - steps: 1. digit/... names
# 2. sound pattern rules (preferring long matches,
# walking the string and trying all rules per char)
# 3. sanitize and get final repn from intermediate one
# new version 2003-04-05 by Marc Spoorendonk marc@spoorendonk.com (native Dutch speaker)
# - changed to much to mention. Very acceptable translation now.
my $XLATEDEBUG = 4; # show all translation rule applications
# of at least this size
# special one char repn:
# _ is " ", Ei is 1, 9y is 3, Au is 4, ai is 5,
# oi is 6, ui is 7, Ai is 8, Oi is 9, . is EOF, ? is question
# , is comma
open(STRING,">/dev/stderr") || die "cannot open debug log\n";
# open(STRING,">nl2pho.log") || die "cannot open debug log\n";
my $foo;
$OUTPUT_AUTOFLUSH = 1; # (also known as $|): flush after every
# write/print, do not buffer output
$/ = undef; # do not split on line breaks
# $/ is $RS, record separator in use English
my $text0 = <STDIN>; # read stdin
my $text = " "; # other stage (start with a space)
my $phones = " "; # phoneme one-char-per-phoneme repn
# by the way: a "^>*" remover would be nice for mails...
# g vs G vs x: regen [reG@n] goal [goL] gage [xaZe]
# where the G (voiced "ch") is a dialect alternative to x ("ch")
# and the g only occurs in foreign words.
# e vs E vs @: gemak [x@mAk] gage [xaZe] veer [ver] pet [pEt]
# this is the len: e is long, is ee or e-at-end-of-syll.
# E is short, is default, kind of.
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# first step: reduce the alphabet by spelling out specials
# result: a plain [a-z.? ]* string
my %special = ("0","null", "=","is",
"1","een", "!","!",
"2","twee", '"',"aanhaalingsteken",
"3","drie",
"4","vier", "\$","dollar",
"5","vijf", "%","procent",
"6","zes", "&","en",
"7","zeven", "/","slesh", #phonetically
"8","acht", "(","haakje openen",
"9","negen", ")","haakje sluiten,",
"*","ster", "\\","beckslesh", #phonetically
"+","plus", "?","?",
"#","hekje", "|","paip", #phonetically
".",".", "_","underscoor", #phonetically
",",",", "-","",
">","groter", ";",";",
"<","kleiner",":",":",
"^","dakje", "@","aapestaartje",
"°","grad", "{","accolade openen",
"[","hoekje", "]","hoekje sluiten,",
"~","tilde", "}","accolade sluiten,"
);
# use this: punt. koma, vraagteken?
# or that: . , ?
# the latter has the problem that a . or , or ?
# surrounded by spaces just sounds like a space...
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#Marc> Prefix with space for easyer matching.
$text0 =~ s/^/ /g;
$text0 =~ s/$/ /g;
$text0 =~ s/^[>]*//g; # un-mailify the text :-)
$text0 =~ s{://}{dubbele punt slesh slesh}g; # http:// and similar stuff
$text0 =~ s{:-[)]}{,lachend gezicht}g; # smiley
$text0 =~ s{:[)]}{,lachend gezicht}g; # smiley
$text0 =~ s{:-[(]}{,treurig gezicht}g; # smiley
$text0 =~ s{:[(]}{,treurig gezicht}g; # smiley
$text0 =~ s{;-[)]}{,knipoogend gezicht}g; # smiley
$text0 =~ s{;[)]}{,knipoogend gezicht}g; # smiley
$text0 =~ s/cie/sie/g; # precies -> presies, provincie -> provinsie
#Marc> betaal -> betaal
#Marc> betalen -> betaalen
#Marc> It keeps metten, marren, matten as they are.
#Marc> betaling -> betaaling
# b e t a l i ng bet a a li ng
$text0 =~ s/([^eaiou][eaou][rtpsdfgklzcvbnm])([eaiou])([rtpsdfgklzcvbnm][eaiou])/$1$2$2$3/g;
#Marc> meten -> meeten maren -> maaren
# m e t e n m e e ten
$text0 =~ s/([^eaiou])([eaou])([rtpsdfgklzcvbnm][eaiou][^eaiou])/$1$2$2$3/g;
#Marc> k.n.m.i. -> k n m i
$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
$text0 =~ s/([^a-z])([a-z])\./$1$2 /g;
#Marc> remove lines from input: "-----------------------------" -> ""
$text0 =~ s/[-_=+]{3,}//g;
#Marc> www.bla.com -> www punt bla punt com
$text0 =~ s/\.([^ \n\t])/punt $1/g;
#Marc> translate some numbers. (write a function for this once)
$text0 =~ s/([^0-9])10([^0-9])/$1tien$2/g;
$text0 =~ s/([^0-9])11([^0-9])/$1elf$2/g;
$text0 =~ s/([^0-9])12([^0-9])/$1twaalf$2/g;
$text0 =~ s/([^0-9])13([^0-9])/$1dertien$2/g;
$text0 =~ s/([^0-9])14([^0-9])/$1veertien$2/g;
$text0 =~ s/([^0-9])15([^0-9])/$1vijftien$2/g;
$text0 =~ s/([^0-9])16([^0-9])/$1zestien$2/g;
$text0 =~ s/([^0-9])17([^0-9])/$1zeventien$2/g;
$text0 =~ s/([^0-9])18([^0-9])/$1achttien$2/g;
$text0 =~ s/([^0-9])19([^0-9])/$1negentien$2/g;
$text0 =~ s/([^0-9])20([^0-9])/$1twintig$2/g;
$text0 =~ s/([^0-9])21([^0-9])/$1eenentwintig$2/g;
$text0 =~ s/([^0-9])22([^0-9])/$1tweeentwintig$2/g;
$text0 =~ s/([^0-9])23([^0-9])/$1drieentwintig$2/g;
$text0 =~ s/([^0-9])24([^0-9])/$1vierentwintig$2/g;
$text0 =~ s/([^0-9])25([^0-9])/$1vijfentwintig$2/g;
$text0 =~ s/([^0-9])26([^0-9])/$1zesentwintig$2/g;
$text0 =~ s/([^0-9])27([^0-9])/$1zevenentwintig$2/g;
$text0 =~ s/([^0-9])28([^0-9])/$1achtentwintig$2/g;
$text0 =~ s/([^0-9])29([^0-9])/$1negenentwintig$2/g;
$text0 =~ s/([^0-9])30([^0-9])/$1dertig$2/g;
$text0 =~ s/([^0-9])31([^0-9])/$1eenendertig$2/g;
$text0 =~ s/([^0-9])32([^0-9])/$1tweeendertig$2/g;
$text0 =~ s/([^0-9])33([^0-9])/$1drieendertig$2/g;
$text0 =~ s/([^0-9])34([^0-9])/$1vierendertig$2/g;
$text0 =~ s/([^0-9])35([^0-9])/$1vijfendertig$2/g;
$text0 =~ s/([^0-9])36([^0-9])/$1zesendertig$2/g;
$text0 =~ s/([^0-9])37([^0-9])/$1zevenendertig$2/g;
$text0 =~ s/([^0-9])38([^0-9])/$1achtendertig$2/g;
$text0 =~ s/([^0-9])39([^0-9])/$1negenendertig$2/g;
print STDERR "Text: $text0";
for my $char (split(//,$text0)) {
$char = lc($char);
$char = "eu" if ($char =~ /öÖ/); # approximately :-)
$char = "ae" if ($char =~ /äÄ/); # could be better
$char = "uu" if ($char =~ /Üü/); # should also be for &euml;
if (defined $special{$char}) {
$text .= " " unless ($text =~ / $/);
$text .= $special{$char} . " ";
} elsif ($char =~ /[a-z]/) {
$text .= $char;
} else {
$text .= " " unless ($text =~ / $/);
} # simplify all whitespace/linebreak stretches
# and other special chars to single spaces
}
$text .= " " x 5; # end with spaces!
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# second step: apply phoneme pattern rules (prefer longest
# match, eat up all left side apart from trailing space,
# produce pure phoneme list)
my %five = ( " lijk"," l1k",
"lijk ",'l@k ',
"elijk","El1k",
" bent", " bEnt",
"atie ","atsi", #informatie
"+size+",5
);
my %four = ( "http","ha te te pe ",
"html","ha te Em El ",
"agen","axEn",
"ooie","oi@", # mooie
"ooit","oIt",
" er "," Er ",
" en "," En ",
" nl "," EnEl ",
" he "," hE ",
" ok "," oke ",
"hou ","h4w ",
"ouch","uS", # douche
"oush","uS", # kianoush
" pc "," pe se ",
"even",'ev@n',
"tie ","tsi", #vakantie
" chi"," Si", # china
"+size+",4
);
my %three = (
"aai","5" ,
"ooi","oi" ,
"oei","7",
"cee","se",
"ai ","8" ,
"oi ","9",
"age","aZe",
"ch ","x" ,
"ftp","ef te pe ",
"www","we we we ",
"htm","ha te em ",
"tp:", "te pe ",
"mp ", "Em pe ", # mp3
"mb ", "Embe ", # mp3
"eeu", "e2",
"en ",'@n',
"he ","he",
"eij","1",
#pronounciation of E before double dissonant
"ett","Et", #letter
"epp","Ep",
"ett","Et",
"err","Er",
"ekk","Ek", #lekker
"emm","Em",
"ess","Es",
"eff","Ef",
"ell","El",
"ebb","Eb",
"enn","En",
#Marc> distinct letters: k.n.m.i a.u.b.
" a ", "a",
" b ", "be",
" c ", "se",
" d ", "de",
" e ", "e",
" f ", "Ef",
" g ", "xe",
" h ", "ha",
" i ", "i",
" j ", "ie",
" k ", "ka",
" l ", "El",
" m ", "Em",
" n ", "En",
" o ", "o",
" p ", "pe",
" q ", "ky",
" r ", "Er",
" s ", "Es",
" t ", "te",
" u ", "y",
" v ", "ve",
" w ", "we",
" x ", "Iks",
" y ", "1",
" z ", "zEt",
"+size+",3
);
my %two = ("ie","i" , "oe","u" , "uu","y",
"aa","a" , "ee","e" , "oo","o",
"eu","2" , "ei","1",
"ui","3" , "ou","4" , "ij","1",
"sj","S" , "g ","x" , "nj","J",
"ce","sE",
"l ","l" , "ng","N" ,
"dt","t" , "ch","x" , "iu","ju",
"dl",'d@l', "lf",'l@f',
"bb","b" , "dd","d" , "e ",'@',
"d ","t" , "hr","r" , "hl","l",
"o ","o" , "a ", "a",
"yl","1l",
"zl","z l",
"mm", "m", # Marc> m-m is not a sound. Same for p-p and n-n.
"pp", "p",
"nn", "n",
"rr", "r",
"kk", "k",
"tt", "t",
"+size+",2
); # hr/hl/yl/zl: sane processing
# of foreign words
my %one = ("a","A", "b","b", "c","k",
"d","d", "e",'E', "f","f",
"g","x", "h","h", "i","I",
"j","j", "k","k", "l","l",
"m","m", "n","n", "o","O",
"p","p", "q","k", "r","r",
"s","s", "t","t", "u","Y",
"v","v", "w","w", "x","ks",
"y","j", "z","z", " "," ",
".",".", "?","?", ",",",",
"+size+",1
); # prosody with [ ?.,] is a later step
my @todo = (\%five, \%four, \%three, \%two, \%one);
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
$phones = " ";
my $x = 0; # string index
while ($x < (length($text)-5)) {
my $y = 0;
for my $hashref (@todo) { # do l-longest rules first...
if ($y != 0) { next; }
my $check = substr($text,$x,$hashref->{"+size+"});
if (defined $hashref->{$check}) {
$phones .= $hashref->{$check};
$x += $hashref->{"+size+"};
$x-- if (($check =~ / $/) && ($check ne " "));
# skip over matched part, but rewind on " " suffix
$y++;
print STDERR "Translate: <$check> to /"
. $hashref->{$check} . "/\n"
if (length($check) >= $XLATEDEBUG);
}
}
if ($y == 0) {
print STDERR "Had to translate first char to NIL:\n";
print STDERR "<" . substr($text,$x,10) . "...>\n";
$phones .= " ";
$x++;
}
}
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# third step: convert to SAMPA alphabet and apply constraints
# on phoneme pairings (input: $phones string)
my %xlate = ("1","Ei", "3","9y", "4","Au", "5","ai",
"6","oi", "7","ui", "8","Ai", "9","Oi",
" ","_", "?","_", ".","_", ",","_"
);
my $Pvowel = "aeiouAEIOy2Y13456789";
my $Pdipht = "56789";
my $Pvoiced = "bdcvzZGhJjg"; # adding g for convenience
my $Pconson = "ptkbdgcfvszSZxGhmn";
my $Pvoice2 = "czZGhJj";
my $Psemi = "GNJL";
my $Pspace = ".?,_ ";
# rules:
# handled above: no "EY" or "IY" (replace by eY and iY)
# handled above: no d before l (add schwa)
# handled above: common case of bb and dd (replace by b and d)
# handled above: commod case of d_ (devoice to t_)
# no voiced/semi doubled (replace by single occurance)
# no schwa before OR AFTER dipht (remove schwa)
# no voice2 before l, r or j (add schwa)
# no voiced before conson (add schwa ; duplication rule first)
# no conson before semi (add schwa)
# special case of next rule: j-E (replace by j-@)
# no dipht before or after vowel/j (insert " ", see above)
# no voiced at the end of a word (devoice)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
$text = "";
my $adder = ""; # buffer before we really add the phone!
my $freq = 200; # freq in Hz, only used at " " for now
my $dur = 100; # duration in msec
my $ph; # current phoneme
my $ph0 = " "; # previous phoneme
# the prosody and rhythm are still extremely simple
foreach $ph (split(//,$phones)) {
if (($ph =~ /[${Psemi}${Pvoiced}]/) && ($ph0 eq $ph)) {
print STDERR "${ph}-$ph: removeone $ph\n";
$adder = ""; # ignore first copy of double phoneme
} elsif (($ph0 eq "@") && ($ph =~ /[${Pdipht}]/)) {
# remove the previous schwa
# (or just insert a short "h")
$adder = "";
print STDERR "\@-$ph: remove \@\n";
} elsif (($ph eq "@") && ($ph0 =~ /[${Pdipht}]/)) {
# remove the current schwa
$ph = "";
print STDERR "${ph0}-\@: remove \@\n";
} elsif ( (($ph0 =~ /[${Pvoice2}]/) && ($ph =~ /lrj/))
|| (($ph0 =~ /[${Pvoiced}]/) &&
($ph =~ /[${Pconson}]/))
|| (($ph0 =~ /[${Pconson}]/) &&
($ph =~ /[${Psemi}]/))
) {
$adder .= "\@ 50\n";
print STDERR "${ph0}-$ph: insert schwa\n";
} elsif (($ph0 eq "j") && ($ph eq "E")) {
print STDERR "j-E: changeto j-\@\n";
$ph = "@"; # modify this part this time...
} elsif ( (($ph0 =~ /[${Pdipht}j]/) &&
($ph =~ /[${Pvowel}j]/))
|| (($ph0 =~ /[${Pvowel}j]/) &&
($ph =~ /[${Pdipht}j]/))
) {
if ($ph0 eq "j") {
$adder = "i 100\n";
print STDERR "${ph0}-$ph: changeto i-$ph\n";
}
if ($ph eq "j") {
print STDERR "${ph0}-$ph: changeto ${ph0}-i\n";
$ph = "i";
}
if (($ph0 ne "j") && ($ph ne "j")) {
$adder .= "_ 50\n";
print STDERR "${ph0}-$ph: insert break\n";
}
} elsif (($ph0 =~ /[${Pvoiced}]/) && ($ph =~ /[${Pspace}]/)) {
my $de = $ph0;
$de =~ tr/bdcvzZGhJjg/ptxfsSx_IIk/;
# ptxfsSx IIk
print STDERR "${ph0}-_: changeto ${de}-_\n";
$adder = "$de 100\n";
}
if (($ph0 eq "j") && ($ph =~ /[${Pspace}]/)) {
print STDERR "j-_: insert \@\n";
$adder .= "\@ 100\n";
}
if ($adder) {
$text .= $adder; # add possibly corrected recent phoneme
$adder =~ s{^([^ ]*).*$}
{$1}gm; # reduce to phonemes, multiline
die "<$adder> ?\n" if ($adder =~ / /);
$adder = join("-",split(/\n/,$adder)); # a\nb\n -> a-b-
print STRING "${adder}-";
}
if ($ph) {
$dur = ($ph =~ /[iuyaeo213456789rmnNJ]/) ? 200 : 100;
# longer for long vowels/rmnNJ
$freq = 200 if ($ph eq " "); # default freq
$freq = 252 if ($ph eq "?"); # go up for questions
$freq = 159 if ($ph eq "."); # go down for boundaries
$freq = 178 if ($ph eq ","); # go down a bit for commas
if ($ph =~ /[${Pspace}]/) { # various breaks
$adder = "_ 100 (50 , $freq)\n";
} else {
$adder = ( (defined $xlate{$ph}) ? $xlate{$ph} : $ph );
# use 1..2 char phone names
$adder .= " $dur\n";
}
} else {
print STDERR "Skip\n";
$adder = "";
}
$ph0 = $ph;
}
print "$text\n";
print STRING "\n";
close STRING;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment