Skip to content

Instantly share code, notes, and snippets.

@DPenner1
Last active April 4, 2024 18:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save DPenner1/5bc495780c76cba524699320bcbdbafe to your computer and use it in GitHub Desktop.
Save DPenner1/5bc495780c76cba524699320bcbdbafe to your computer and use it in GitHub Desktop.
German Pronunciation Guesser. Used as a reference implementation on https://codegolf.stackexchange.com/q/120743/8954
using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
namespace dp1
{
/// <summary>
/// Heuristic, rule-based guesser that turns a single German word into an
/// approximate IPA transcription.
/// </summary>
/// <remarks>
/// The word is processed in a fixed sequence of "rounds". Each round fills in
/// slots of a per-letter result array and never overwrites a slot that an
/// earlier round has already decided, so the order of the rounds is significant.
/// <see cref="Guess"/> keeps the current word in instance fields, so a single
/// instance must not be used from several threads at once.
/// </remarks>
class GermanIPAGuesser
{
    // Default location of the word list read by Main when no path argument is given.
    private const string DefaultWordListPath = "/GermanWordList.txt";

    // Lower-cased word currently being transcribed; set by Guess, read by every round.
    private string target;

    // Syllable estimate for target, as computed by SyllableCount.
    private int syllables;

    /// <summary>
    /// Guesses the IPA transcription of <paramref name="target"/>.
    /// </summary>
    /// <param name="target">The word to transcribe. Case is ignored.</param>
    /// <returns>
    /// The guessed transcription. Words with more than one (estimated) syllable
    /// carry a primary-stress mark; an empty input yields an empty string.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="target"/> is null.</exception>
    public string Guess(string target)
    {
        if (target == null) throw new ArgumentNullException(nameof(target));

        // The rounds index the first and last letter unconditionally, so an
        // empty word has to be answered before any of them runs.
        if (target.Length == 0) return string.Empty;

        // Invariant lower-casing: a culture-sensitive ToLower() would, for example,
        // map 'I' to dotless 'ı' under a Turkish culture and defeat every rule below.
        this.target = target.ToLowerInvariant();
        syllables = SyllableCount(this.target);

        // Rules implemented:
        // 0. Check for specific prefixes. If one exists, process it and then do the below for the rest of the word
        // 1. Figure out what value the special ch cluster has, ç or x
        // 2. Replace letter clusters with their usual value
        // 3. Handle special endings: -er, -ig and consonant devoicing
        // 4. Handle special starts: st-, sp-
        // 5. Map e's to schwas (ə) except when they are the first vowel
        // 6. Vowels assumed long unless followed by two consonants (except h first)
        // 7. Special handling for letters h and r
        // 8. Otherwise replace each letter with its usual value
        // 9. Just assume stress is on the first syllable

        // Rule 0: split off a known prefix and transcribe prefix and root separately.
        if (syllables > 1)
        {
            foreach (var prefix in Prefixes)
            {
                if (!this.target.StartsWith(prefix, StringComparison.Ordinal)) continue;

                // Fresh guessers are used because Guess overwrites this instance's state.
                var root = this.target.Substring(prefix.Length);
                var rootGuess = new GermanIPAGuesser().Guess(root);

                // A one-syllable root gets no stress mark from its own Guess call, add it here.
                if (SyllableCount(root) == 1) rootGuess = "ˈ" + rootGuess;

                // Special processing for prefixes: vowels aren't usually long and e's are further reduced.
                var prefixGuess = new GermanIPAGuesser().Guess(prefix);
                prefixGuess = prefixGuess.Replace("ː", "").Replace('e', 'ɛ');
                if (prefixGuess.EndsWith("ɛ", StringComparison.Ordinal))
                {
                    prefixGuess = prefixGuess.Replace('ɛ', 'ə');
                }

                return prefixGuess + rootGuess;
            }
        }

        // result[i] is the IPA output for target[i]:
        // null = not decided yet, "" = letter consumed by a neighbouring entry.
        string[] result = new string[this.target.Length];

        CHRound(result);
        ClusterRound(result);
        EndRound(result);
        StartRound(result);
        SchwaRound(result);
        VowelRound(result);
        RHRound(result);
        ConsonantRound(result);
        NullRound(result);

        var retval = Collapse(string.Concat(result).ToCharArray());
        if (syllables > 1) retval = "ˈ" + retval; // assume stress on first syllable
        return retval;
    }

    /// <summary>
    /// Estimates the number of syllables in <paramref name="s"/> by counting vowel groups.
    /// </summary>
    /// <remarks>
    /// Hack: a vowel directly following a counted vowel is treated as part of the
    /// same nucleus, but a third vowel in a row starts a new syllable again. This
    /// relies on German syllable nuclei being one or two vowel letters long.
    /// </remarks>
    private int SyllableCount(string s)
    {
        int count = 0;
        bool previousStartedNucleus = false;
        foreach (char c in s)
        {
            bool startsNucleus = IsVowel(c) && !previousStartedNucleus;
            if (startsNucleus) count++;
            previousStartedNucleus = startsNucleus;
        }
        return count;
    }

    /// <summary>
    /// Rule 1: resolves each stand-alone "ch" to x (after a, o, u, au) or ç (otherwise).
    /// </summary>
    private void CHRound(string[] result)
    {
        for (int i = 0; i < target.Length - 1; i++)
        {
            if (target[i] != 'c' || target[i + 1] != 'h') continue;

            // "sch" and "chs" are separate clusters and are left to ClusterRound.
            if (i > 0 && target[i - 1] == 's') continue;
            if (i < target.Length - 2 && target[i + 2] == 's') continue;

            bool isHardCH = false;
            if (i > 0)
            {
                char previous = target[i - 1];
                if (previous == 'a' || previous == 'o')
                {
                    isHardCH = true;
                }
                else if (previous == 'u')
                {
                    // i == 1 means the word starts with "uch"; otherwise the u may
                    // belong to "äu"/"eu", after which the ch is soft.
                    isHardCH = i == 1 || (target[i - 2] != 'ä' && target[i - 2] != 'e');
                }
            }

            result[i] = isHardCH ? "x" : "ç";
            result[i + 1] = ""; // mark both letters as processed
        }
    }

    /// <summary>
    /// Rule 2: replaces known letter clusters with their usual value.
    /// </summary>
    /// <remarks>
    /// Clusters can overlap, so the order is defined as: longer clusters first,
    /// then from the start to the end of the word. A cluster is only applied when
    /// none of its letters has been processed yet.
    /// </remarks>
    private void ClusterRound(string[] result)
    {
        var sizes = ClusterMappings.Keys
            .Select(cluster => cluster.Length)
            .Distinct()
            .OrderByDescending(size => size);

        foreach (int size in sizes)
        {
            for (int i = 0; i + size <= target.Length; i++)
            {
                string replacement;
                if (!ClusterMappings.TryGetValue(target.Substring(i, size), out replacement)) continue;

                // Cluster matches, just make sure no letter of it is already processed.
                bool untouched = true;
                for (int j = 0; j < size; j++)
                {
                    if (result[i + j] != null)
                    {
                        untouched = false;
                        break;
                    }
                }
                if (!untouched) continue;

                result[i] = replacement;
                for (int j = 1; j < size; j++)
                {
                    result[i + j] = ""; // mark rest of the cluster as processed
                }
            }
        }
    }

    /// <summary>
    /// Rule 3: handles the endings -er and -ig in multi-syllable words and
    /// otherwise devoices the consonants after the last vowel.
    /// </summary>
    private void EndRound(string[] result)
    {
        int last = target.Length - 1;
        if (result[last] != null) return;

        bool polysyllabic = syllables > 1;

        if (polysyllabic && target.EndsWith("er", StringComparison.Ordinal) && result[last - 1] == null)
        {
            result[last - 1] = "";
            // Non-syllabic variant when the -er directly follows a vowel.
            bool afterVowel = target.Length > 2 && IsVowel(target[last - 2]);
            result[last] = afterVowel ? "ɐ̯" : "ɐ";
        }
        else if (polysyllabic && target.EndsWith("ig", StringComparison.Ordinal) && result[last - 1] == null)
        {
            result[last - 1] = "";
            result[last] = "ɪç";
        }
        else
        {
            // All consonants past the last vowel are checked for a special end mapping (devoicing).
            for (int i = last; i >= 0 && IsConsonant(target[i]); i--)
            {
                string devoiced;
                if (result[i] == null && EndConsonantMappings.TryGetValue(target[i], out devoiced))
                {
                    result[i] = devoiced;
                }
            }
        }
    }

    /// <summary>
    /// Rule 4: an initial s in sp-/st- becomes ʃ.
    /// </summary>
    private void StartRound(string[] result)
    {
        if (result[0] != null) return;

        if (target.StartsWith("sp", StringComparison.Ordinal) || target.StartsWith("st", StringComparison.Ordinal))
        {
            result[0] = "ʃ";
        }
    }

    /// <summary>
    /// Rule 5 (heuristic): every unprocessed e becomes a schwa unless it is the first vowel of the word.
    /// </summary>
    private void SchwaRound(string[] result)
    {
        bool firstVowelSeen = false;
        for (int i = 0; i < target.Length; i++)
        {
            if (!IsVowel(target[i])) continue;

            if (!firstVowelSeen)
            {
                firstVowelSeen = true; // the first vowel is never modified here
                continue;
            }

            if (result[i] == null && target[i] == 'e') result[i] = "ə";
        }
    }

    /// <summary>
    /// Rule 6: unprocessed vowels are short when followed by two consonants
    /// (the first of which is not h) and long otherwise.
    /// </summary>
    /// <remarks>
    /// That is the general rule of thumb, but there is no clear rule when the two
    /// consonants differ. Example given by Wikipedia: Mond is long, Hand is short.
    /// </remarks>
    private void VowelRound(string[] result)
    {
        // Seek out short vowels first.
        for (int i = 0; i + 2 < target.Length; i++)
        {
            if (result[i] != null || !IsVowel(target[i])) continue;

            bool beforeTwoConsonants = IsConsonant(target[i + 1]) && IsConsonant(target[i + 2]);
            if (beforeTwoConsonants && target[i + 1] != 'h')
            {
                result[i] = ShortVowelMappings[target[i]];
            }
        }

        // Any remaining vowels are long.
        for (int i = 0; i < target.Length; i++)
        {
            if (result[i] == null && IsVowel(target[i]))
            {
                result[i] = LongVowelMappings[target[i]];
            }
        }
    }

    /// <summary>
    /// Rule 7: h is silent after a vowel; r becomes ɐ̯ after a long vowel when it
    /// is not followed by a vowel.
    /// </summary>
    private void RHRound(string[] result)
    {
        for (int i = 1; i < target.Length; i++)
        {
            if (result[i] != null) continue;

            if (target[i] == 'h')
            {
                if (IsVowel(target[i - 1])) result[i] = "";
            }
            else if (target[i] == 'r')
            {
                // Find the closest preceding entry that is not a consumed ("") slot.
                string lastEntry = null;
                for (int j = i - 1; j >= 0; j--)
                {
                    if (result[j] != "")
                    {
                        lastEntry = result[j]; // can still be an unprocessed null
                        break;
                    }
                }

                bool afterLongVowel = lastEntry != null && lastEntry.EndsWith("ː", StringComparison.Ordinal);
                bool notBeforeVowel = i == target.Length - 1 || IsConsonant(target[i + 1]);
                if (afterLongVowel && notBeforeVowel) result[i] = "ɐ̯";
            }
        }
    }

    /// <summary>
    /// Rule 8: maps the remaining consonants that do not map to themselves.
    /// </summary>
    private void ConsonantRound(string[] result)
    {
        for (int i = 0; i < target.Length; i++)
        {
            string mapped;
            if (result[i] == null && ConsonantMappings.TryGetValue(target[i], out mapped))
            {
                result[i] = mapped;
            }
        }
    }

    /// <summary>
    /// Final fill-in: anything still unprocessed maps to itself.
    /// </summary>
    private void NullRound(string[] result)
    {
        for (int i = 0; i < target.Length; i++)
        {
            if (result[i] == null) result[i] = target[i].ToString();
        }
    }

    /// <summary>
    /// Collapses every run of identical consecutive characters to a single character.
    /// </summary>
    /// <remarks>
    /// A character is kept only when it differs from its successor. No placeholder
    /// character is used, so characters such as spaces pass through unchanged.
    /// </remarks>
    private string Collapse(char[] characters)
    {
        var kept = characters
            .Where((c, i) => i == characters.Length - 1 || c != characters[i + 1])
            .ToArray();
        return new string(kept);
    }

    private bool IsVowel(char c) => Vowels.Contains(c);

    // Note: "consonant" here means "anything that is not a vowel letter".
    private bool IsConsonant(char c) => !IsVowel(c);

    private static readonly HashSet<char> Vowels =
        new HashSet<char> { 'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü' };

    // Sometimes these prefixes can be misidentified.
    // e.g. gehen is just a word that happens to start with ge, but it's not the ge- prefix
    private static readonly List<string> Prefixes = new List<string>
    {
        "ge", "zu", "da", "be", "ver", "er", "ent"
    };

    // Just the consonants that don't map to themselves.
    private static readonly Dictionary<char, string> ConsonantMappings = new Dictionary<char, string>
    {
        {'c', "ts"}, // c doesn't really occur on its own in German
        {'g', "ɡ"},
        {'r', "ʁ"},
        {'s', "z"},
        {'v', "f"},
        {'w', "v"},
        {'x', "ks"},
        {'z', "ts"},
        {'ß', "s"},
    };

    // Values used for consonants after the last vowel (final devoicing).
    private static readonly Dictionary<char, string> EndConsonantMappings = new Dictionary<char, string>
    {
        {'b', "p"},
        {'d', "t"},
        {'g', "k"},
        {'s', "s"},
    };

    private static readonly Dictionary<string, string> ClusterMappings = new Dictionary<string, string>
    {
        // ch handled separately
        {"chs", "ks"},
        {"ck", "k"},
        {"dt", "t"},
        {"ng", "ŋ"},
        {"nk", "ŋk"},
        {"ph", "f"},
        {"qu", "kv"},
        {"sch", "ʃ"},
        {"ss", "s"},
        {"th", "t"},
        {"tz", "ts"},
        {"ie", "iː"},
        {"au", "aʊ̯"},
        {"äu", "ɔʏ̯"},
        {"eu", "ɔʏ̯"},
        {"ei", "aɪ̯"},
        {"ai", "aɪ̯"},
        {"ey", "aɪ̯"},
        {"ay", "aɪ̯"},
        {"aa", "aː"},
        {"ee", "eː"},
        {"oo", "oː"},
    };

    private static readonly Dictionary<char, string> ShortVowelMappings = new Dictionary<char, string>
    {
        {'a', "a"},
        {'ä', "ɛ"},
        {'e', "ɛ"},
        {'i', "ɪ"},
        {'o', "ɔ"},
        {'ö', "œ"},
        {'u', "ʊ"},
        {'ü', "ʏ"},
        {'y', "ʏ"},
    };

    private static readonly Dictionary<char, string> LongVowelMappings = new Dictionary<char, string>
    {
        {'a', "aː"},
        {'ä', "ɛː"},
        {'e', "eː"},
        {'i', "iː"},
        {'o', "oː"},
        {'ö', "øː"},
        {'u', "uː"},
        {'ü', "yː"},
        {'y', "yː"},
    };

    /// <summary>
    /// Reads a "word, pronunciation" list, guesses every word and prints a
    /// per-word comparison followed by the number of exact matches.
    /// </summary>
    /// <param name="args">
    /// Optional: the first argument is the path of the word list. Without it
    /// the default path "/GermanWordList.txt" is used.
    /// </param>
    public static void Main(string[] args)
    {
        string path = args != null && args.Length > 0 ? args[0] : DefaultWordListPath;
        var data = new Dictionary<string, string>();

        // Read word/IPA pairs into data.
        using (var reader = new StreamReader(path))
        {
            reader.ReadLine(); // skip header

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (string.IsNullOrWhiteSpace(line)) continue;

                var parts = line.Split(',');
                if (parts.Length < 2)
                {
                    // A row without a comma has no pronunciation column; skip it instead of crashing.
                    Console.Error.WriteLine($"Skipping malformed line: {line}");
                    continue;
                }

                string word = parts[0].Trim();
                if (data.ContainsKey(word))
                {
                    // Keep the first pronunciation; a repeated word must not abort the whole run.
                    Console.Error.WriteLine($"Skipping duplicate word: {word}");
                    continue;
                }

                data.Add(word, parts[1].Trim());
            }
        }

        var guesser = new GermanIPAGuesser();
        int count = 0;
        Console.WriteLine("Word, Actual, Guess, Correct?");
        foreach (var kvp in data)
        {
            var guess = guesser.Guess(kvp.Key);
            var correct = guess.Equals(kvp.Value, StringComparison.Ordinal);
            if (correct) count++;
            Console.WriteLine($"{kvp.Key}, {kvp.Value}, {guess}, {correct}");
        }
        Console.WriteLine();
        Console.WriteLine($"Guessed {count}/{data.Count} words correct");
    }
}
}
Word, Pronunciation
ich, ɪç
ist, ɪst
sein, zaɪ̯n
nicht, nɪçt
sie, ziː
du, duː
das, das
die, diː
es, ɛs
und, ʊnt
der, deːɐ̯
zu, tsuː
ein, aɪ̯n
in, ɪn
wir, viːɐ̯
mir, miːɐ̯
mit, mɪt
was, vas
den, deːn
mich, mɪç
auf, aʊ̯f
dass, das
er, eːɐ̯
eine, ˈaɪ̯nə
hat, hat
so, zoː
sind, zɪnt
von, fɔn
dich, dɪç
war, vaːɐ̯
haben, ˈhaːbən
für, fyːɐ̯
ja, jaː
hier, hiːɐ̯
an, an
habe, ˈhaːbə
bin, bɪn
wie, viː
noch, nɔx
dir, diːɐ̯
uns, ʊns
sich, zɪç
nur, nuːɐ̯
einen, ˈaɪ̯nən
nein, naɪ̯n
dem, deːm
ihn, iːn
auch, aʊ̯x
hast, hast
ihr, iːɐ̯
da, daː
aus, aʊ̯s
kann, kan
aber, ˈaːbɐ
schon, ʃoːn
wenn, vɛn
wird, vɪʁt
um, ʊm
als, als
bist, bɪst
im, ɪm
mal, maːl
doch, dɔx
gut, ɡuːt
meine, ˈmaɪ̯nə
jetzt, jɛtst
weiß, vaɪ̯s
werden, ˈveːɐ̯dən
nach, naːx
oder, ˈoːdɐ
dann, dan
will, vɪl
mein, maɪ̯n
mehr, meːɐ̯
keine, ˈkaɪ̯nə
etwas, ˈɛtvas
alles, ˈaləs
muss, mʊs
immer, ˈɪmɐ
nichts, nɪçts
man, man
wieder, ˈviːdɐ
bei, baɪ̯
hab, haːp
machen, ˈmaxən
vor, foːɐ̯
Mann, man
ihm, iːm
einem, ˈaɪ̯nəm
tun, tuːn
zum, tsʊm
können, ˈkœnən
sagen, ˈzaːɡən
werde, ˈveːɐ̯də
denn, dɛn
einer, ˈaɪ̯nɐ
warum, vaˈʁʊm
gehen, ˈɡeːən
sehen, ˈzeːən
sehr, zeːɐ̯
geht, ɡeːt
alle, ˈalə
über, ˈyːbɐ
müssen, ˈmʏsən
diese, ˈdiːzə
einfach, ˈaɪ̯nfax
euch, ɔʏ̯ç
des, dɛs
nie, niː
also, ˈalzoː
wo, voː
los, loːs
Zeit, tsaɪ̯t
ihnen, ˈiːnən
gibt, ɡiːpt
wirklich, ˈvɪʁklɪç
danke, ˈdaŋkə
deine, ˈdaɪ̯nə
würde, ˈvʏʁdə
soll, zɔl
hatte, ˈhatə
wissen, ˈvɪsən
bitte, ˈbɪtə
viel, fiːl
gesagt, ɡəˈzaːkt
komm, kɔm
wer, veːɐ̯
zurück, tsuˈʁʏk
wurde, ˈvʊʁdə
wollte, ˈvɔltə
Frau, fʁaʊ̯
leben, ˈleːbən
wäre, ˈvɛːʁə
damit, daˈmɪt
Gott, ɡɔt
Leute, ˈlɔʏ̯tə
kannst, kanst
heute, ˈhɔʏ̯tə
meinen, ˈmaɪ̯nən
dieser, ˈdiːzɐ
dein, daɪ̯n
kommt, kɔmt
kommen, ˈkɔmən
willst, vɪlst
na, naː
wollen, ˈvɔlən
sicher, ˈzɪçɐ
ganz, ɡants
zur, tsuːɐ̯
hätte, ˈhɛtə
weil, vaɪ̯l
tut, tuːt
hallo, ˈhaloː
okay, oˈkeː
macht, maxt
waren, ˈvaːʁən
bis, bɪs
vielleicht, fiˈlaɪ̯çt
könnte, ˈkœntə
weg, vɛk
seine, ˈzaɪ̯nə
lassen, ˈlasən
Vater, ˈfaːtɐ
gesehen, ɡəˈzeːən
nun, nuːn
gerade, ɡəˈʁaːdə
glaube, ˈɡlaʊ̯bə
Tag, taːk
ab, ap
Geld, ɡɛlt
gemacht, ɡəˈmaxt
sollte, ˈzɔltə
sagte, ˈzaːktə
keinen, ˈkaɪ̯nən
durch, dʊʁç
zwei, tsvaɪ̯
diesen, ˈdiːzən
passiert, paˈsiːɐ̯t
wohl, voːl
ihre, ˈiːʁə
Mutter, ˈmʊtɐ
reden, ˈʁeːdən
anderen, ˈandəʁən
dachte, ˈdaxtə
möchte, ˈmœçtə
wirst, vɪʁst
gehört, ɡəˈhøːɐ̯t
weißt, vaɪ̯st
raus, ʁaʊ̯s
paar, paːɐ̯
besser, ˈbɛsɐ
ob, ɔp
her, heːɐ̯
musst, mʊst
Ordnung, ˈɔʁdnʊŋ
meiner, ˈmaɪ̯nɐ
klar, klaːɐ̯
diesem, ˈdiːzəm
meinem, ˈmaɪ̯nəm
lange, ˈlaŋə
lass, las
dieses, ˈdiːzəs
helfen, ˈhɛlfən
unsere, ˈʊnzəʁə
selbst, zɛlpst
finden, ˈfɪndən
jemand, ˈjeːmant
denke, ˈdɛŋkə
ach, ax
Nacht, naxt
genau, ɡəˈnaʊ̯
gar, ɡaːɐ̯
sagt, zaːkt
Welt, vɛlt
Mädchen, ˈmɛːtçən
ins, ɪns
hin, hɪn
vom, fɔm
schön, ʃøːn
weiter, ˈvaɪ̯tɐ
leid, laɪ̯t
gleich, ɡlaɪ̯ç
ohne, ˈoːnə
Menschen, ˈmɛnʃən
deinen, ˈdaɪ̯nən
wieso, viˈzoː
dort, dɔʁt
Hause, ˈhaʊ̯zə
sollten, ˈzɔltən
davon, daˈfɔn
zusammen, tsuˈzamən
geben, ˈɡeːbən
hör, høːɐ̯
machst, maxst
Freund, fʁɔʏ̯nt
richtig, ˈʁɪçtɪç
Angst, aŋst
viele, ˈfiːlə
sieht, ziːt
Haus, haʊ̯s
tot, toːt
sollen, ˈzɔlən
ganze, ˈɡantsə
andere, ˈandəʁə
getan, ɡəˈtaːn
rein, ʁaɪ̯n
Moment, moˈmɛnt
bleiben, ˈblaɪ̯bən
brauchen, ˈbʁaʊ̯xən
Kinder, ˈkɪndɐ
unter, ˈʊntɐ
Arbeit, ˈaʁbaɪ̯t
Problem, pʁoˈbleːm
dafür, daˈfyːɐ̯
Herr, hɛʁ
wegen, ˈveːɡən
liebe, ˈliːbə
genug, ɡəˈnuːk
dabei, daˈbaɪ̯
bringen, ˈbʁɪŋən
gegen, ˈɡeːɡən
sag, zaːk
schnell, ʃnɛl
eines, ˈaɪ̯nəs
brauche, ˈbʁaʊ̯xə
Abend, ˈaːbənt
hatten, ˈhatən
Jahre, ˈjaːʁə
mach, max
Bruder, ˈbʁuːdɐ
Sache, ˈzaxə
hören, ˈhøːʁən
Sohn, zoːn
seit, zaɪ̯t
Morgen, ˈmɔʁɡən
drei, dʁaɪ̯
steht, ʃteːt
Scheiße, ˈʃaɪ̯sə
mag, maːk
sei, zaɪ̯
unser, ˈʊnzɐ
konnte, ˈkɔntə
habt, haːpt
einmal, ˈaɪ̯nmaːl
heißt, haɪ̯st
Kopf, kɔpf
Hilfe, ˈhɪlfə
Familie, faˈmiːliə
geh, ɡeː
Baby, ˈbeːbi
erst, eːɐ̯st
fertig, ˈfɛʁtɪç
gefunden, ɡəˈfʊndən
seid, zaɪ̯t
kam, kaːm
nehmen, ˈneːmən
sprechen, ˈʃpʁɛçən
sofort, zoˈfɔʁt
Jahren, ˈjaːʁən
Kind, kɪnt
seinen, ˈzaɪ̯nən
daran, daˈʁan
Junge, ˈjʊŋə
Minuten, miˈnuːtən
bevor, bəˈfoːɐ̯
Stadt, ʃtat
beim, baɪ̯m
warst, vaːɐ̯st
Recht, ʁɛçt
warte, ˈvaʁtə
darüber, daˈʁyːbɐ
deiner, ˈdaɪ̯nɐ
deinem, ˈdaɪ̯nəm
Ende, ˈɛndə
Dinge, ˈdɪŋə
Namen, ˈnaːmən
wahr, vaːɐ̯
allein, aˈlaɪ̯n
natürlich, naˈtyːɐ̯lɪç
töten, ˈtøːtən
bekommen, bəˈkɔmən
Männer, ˈmɛnɐ
vergessen, fɛɐ̯ˈɡɛsən
dazu, daˈtsuː
gab, ɡaːp
später, ˈʃpɛːtɐ
wusste, ˈvʊstə
Augen, ˈaʊ̯ɡən
Dank, daŋk
Freunde, ˈfʁɔʏ̯ndə
Jungs, jʊŋs
halten, ˈhaltən
warten, ˈvaʁtən
Glück, ɡlʏk
beide, ˈbaɪ̯də
gute, ˈɡuːtə
kennen, ˈkɛnən
Auto, ˈaʊ̯toː
sage, ˈzaːɡə
gehe, ˈɡeːə
komme, ˈkɔmə
mache, ˈmaxə
Uhr, uːɐ̯
sehe, ˈzeːə
Teufel, ˈtɔʏ̯fəl
sonst, zɔnst
Art, aːɐ̯t
gern, ɡɛʁn
Liebe, ˈliːbə
jeden, ˈjeːdən
könnten, ˈkœntən
ihren, ˈiːʁən
Halt, halt
Fall, fal
Film, fɪlm
bisschen, ˈbɪsçən
eigentlich, ˈaɪ̯ɡəntlɪç
weit, vaɪ̯t
gib, ɡiːp
vorbei, foːɐ̯ˈbaɪ̯
verstehe, fɛɐ̯ˈʃteːə
drin, dʁɪn
Name, ˈnaːmə
ganzen, ˈɡantsən
Musik, muˈziːk
würden, ˈvʏʁdən
Tür, tyːɐ̯
verrückt, fɛɐ̯ˈʁʏkt
solltest, ˈzɔltəst
denken, ˈdɛŋkən
dran, dʁan
sieh, ziː
lieber, ˈliːbɐ
guten, ˈɡuːtən
darauf, daˈʁaʊ̯f
stimmt, ʃtɪmt
Kerl, kɛʁl
letzte, ˈlɛtstə
Job, dʒɔp
verloren, fɛɐ̯ˈloːʁən
wurden, ˈvʊʁdən
kenne, ˈkɛnə
Ahnung, ˈaːnʊŋ
seiner, ˈzaɪ̯nɐ
Grund, ɡʁʊnt
toll, tɔl
verdammt, fɛɐ̯ˈdamt
bald, balt
bereit, bəˈʁaɪ̯t
all, al
jemanden, ˈjeːmandən
hinter, ˈhɪntɐ
Hand, hant
darf, daʁf
Frauen, ˈfʁaʊ̯ən
Idee, iˈdeː
runter, ˈʁʊntɐ
draußen, ˈdʁaʊ̯sən
lang, laŋ
Land, lant
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment