Skip to content

Instantly share code, notes, and snippets.

@nagadomi
Last active May 22, 2018 16:07
Show Gist options
  • Save nagadomi/2b8131ed5f50e375f306b146f8840d11 to your computer and use it in GitHub Desktop.
Save nagadomi/2b8131ed5f50e375f306b146f8840d11 to your computer and use it in GitHub Desktop.
import os
import subprocess
import re
class Sentence():
    """Ordered list of phone segments, each covering a range of frames.

    Segments are appended with :meth:`add` in the order they occur.
    Iterating a ``Sentence`` yields its :class:`Sentence.Phone` objects.
    """

    class Phone():
        """One phone label together with its inclusive frame range."""

        # Labels that is_silence() reports as silence.
        SILENCE_PHONES = ("silE", "silB", "sp", "N")
        # Labels that is_vowel() reports as vowels.
        VOWELS = ("a", "i", "u", "e", "o")
        # Timing constants used by begin_time / end_time.
        # NOTE(review): presumably seconds per frame and a fixed analysis
        # offset in seconds -- confirm against the aligner's configuration.
        FRAME_SHIFT = 0.01
        FRAME_OFFSET = 0.0125

        def __init__(self, begin_frame, end_frame, phone):
            self.begin_frame = begin_frame
            self.end_frame = end_frame
            self.phone = phone

        def is_silence(self):
            """Return True when the label is one of SILENCE_PHONES."""
            return self.phone in self.SILENCE_PHONES

        def is_vowel(self):
            """Return True when the label is one of VOWELS."""
            return self.phone in self.VOWELS

        def __str__(self):
            return "{}-{}: {}".format(self.begin_frame, self.end_frame, self.phone)

        @property
        def begin_time(self):
            """Start of the segment, rounded to 4 decimals.

            Frame 0 maps to exactly 0; every other frame is shifted by
            FRAME_OFFSET.
            """
            offset = 0
            if self.begin_frame != 0:
                offset = self.FRAME_OFFSET
            return round(offset + self.begin_frame * self.FRAME_SHIFT, 4)

        @property
        def end_time(self):
            """End of the segment, rounded to 4 decimals.

            ``end_frame`` is inclusive, hence the ``+ 1``; this makes the
            end of one segment equal the begin_time of a segment that
            starts on the next frame.
            """
            return round(self.FRAME_OFFSET + (self.end_frame + 1) * self.FRAME_SHIFT, 4)

    def __init__(self):
        self.phones = []

    def __str__(self):
        """Summarise the sentence as space-separated consonant+vowel groups.

        Silence is skipped. Non-vowel phones are accumulated and prefixed
        to the next vowel; non-vowels that are never followed by a vowel
        do not appear in the output.
        """
        groups = []
        pending = ""  # non-vowel labels waiting for their vowel
        for phone in self:
            if phone.is_silence():
                continue
            if phone.is_vowel():
                groups.append(pending + phone.phone)
                pending = ""
            else:
                pending += phone.phone
        return "{}phoneme: {}".format(len(groups), " ".join(groups))

    def __iter__(self):
        return iter(self.phones)

    def add(self, begin_frame, end_frame, phone):
        """Append a new Phone covering ``begin_frame``..``end_frame``."""
        self.phones.append(self.Phone(begin_frame, end_frame, phone))

    def dump(self):
        """Print every non-silent phone, one per line."""
        for v in self:
            if not v.is_silence():
                # The original printed an undefined name here (NameError).
                print(str(v))

    def simplify(self):
        """Return a new Sentence that contains vowel segments only.

        Every run of non-vowel phones (silence included) is absorbed into
        the vowel that follows it: the resulting segment starts at the
        first frame of the run and ends at the vowel's last frame.
        Non-vowels after the final vowel are dropped.

        NOTE(review): the original also built a silence-free, de-duplicated
        copy of the phones here but never used it; that dead code has been
        removed and the returned value is unchanged.
        """
        vowel_sentence = Sentence()
        run_begin = None  # first frame of the pending non-vowel run
        for phone in self.phones:
            if phone.is_vowel():
                begin = phone.begin_frame if run_begin is None else run_begin
                vowel_sentence.add(begin, phone.end_frame, phone.phone)
                run_begin = None
            elif run_begin is None:
                run_begin = phone.begin_frame
        return vowel_sentence
def wav2sentence(wav_file, julius_path="julius", dictation_kit_path="../dictation-kit/"):
    """Run julius on ``wav_file`` and return the aligned phones as a Sentence.

    julius is started with the dictation kit's ``main.jconf`` and the
    ``jnas-mono-16mix-gid.binhmm`` model, with ``-palign``,
    ``-fallback1pass`` and ``-input rawfile``. The wav path is sent to the
    process on stdin, and the lines printed between the
    "=== begin forced alignment ===" and "=== end forced alignment ==="
    markers are parsed.

    Args:
        wav_file: path of the wav file, written to julius's stdin.
        julius_path: julius executable to launch.
        dictation_kit_path: directory containing ``main.jconf`` and
            ``model/phone_m``.

    Returns:
        A Sentence with one phone per parsed alignment line.

    Raises:
        RuntimeError: if julius exits with a non-zero return code.
    """
    command = [
        julius_path,
        "-C", os.path.join(dictation_kit_path, "main.jconf"),
        "-h", os.path.join(dictation_kit_path, os.path.join("model", "phone_m", "jnas-mono-16mix-gid.binhmm")),
        "-palign", "-fallback1pass", "-input", "rawfile"]

    # run julius
    p = subprocess.Popen(command, stdout=subprocess.PIPE, stdin=subprocess.PIPE, stderr=subprocess.PIPE)
    # communicate() feeds stdin and drains stdout AND stderr until the
    # process exits. Reading only stdout while stderr is a pipe can
    # deadlock once the child fills the stderr pipe buffer.
    stdout_data, _ = p.communicate((wav_file + "\n").encode("UTF-8"))

    # parse
    # e.g: b'[ 143 149] -24.958252 d\n'
    # note that : is removed, e.g: "o:"->"o"
    pt = re.compile(r"\[\s*(\d+)\s+(\d+)\s*\]\s*([\d\+\-\.]+)\s*([a-zA-Z]+)")
    sentence = Sentence()
    in_alignment = False
    for line in stdout_data.splitlines():
        if b"=== begin forced alignment ===" in line:
            in_alignment = True
        if b"=== end forced alignment ===" in line:
            in_alignment = False
        if not in_alignment:
            continue
        # The pattern is pure ASCII, so undecodable bytes can be replaced
        # safely instead of aborting the whole parse.
        matched = pt.match(line.decode("utf-8", errors="replace"))
        if matched:
            begin_frame, end_frame, phone = matched.group(1), matched.group(2), matched.group(4)
            sentence.add(int(begin_frame), int(end_frame), phone)

    if p.returncode != 0:
        raise RuntimeError("julius: an error occurred. please check julius_path and dictation_kit_path.")
    return sentence
if __name__ == "__main__":
    # Manual smoke test: align one wav file, then print the phone summary,
    # the vowel-only summary, and the time span of every vowel segment.
    DICTATION_KIT_PATH = "../dictation-kit"
    JULIUS_PATH = "julius"
    INPUT_FILE = "tts.wav"

    sentence = wav2sentence(
        INPUT_FILE,
        julius_path=JULIUS_PATH,
        dictation_kit_path=DICTATION_KIT_PATH,
    )
    vowel_sentence = sentence.simplify()

    # print() applies str() itself, so the summaries are passed directly.
    print(sentence)
    print(vowel_sentence)
    for vowel in vowel_sentence:
        span = "{}-{} {}".format(vowel.begin_time, vowel.end_time, vowel.phone)
        print(span)
"""
result:
43phoneme: o ha yo mi ke da yo te i tsu mo to ko e ga chi ga u kyo wa ma i ku ro so fu to ti di e su sa ga ka wa ri ni sha be qte i ma su
43phoneme: o a o i e a o e i u o o o e a i a u o a a i u o o u o i i e u a a a a i i a e e i a u
0.0-0.1425 o
0.1425-0.3125 a
0.3125-0.6725 o
0.6725-0.9425 i
0.9425-1.0625 e
1.0625-1.2125 a
1.2125-1.3825 o
1.3825-1.6825 e
1.6825-1.9225 i
1.9225-2.1025 u
2.1025-2.2425 o
2.2425-2.4225 o
2.4225-2.5825 o
2.5825-2.7025 e
2.7025-2.8425 a
2.8425-3.0225 i
3.0225-3.1925 a
3.1925-3.3525 u
3.3525-3.7725 o
3.7725-3.9825 a
3.9825-4.3125 a
4.3125-4.4425 i
4.4425-4.5425 u
4.5425-4.6825 o
4.6825-4.8825 o
4.8825-5.0025 u
5.0025-5.1325 o
5.1325-5.4425 i
5.4425-5.6825 i
5.6825-5.8125 e
5.8125-5.9525 u
5.9525-6.1425 a
6.1425-6.4225 a
6.4225-6.5925 a
6.5925-6.7025 a
6.7025-6.8425 i
6.8425-6.9725 i
6.9725-7.2125 a
7.2125-7.4225 e
7.4225-7.6125 e
7.6125-7.6725 i
7.6725-7.8325 a
7.8325-8.1125 u
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment