Skip to content

Instantly share code, notes, and snippets.

@kingjr
Last active September 29, 2022 16:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kingjr/9b22f051fc80fd2cb3688dca102f6451 to your computer and use it in GitHub Desktop.
Save kingjr/9b22f051fc80fd2cb3688dca102f6451 to your computer and use it in GitHub Desktop.
# NOTE: grequests is kept as the very first import on purpose: it applies
# gevent monkey-patching at import time, which should happen before modules
# such as ssl (pulled in by urllib.request) are loaded.
import grequests  # pip install grequests

import re
import typing as tp
from contextlib import ExitStack
from pathlib import Path
from urllib.request import urlopen

import numpy as np
import textgrid  # pip install git+https://github.com/kylerbrown/textgrid#egg=textgrid
from tqdm import tqdm
# REST endpoint of the BAS "runMAUSBasic" forced-alignment service.
_MAUS_BASIC_URL = (
    "http://clarin.phonetik.uni-muenchen.de/BASWebServices"
    "/services/runMAUSBasic"
)
# Fields of the XML reply returned by the service.
_MAUS_SUCCESS_RE = re.compile(r"<success>(.*)</success>")
_MAUS_DOWNLOAD_LINK_RE = re.compile(r"<downloadLink>(.*)</downloadLink>")


def _maus_download_link(response):
    """Return the result URL found in a MAUS reply, or None on any failure."""
    if response is None:  # grequests.map yields None for requests that failed
        return None
    text = response.text
    success = _MAUS_SUCCESS_RE.search(text)
    if success is None or success.group(1) != "true":
        return None
    link = _MAUS_DOWNLOAD_LINK_RE.search(text)
    return None if link is None else link.group(1)


def online_MAUS_aligner(
    txt_files, wav_files, language, ext="TextGrid", save_dir=None,
    batch_size=100,
):
    """Align transcripts to audio files with the online BAS MAUS service.

    Each (transcript, audio) pair is posted to the ``runMAUSBasic`` web
    service and the resulting segmentation is downloaded next to the audio
    file (or into ``save_dir``). Pairs whose output file already exists are
    skipped, so the function can be re-run to resume an interrupted job.

    phonemes description:
        https://clarin.phonetik.uni-muenchen.de/BASWebServices
        /services/runMAUSGetInventar?LANGUAGE=nld-NL
    phoneme segmentation:
        web interface: BASWebServices/interface/clarinIFrame
        rest api: BASWebServices/services/help

    Parameters
    ----------
    txt_files, wav_files :
        Equal-length sequences of paths (str or Path); the i-th transcript
        must have the same file stem as the i-th audio file.
    language :
        Value sent as the ``LANGUAGE`` request parameter (e.g. "nld-NL").
    ext :
        Value sent as ``OUTFORMAT``; also used as the output file extension.
    save_dir :
        Output directory (str or Path). Defaults to each audio file's folder.
    batch_size :
        Number of requests sent concurrently per batch.

    Returns
    -------
    list of str
        Names of the audio files that could not be aligned or downloaded
        (empty when everything succeeded or nothing had to be done).

    Raises
    ------
    AssertionError
        If the inputs have different lengths, a file is missing, or the
        transcript and audio stems differ.
    """
    assert len(txt_files) == len(wav_files), (
        "txt_files and wav_files must have the same length"
    )

    # Collect the pairs that still need an alignment. Files are NOT opened
    # here: handles are only held while their batch is being uploaded.
    queue = []
    for txt_file, wav_file in zip(txt_files, wav_files):
        txt_file = Path(txt_file)
        wav_file = Path(wav_file)
        assert txt_file.exists(), f"missing transcript: {txt_file}"
        assert wav_file.exists(), f"missing audio file: {wav_file}"
        assert txt_file.stem == wav_file.stem, (
            f"file name mismatch: {txt_file.name} vs {wav_file.name}"
        )
        # FIXME save_dir_ should be compulsory
        save_dir_ = wav_file.parent if save_dir is None else Path(save_dir)
        # Built from the stem so that any audio extension (".WAV", ".flac")
        # maps to a distinct output file instead of onto the audio itself.
        align_file = save_dir_ / f"{wav_file.stem}.{ext}"
        if align_file.exists():
            continue
        queue.append((txt_file, wav_file, align_file))

    if not queue:
        return []
    print(f"Retrieving {len(queue)} segmentations via BASWebServices...")

    params = dict(LANGUAGE=language, OUTFORMAT=ext)
    bads = []
    with tqdm(total=len(queue)) as pbar:
        # Parallel url requests, one batch at a time
        for start in range(0, len(queue), batch_size):
            batch = queue[start:start + batch_size]
            # The upload handles are closed as soon as the batch returns.
            with ExitStack() as stack:
                pending = [
                    grequests.post(
                        _MAUS_BASIC_URL,
                        params=params,
                        files=[
                            ("SIGNAL",
                             stack.enter_context(open(wav_file, "rb"))),
                            ("TEXT",
                             stack.enter_context(open(txt_file, "r"))),
                        ],
                    )
                    for txt_file, wav_file, _ in batch
                ]
                responses = grequests.map(pending)

            for response, (_, wav_file, align_file) in zip(responses, batch):
                link = _maus_download_link(response)
                payload = None
                if link is not None:
                    # Download fully before touching the disk: a partial or
                    # empty output file would be mistaken for a finished
                    # alignment on the next run.
                    try:
                        with urlopen(link) as remote:
                            payload = remote.read()
                    except OSError:  # includes urllib.error.URLError
                        payload = None
                if payload is None:
                    print(f"error with {wav_file.name}")
                    bads.append(wav_file.name)
                else:
                    align_file.write_bytes(payload)
                # Count every processed file so the bar reaches its total.
                pbar.update(1)

    if bads:
        print(f"{len(bads)}/{len(queue)} files did not get aligned:", bads)
    return bads
def read_textgrid(fname):
    """Parse a Praat TextGrid file into word and phoneme event rows.

    Entries of the "ORT-MAU" (orthographic words), "KAN-MAU" (phonetic form
    of each word) and "MAU" (individual phonemes) tiers are read; entries
    named "" or "<p:>" are ignored.

    Parameters
    ----------
    fname :
        Path of the TextGrid file (anything accepted by ``str()``).

    Returns
    -------
    list of dict
        One dict per word (``event_type="word"``) and per phoneme
        (``event_type="phoneme"``), sorted by ``onset``. Every row has the
        keys ``event_type``, ``onset``, ``duration``, ``word_id``, ``name``,
        ``word`` and ``word_ph``; phoneme rows carry the ``word_id``,
        ``word`` and ``word_ph`` of the word they are assigned to.

    Raises
    ------
    KeyError
        If one of the three tiers has no usable entry.
    RuntimeError
        If the word tiers differ in length or their timings do not match.
    """
    # FIXME: merge with textgrid functions from MOUS schoffelen dataset
    tgrid = textgrid.read_textgrid(str(fname))
    parts: tp.Dict[str, tp.Any] = {}
    for p in tgrid:
        if p.name != "" and p.name != "<p:>":  # Remove empty entries
            parts.setdefault(p.tier, []).append(p)

    # Separate orthographics, phonetics, and phonemes
    words = parts["ORT-MAU"]
    words_ph = parts["KAN-MAU"]
    phonemes = parts["MAU"]
    if len(words) != len(words_ph):
        raise RuntimeError(
            "Orthographics and phonetics have different lengths."
        )

    # Concatenate orthographics and phonetics into one row per word
    word_rows: tp.List[tp.Dict[str, tp.Any]] = []
    for word_id, (w_r, w_ph_r) in enumerate(zip(words, words_ph)):
        if w_r.start != w_ph_r.start or w_r.stop != w_ph_r.stop:
            raise RuntimeError(f"Mismatch: {w_r} and {w_ph_r}")
        word_rows.append(
            dict(
                event_type="word",
                onset=w_r.start,
                duration=w_r.stop - w_r.start,
                word_id=word_id,
                name=w_r.name,
                word_ph=w_ph_r.name,
                word=w_r.name,
            )
        )

    # Add timing of individual phonemes
    word_onsets = np.array([row["onset"] for row in word_rows])
    last_word = len(word_rows) - 1
    # phonemes and word onsets are both ordered so this could be further
    # optimized if needs be
    phoneme_rows: tp.List[tp.Dict[str, tp.Any]] = []
    for phoneme in phonemes:
        # A phoneme belongs to the word preceding the first word that starts
        # strictly after it, or to the last word when there is none.
        later = np.flatnonzero(phoneme.start < word_onsets)
        if later.size:
            # Clamp at 0: a phoneme starting before the first word must not
            # wrap around to the end of the list through a -1 index.
            idx = max(int(later[0]) - 1, 0)
        else:
            idx = last_word
        owner = word_rows[idx]
        phoneme_rows.append(
            dict(
                event_type="phoneme",
                onset=phoneme.start,
                duration=phoneme.stop - phoneme.start,
                word_id=owner["word_id"],
                word=owner["word"],
                word_ph=owner["word_ph"],
                name=phoneme.name,
            )
        )

    # Interleave words and phonemes by onset. The sort is stable, so a word
    # row stays ahead of a phoneme sharing its onset. (Original note: not
    # sure why sorting is needed, but otherwise a sample is dropped.)
    rows = word_rows + phoneme_rows
    rows.sort(key=lambda row: row["onset"])
    return rows
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment