Skip to content

Instantly share code, notes, and snippets.

@NTT123
Created June 6, 2021 03:13
Show Gist options
  • Save NTT123/9c1fb092d76acb767cfd930386eeb1ce to your computer and use it in GitHub Desktop.
Save NTT123/9c1fb092d76acb767cfd930386eeb1ce to your computer and use it in GitHub Desktop.
MFA InfoRe Tutorial
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "InfoRe MFA Tutorial.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "IPkicKwU8IWj"
},
"source": [
"# Refresh the apt package index and install gdown (Google Drive downloader).\n",
"# %pip (rather than !pip3) guarantees the install targets the running kernel's environment.\n",
"!apt update -y\n",
"%pip install gdown"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "G6Z-aDd08hfk"
},
"source": [
"%%bash\n",
"# Download the InfoRe Vietnamese speech dataset from Google Drive into\n",
"# $data_root and unzip it there; pushd/popd restore the original directory.\n",
"data_root=\"./train_data\" # modify this\n",
"pushd .\n",
"mkdir -p $data_root\n",
"cd $data_root\n",
"gdown --id 1Pe-5lKT_lZsliv2WxQDai2mjhI9ZMFlj -O infore.zip\n",
"unzip infore.zip \n",
"# return to the directory saved by pushd\n",
"popd"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "180fmSWhP_Wq",
"outputId": "3144680c-6fee-42c9-a4c5-9471942b23fa"
},
"source": [
"%%writefile install_mfa.sh\n",
"#!/bin/bash\n",
"\n",
"## a script to install Montreal Forced Aligner (MFA)\n",
"## usage: bash install_mfa.sh [install_root]\n",
"\n",
"# install root defaults to /tmp/mfa when no argument is given\n",
"root_dir=${1:-/tmp/mfa}\n",
"mkdir -p $root_dir\n",
"cd $root_dir\n",
"\n",
"# download miniconda3\n",
"wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
"bash Miniconda3-latest-Linux-x86_64.sh -b -p $root_dir/miniconda3 -f\n",
"\n",
"# create py38 env\n",
"$root_dir/miniconda3/bin/conda create -n aligner -c conda-forge openblas python=3.8 openfst pynini ngram baumwelch -y\n",
"source $root_dir/miniconda3/bin/activate aligner\n",
"\n",
"# install mfa, download kaldi\n",
"# NOTE(review): 'mfa thirdparty download' is an MFA 1.x command; confirm the\n",
"# montreal-forced-aligner version pip resolves still provides it.\n",
"pip install montreal-forced-aligner\n",
"mfa thirdparty download\n",
"\n",
"echo -e \"\\n======== DONE ==========\"\n",
"echo -e \"\\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner\"\n",
"echo -e \"\\nTo delete MFA, run: rm -rf $root_dir\"\n",
"echo -e \"\\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA\""
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"Overwriting install_mfa.sh\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "UDtqhang9IHT"
},
"source": [
"# Run the installer written above, then sanity-check the 'mfa align' CLI\n",
"# inside the freshly created 'aligner' conda environment.\n",
"!bash ./install_mfa.sh /tmp/mfa # path to install directory\n",
"!source /tmp/mfa/miniconda3/bin/activate aligner; mfa align --help"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1MGHIiU_97LC"
},
"source": [
"# Extract the word column (field 1) of the tab-separated lexicon.\n",
"# cut reads the file directly; piping through cat is redundant.\n",
"!cut -f 1 train_data/lexicon.txt > /content/words.txt"
],
"execution_count": 6,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "FfMSocEUf79K",
"outputId": "afde4f04-3e7d-4a4c-cec9-51c04caa9f48"
},
"source": [
"# paper: https://www.aclweb.org/anthology/W16-5207.pdf\n",
"# title: A non-expert Kaldi recipe for Vietnamese Speech Recognition System\n",
"\n",
"import unicodedata\n",
"\n",
"# Consonant units, longest first ('ngh' before 'ng' before 'n') so the greedy\n",
"# longest-match scan in text_to_phonemes prefers multi-letter units.\n",
"# (fixed mojibake: 'Ä‘' was the mis-encoded letter 'đ')\n",
"consonants = [\n",
"    'ngh', \n",
"    'ch', 'gh', 'gi', 'kh', 'ng', 'nh', 'ph', 'qu', 'tr', 'th', \n",
"    'b', 'c', 'd', 'đ', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x'\n",
"]\n",
"# Twelve base vowels, one row per tone (no mark, acute, grave, hook, tilde, dot).\n",
"vowels = (\n",
"    ['a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y'] +\n",
"    ['á', 'ắ', 'ấ', 'é', 'ế', 'í', 'ó', 'ố', 'ớ', 'ú', 'ứ', 'ý'] +\n",
"    ['à', 'ằ', 'ầ', 'è', 'ề', 'ì', 'ò', 'ồ', 'ờ', 'ù', 'ừ', 'ỳ'] +\n",
"    ['ả', 'ẳ', 'ẩ', 'ẻ', 'ể', 'ỉ', 'ỏ', 'ổ', 'ở', 'ủ', 'ử', 'ỷ'] +\n",
"    ['ã', 'ẵ', 'ẫ', 'ẽ', 'ễ', 'ĩ', 'õ', 'ỗ', 'ỡ', 'ũ', 'ữ', 'ỹ'] +\n",
"    ['ạ', 'ặ', 'ậ', 'ẹ', 'ệ', 'ị', 'ọ', 'ộ', 'ợ', 'ụ', 'ự', 'ỵ']\n",
")\n",
"\n",
"punctuations = ['.', '?', '\"', '\\'', ',', '-', '–', '!', ':', ';', '(', ')', '[', ']', '\\n' ]\n",
"\n",
"# Distinct letters used by the phoneme inventory (printed as a sanity check).\n",
"alphabet = sorted(set(''.join(consonants + vowels)))\n",
"print(alphabet)\n",
"phonemes = consonants + vowels\n",
"print(phonemes)\n",
"\n",
"def text_to_phonemes(text, keep_punctuation=False):\n",
"    \"\"\"Greedily split Vietnamese text into phoneme units, longest match first.\n",
"\n",
"    Args:\n",
"        text: input string; NFKC-normalized, stripped and lower-cased first.\n",
"        keep_punctuation: when True, characters in `punctuations` are kept.\n",
"\n",
"    Returns:\n",
"        List of phoneme strings; spaces are preserved as ' ' entries, and any\n",
"        other unmatched character is silently dropped.\n",
"    \"\"\"\n",
"    text = unicodedata.normalize('NFKC', text.strip().lower())\n",
"    idx = 0\n",
"    out = []\n",
"    while idx < len(text):\n",
"        # try the longest unit first: 3 letters ('ngh'), then 2, then 1\n",
"        for size in (3, 2, 1):\n",
"            if idx + size <= len(text) and text[idx:idx + size] in phonemes:\n",
"                out.append(text[idx:idx + size])\n",
"                idx += size\n",
"                break\n",
"        else:  # no phoneme matched at idx (while-condition guarantees idx < len(text))\n",
"            if keep_punctuation and text[idx] in punctuations:\n",
"                out.append(text[idx])\n",
"            if text[idx] == ' ':\n",
"                out.append(text[idx])\n",
"            idx += 1\n",
"    return out"
],
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"text": [
"['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'à', 'á', 'â', 'ã', 'è', 'é', 'ê', 'ì', 'í', 'ò', 'ó', 'ô', 'õ', 'ù', 'ú', 'ý', 'ă', 'đ', 'ĩ', 'ũ', 'ơ', 'ư', 'ạ', 'ả', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'ẹ', 'ẻ', 'ẽ', 'ế', 'ề', 'ể', 'ễ', 'ệ', 'ỉ', 'ị', 'ọ', 'ỏ', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'ụ', 'ủ', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'ỳ', 'ỵ', 'ỷ', 'ỹ']\n",
"['ngh', 'ch', 'gh', 'gi', 'kh', 'ng', 'nh', 'ph', 'qu', 'tr', 'th', 'b', 'c', 'd', 'đ', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x', 'a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y', 'á', 'ắ', 'ấ', 'é', 'ế', 'í', 'ó', 'ố', 'ớ', 'ú', 'ứ', 'ý', 'à', 'ằ', 'ầ', 'è', 'ề', 'ì', 'ò', 'ồ', 'ờ', 'ù', 'ừ', 'ỳ', 'ả', 'ẳ', 'ẩ', 'ẻ', 'ể', 'ỉ', 'ỏ', 'ổ', 'ở', 'ủ', 'ử', 'ỷ', 'ã', 'ẵ', 'ẫ', 'ẽ', 'ễ', 'ĩ', 'õ', 'ỗ', 'ỡ', 'ũ', 'ữ', 'ỹ', 'ạ', 'ặ', 'ậ', 'ẹ', 'ệ', 'ị', 'ọ', 'ộ', 'ợ', 'ụ', 'ự', 'ỵ']\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "ioh4U9iKf_Lx"
},
"source": [
"# Convert each word to its space-separated phoneme sequence.\n",
"# Context managers close the files even if text_to_phonemes raises.\n",
"with open('/content/words.txt', 'r') as f:\n",
"    lines = f.readlines()\n",
"with open('/content/phonemes.txt', 'w') as f:\n",
"    for line in lines:\n",
"        f.write(' '.join(text_to_phonemes(line)) + '\\n')"
],
"execution_count": 8,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "b3nMwfzK_g0B"
},
"source": [
"# Build the MFA pronunciation lexicon: one 'word<TAB>phonemes' line per word.\n",
"# Context managers replace the three handles the original never closed.\n",
"with open('/content/words.txt') as f:\n",
"    ws = f.readlines()\n",
"with open('/content/phonemes.txt') as f:\n",
"    ps = f.readlines()\n",
"with open('/content/lexicon.txt', 'w') as f:\n",
"    for w, p in zip(ws, ps):\n",
"        w = w.strip()\n",
"        p = p.strip()\n",
"        if w == \"q\":\n",
"            # special case: the bare letter 'q' is mapped to 'qu i'\n",
"            p = \"qu i\"\n",
"        f.write(f'{w}\\t{p}\\n')"
],
"execution_count": 9,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "TaT_fa_bAhn7"
},
"source": [
"# Collect all wav files into /content/wavs, the corpus directory passed\n",
"# to 'mfa train' below.\n",
"!mkdir -p /content/wavs\n",
"!cp /content/train_data/*.wav /content/wavs"
],
"execution_count": 10,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fQRgeq8rRuio",
"outputId": "3b26d3b2-42ab-4770-ccab-47e0b0c64921"
},
"source": [
"# Download the transcript file (scripts.csv) from Google Drive.\n",
"!gdown --id 1p4dqtkb4N9WLzggMtPzGB7WnVSOCaIFq -O scripts.csv"
],
"execution_count": 11,
"outputs": [
{
"output_type": "stream",
"text": [
"Downloading...\n",
"From: https://drive.google.com/uc?id=1p4dqtkb4N9WLzggMtPzGB7WnVSOCaIFq\n",
"To: /content/scripts.csv\n",
"\r0.00B [00:00, ?B/s]\r2.25MB [00:00, 71.7MB/s]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "w5k2kXmOBD9q"
},
"source": [
"from pathlib import Path\n",
"\n",
"# Write one <stem>.txt transcript next to each wav in /content/wavs.\n",
"# Each scripts.csv line is '<wav path>|<transcript>|<third field>';\n",
"# the third field is unused here.\n",
"with open('scripts.csv') as f:\n",
"    script_lines = f.readlines()\n",
"for line in script_lines:\n",
"    wav_path, transcript, _ = line.strip().split('|')\n",
"    stem = Path(wav_path).stem\n",
"    with open(f'/content/wavs/{stem}.txt', 'w') as f:\n",
"        f.write(transcript + '\\n')"
],
"execution_count": 12,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "uwlgRSd19lbK",
"outputId": "82691107-04bc-4773-9e13-40af8dda77a5"
},
"source": [
"# Train and align: corpus dir /content/wavs, lexicon /content/lexicon.txt,\n",
"# output TextGrids under /content/InfoRe_Tg.\n",
"# NOTE(review): both --clean and -C are passed; confirm -C is not just a\n",
"# duplicate of --clean in this MFA version.\n",
"!source /tmp/mfa/miniconda3/bin/activate aligner; mfa train --clean -C /content/wavs /content/lexicon.txt /content/InfoRe_Tg"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"All required kaldi binaries were found!\n",
"/root/Documents/MFA/wavs/train_and_align.log\n",
"INFO - Setting up corpus information...\n",
"INFO - Number of speakers in corpus: 1, average number of utterances per speaker: 14935.0\n",
"INFO - Parsing dictionary without pronunciation probabilities without silence probabilities\n",
"INFO - Creating dictionary information...\n",
"INFO - Setting up training data...\n",
"Generating base features (mfcc)...\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment