Created
June 6, 2021 03:13
-
-
Save NTT123/9c1fb092d76acb767cfd930386eeb1ce to your computer and use it in GitHub Desktop.
MFA InfoRe Tutorial
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"nbformat": 4, | |
"nbformat_minor": 0, | |
"metadata": { | |
"colab": { | |
"name": "InfoRe MFA Tutorial.ipynb", | |
"provenance": [], | |
"collapsed_sections": [] | |
}, | |
"kernelspec": { | |
"name": "python3", | |
"display_name": "Python 3" | |
}, | |
"language_info": { | |
"name": "python" | |
} | |
}, | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "IPkicKwU8IWj" | |
}, | |
"source": [ | |
"!apt update -y\n", | |
"!pip3 install gdown" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "G6Z-aDd08hfk" | |
}, | |
"source": [ | |
"%%bash\n", | |
"data_root=\"./train_data\" # modify this\n", | |
"pushd .\n", | |
"mkdir -p $data_root\n", | |
"cd $data_root\n", | |
"gdown --id 1Pe-5lKT_lZsliv2WxQDai2mjhI9ZMFlj -O infore.zip\n", | |
"unzip infore.zip \n", | |
"popd" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "180fmSWhP_Wq", | |
"outputId": "3144680c-6fee-42c9-a4c5-9471942b23fa" | |
}, | |
"source": [ | |
"%%writefile install_mfa.sh\n", | |
"#!/bin/bash\n", | |
"\n", | |
"## a script to install Montreal Forced Aligner (MFA)\n", | |
"\n", | |
"root_dir=${1:-/tmp/mfa}\n", | |
"mkdir -p $root_dir\n", | |
"cd $root_dir\n", | |
"\n", | |
"# download miniconda3\n", | |
"wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n", | |
"bash Miniconda3-latest-Linux-x86_64.sh -b -p $root_dir/miniconda3 -f\n", | |
"\n", | |
"# create py38 env\n", | |
"$root_dir/miniconda3/bin/conda create -n aligner -c conda-forge openblas python=3.8 openfst pynini ngram baumwelch -y\n", | |
"source $root_dir/miniconda3/bin/activate aligner\n", | |
"\n", | |
"# install mfa, download kaldi\n", | |
"pip install montreal-forced-aligner\n", | |
"mfa thirdparty download\n", | |
"\n", | |
"echo -e \"\\n======== DONE ==========\"\n", | |
"echo -e \"\\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner\"\n", | |
"echo -e \"\\nTo delete MFA, run: rm -rf $root_dir\"\n", | |
"echo -e \"\\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA\"" | |
], | |
"execution_count": 4, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Overwriting install_mfa.sh\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "UDtqhang9IHT" | |
}, | |
"source": [ | |
"!bash ./install_mfa.sh /tmp/mfa # path to install directory\n", | |
"!source /tmp/mfa/miniconda3/bin/activate aligner; mfa align --help" | |
], | |
"execution_count": null, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "1MGHIiU_97LC" | |
}, | |
"source": [ | |
"!cat train_data/lexicon.txt | cut -f 1 > /content/words.txt" | |
], | |
"execution_count": 6, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "FfMSocEUf79K", | |
"outputId": "afde4f04-3e7d-4a4c-cec9-51c04caa9f48" | |
}, | |
"source": [ | |
"# paper: https://www.aclweb.org/anthology/W16-5207.pdf\n", | |
"# title: A non-expert Kaldi recipe for Vietnamese Speech Recognition System\n", | |
"\n", | |
"consonants = [\n", | |
" 'ngh', \n", | |
" 'ch', 'gh', 'gi', 'kh', 'ng', 'nh', 'ph', 'qu', 'tr', 'th', \n", | |
" 'b', 'c', 'd', 'đ', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x'\n", | |
"]\n", | |
"vowels = (\n", | |
" ['a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y'] +\n", | |
" ['á', 'ắ', 'ấ', 'é', 'ế', 'í', 'ó', 'ố', 'ớ', 'ú', 'ứ', 'ý'] +\n", | |
" ['à', 'ằ', 'ầ', 'è', 'ề', 'ì', 'ò', 'ồ', 'ờ', 'ù', 'ừ', 'ỳ'] +\n", | |
" ['ả', 'ẳ', 'ẩ', 'ẻ', 'ể', 'ỉ', 'ỏ', 'ổ', 'ở', 'ủ', 'ử', 'ỷ'] +\n", | |
" ['ã', 'ẵ', 'ẫ', 'ẽ', 'ễ', 'ĩ', 'õ', 'ỗ', 'ỡ', 'ũ', 'ữ', 'ỹ'] +\n", | |
" ['ạ', 'ặ', 'ậ', 'ẹ', 'ệ', 'ị', 'ọ', 'ộ', 'ợ', 'ụ', 'ự', 'ỵ']\n", | |
")\n", | |
"\n", | |
"punctuations = ['.', '?', '\"', '\\'', ',', '-', '–', '!', ':', ';', '(', ')', '[', ']', '\\n' ]\n", | |
"\n", | |
"alphabet = sorted(set(''.join(consonants + vowels)))\n", | |
"print(alphabet)\n", | |
"# phonemes = sorted(consonants + vowels, key=len, reverse=True)\n", | |
"phonemes = consonants + vowels\n", | |
"print(phonemes)\n", | |
"\n", | |
"import unicodedata\n", | |
"def text_to_phonemes(text, keep_punctuation=False):\n", | |
" text = unicodedata.normalize('NFKC', text.strip().lower())\n", | |
" idx = 0\n", | |
" out = []\n", | |
" while idx < len(text):\n", | |
" # length: 3, 2, 1\n", | |
" for l in [3, 2, 1]:\n", | |
" if idx + l <= len(text) and text[idx: (idx+l)] in phonemes:\n", | |
" out.append(text[idx: (idx+l)])\n", | |
" idx = idx + l\n", | |
" break\n", | |
" else:\n", | |
" if idx < len(text):\n", | |
" if keep_punctuation and text[idx] in punctuations:\n", | |
" out.append(text[idx])\n", | |
" if text[idx] == ' ':\n", | |
" out.append(text[idx])\n", | |
" idx = idx + 1\n", | |
" return out" | |
], | |
"execution_count": 7, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"['a', 'b', 'c', 'd', 'e', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'x', 'y', 'à', 'á', 'â', 'ã', 'è', 'é', 'ê', 'ì', 'í', 'ò', 'ó', 'ô', 'õ', 'ù', 'ú', 'ý', 'ă', 'đ', 'ĩ', 'ũ', 'ơ', 'ư', 'ạ', 'ả', 'ấ', 'ầ', 'ẩ', 'ẫ', 'ậ', 'ắ', 'ằ', 'ẳ', 'ẵ', 'ặ', 'ẹ', 'ẻ', 'ẽ', 'ế', 'ề', 'ể', 'ễ', 'ệ', 'ỉ', 'ị', 'ọ', 'ỏ', 'ố', 'ồ', 'ổ', 'ỗ', 'ộ', 'ớ', 'ờ', 'ở', 'ỡ', 'ợ', 'ụ', 'ủ', 'ứ', 'ừ', 'ử', 'ữ', 'ự', 'ỳ', 'ỵ', 'ỷ', 'ỹ']\n", | |
"['ngh', 'ch', 'gh', 'gi', 'kh', 'ng', 'nh', 'ph', 'qu', 'tr', 'th', 'b', 'c', 'd', 'đ', 'g', 'h', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'x', 'a', 'ă', 'â', 'e', 'ê', 'i', 'o', 'ô', 'ơ', 'u', 'ư', 'y', 'á', 'ắ', 'ấ', 'é', 'ế', 'í', 'ó', 'ố', 'ớ', 'ú', 'ứ', 'ý', 'à', 'ằ', 'ầ', 'è', 'ề', 'ì', 'ò', 'ồ', 'ờ', 'ù', 'ừ', 'ỳ', 'ả', 'ẳ', 'ẩ', 'ẻ', 'ể', 'ỉ', 'ỏ', 'ổ', 'ở', 'ủ', 'ử', 'ỷ', 'ã', 'ẵ', 'ẫ', 'ẽ', 'ễ', 'ĩ', 'õ', 'ỗ', 'ỡ', 'ũ', 'ữ', 'ỹ', 'ạ', 'ặ', 'ậ', 'ẹ', 'ệ', 'ị', 'ọ', 'ộ', 'ợ', 'ụ', 'ự', 'ỵ']\n", | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "ioh4U9iKf_Lx" | |
}, | |
"source": [ | |
"lines = open('/content/words.txt', 'r').readlines()\n", | |
"f = open('/content/phonemes.txt', 'w')\n", | |
"for line in lines:\n", | |
" t = ' '.join(text_to_phonemes(line))\n", | |
" f.write(t + '\\n')\n", | |
"f.close()" | |
], | |
"execution_count": 8, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "b3nMwfzK_g0B" | |
}, | |
"source": [ | |
"ws = open('/content/words.txt').readlines()\n", | |
"ps = open('/content/phonemes.txt').readlines()\n", | |
"f = open('/content/lexicon.txt', 'w')\n", | |
"for w, p in zip(ws, ps):\n", | |
" w = w.strip()\n", | |
" p = p.strip()\n", | |
" if w == \"q\":\n", | |
" p = \"qu i\"\n", | |
" f.write(f'{w}\\t{p}\\n')\n", | |
"f.close()" | |
], | |
"execution_count": 9, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "TaT_fa_bAhn7" | |
}, | |
"source": [ | |
"!mkdir -p /content/wavs\n", | |
"!cp /content/train_data/*.wav /content/wavs" | |
], | |
"execution_count": 10, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "fQRgeq8rRuio", | |
"outputId": "3b26d3b2-42ab-4770-ccab-47e0b0c64921" | |
}, | |
"source": [ | |
"!gdown --id 1p4dqtkb4N9WLzggMtPzGB7WnVSOCaIFq -O scripts.csv" | |
], | |
"execution_count": 11, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"Downloading...\n", | |
"From: https://drive.google.com/uc?id=1p4dqtkb4N9WLzggMtPzGB7WnVSOCaIFq\n", | |
"To: /content/scripts.csv\n", | |
"\r0.00B [00:00, ?B/s]\r2.25MB [00:00, 71.7MB/s]\n" | |
], | |
"name": "stdout" | |
} | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"id": "w5k2kXmOBD9q" | |
}, | |
"source": [ | |
"s = open('scripts.csv').readlines()\n", | |
"from pathlib import Path\n", | |
"for l in s:\n", | |
" fn, txt, t = l.strip().split('|')\n", | |
" fn = Path(fn).stem\n", | |
" with open(f'/content/wavs/{fn}.txt', 'w') as f:\n", | |
" f.write(txt + '\\n')\n", | |
" # print(fn)" | |
], | |
"execution_count": 12, | |
"outputs": [] | |
}, | |
{ | |
"cell_type": "code", | |
"metadata": { | |
"colab": { | |
"base_uri": "https://localhost:8080/" | |
}, | |
"id": "uwlgRSd19lbK", | |
"outputId": "82691107-04bc-4773-9e13-40af8dda77a5" | |
}, | |
"source": [ | |
"!source /tmp/mfa/miniconda3/bin/activate aligner; mfa train --clean -C /content/wavs /content/lexicon.txt /content/InfoRe_Tg" | |
], | |
"execution_count": null, | |
"outputs": [ | |
{ | |
"output_type": "stream", | |
"text": [ | |
"All required kaldi binaries were found!\n", | |
"/root/Documents/MFA/wavs/train_and_align.log\n", | |
"INFO - Setting up corpus information...\n", | |
"INFO - Number of speakers in corpus: 1, average number of utterances per speaker: 14935.0\n", | |
"INFO - Parsing dictionary without pronunciation probabilities without silence probabilities\n", | |
"INFO - Creating dictionary information...\n", | |
"INFO - Setting up training data...\n", | |
"Generating base features (mfcc)...\n" | |
], | |
"name": "stdout" | |
} | |
] | |
} | |
] | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment