netpi/mfa-ljspeech.ipynb

## mfa-ljspeech.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "MFA LJSpeech.ipynb",
      "private_outputs": true,
      "provenance": [],
      "collapsed_sections": [],
      "machine_shape": "hm",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/NTT123/12264d15afad861cb897f7a20a01762e/mfa-ljspeech.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bhYTF6XCOYAh"
      },
      "source": [
        "### Forced align LJSpeech dataset using Montreal Forced Aligner (MFA)\n",
        "\n",
        "\n",
        "**Note**: The notebook takes 20 minutes to finish.\n",
        "\n",
        "Expected results:\n",
        "\n",
        "<img src=\"https://i.imgur.com/5uehkba.png\"></img>\n"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "_twSwprjG_M9"
      },
      "source": [
        "%%writefile install_mfa.sh\n",
        "#!/bin/bash\n",
        "\n",
        "## a script to install Montreal Forced Aligner (MFA)\n",
        "## usage: bash install_mfa.sh [root_dir]   (root_dir defaults to /tmp/mfa)\n",
        "\n",
        "# abort on the first failing step, so the DONE banner below is only\n",
        "# printed when the download and both installs actually succeeded\n",
        "set -e\n",
        "\n",
        "# quote every expansion of root_dir so a path containing spaces keeps working\n",
        "root_dir=\"${1:-/tmp/mfa}\"\n",
        "mkdir -p \"$root_dir\"\n",
        "cd \"$root_dir\"\n",
        "\n",
        "# download miniconda3 and install it into $root_dir/miniconda3\n",
        "wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
        "bash Miniconda3-latest-Linux-x86_64.sh -b -p \"$root_dir/miniconda3\" -f\n",
        "\n",
        "# install MFA from conda-forge into its own conda environment named \"aligner\"\n",
        "\"$root_dir/miniconda3/bin/conda\" create -n aligner -c conda-forge montreal-forced-aligner -y\n",
        "\n",
        "echo -e \"\\n======== DONE ==========\"\n",
        "echo -e \"\\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner\"\n",
        "echo -e \"\\nTo delete MFA, run: rm -rf $root_dir\"\n",
        "echo -e \"\\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "molbePbO8mlv"
      },
      "source": [
        "# download and install mfa\n",
        "INSTALL_DIR=\"/tmp/mfa\" # path to install directory (read again by the alignment cell)\n",
        "\n",
        "!bash ./install_mfa.sh {INSTALL_DIR}\n",
        "# smoke test: activate the aligner environment and print the usage of `mfa align`;\n",
        "# `&&` skips the mfa call when activation fails instead of running it outside the environment\n",
        "!source {INSTALL_DIR}/miniconda3/bin/activate aligner && mfa align --help"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ppEcCzZ2MZSp"
      },
      "source": [
        "# download and unpack ljs dataset\n",
        "!echo \"download and unpack ljs dataset\"\n",
        "# wget -c resumes an interrupted download and has nothing to do when the archive is\n",
        "# already complete, so re-running the cell no longer saves a second copy as\n",
        "# LJSpeech-1.1.tar.bz2.1 while tar keeps extracting the old file\n",
        "!mkdir -p ./ljs && cd ./ljs && wget -c -q --show-progress https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
        "# `&&` keeps tar from running in the current directory when ./ljs cannot be entered\n",
        "!cd ./ljs && tar xjf LJSpeech-1.1.tar.bz2"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "j6XLgf6aMbgo"
      },
      "source": [
        "# install sox tool\n",
        "!sudo apt install -q -y sox\n",
        "# convert to 16k audio clips\n",
        "# -p: do not fail when ./wav already exists, e.g. when this cell is run a second time\n",
        "!mkdir -p ./wav\n",
        "!echo \"normalize audio clips to sample rate of 16k\"\n",
        "# for every .wav below ./ljs, sox writes a copy with the same file name into ./wav,\n",
        "# using --norm=-3, a 16k sample rate (-r 16k) and a single channel (-c 1)\n",
        "!find ./ljs -name \"*.wav\" -type f -execdir sox --norm=-3 {} -r 16k -c 1 `pwd`/wav/{} \\;\n",
        "!echo \"Number of clips\" $(ls ./wav/ | wc -l)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ZbNfEn5pMdOf"
      },
      "source": [
        "# create transcript files from metadata.csv\n",
        "from tqdm.auto import tqdm\n",
        "\n",
        "# read the metadata up front; `with` closes the handle instead of leaking it, and the\n",
        "# explicit encoding keeps the result independent of the locale default\n",
        "with open('./ljs/LJSpeech-1.1/metadata.csv', 'r', encoding='utf-8') as metadata_file:\n",
        "  lines = metadata_file.readlines()\n",
        "\n",
        "for line in tqdm(lines):\n",
        "  # each row is split on '|' into exactly three fields: the clip name, a field that\n",
        "  # is not used here, and the transcript text that gets written out\n",
        "  fn, _, transcript = line.strip().split('|')\n",
        "  ident = fn\n",
        "  # one <clip name>.txt per row, stored in ./wav (the directory later passed to `mfa align`)\n",
        "  with open(f'./wav/{ident}.txt', 'w', encoding='utf-8') as transcript_file:\n",
        "    transcript_file.write(transcript)\n",
        "\n",
        "# this is an example transcript for LJ001-0001.wav\n",
        "!cat ./wav/LJ001-0001.txt"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "obtWj9_wMghH"
      },
      "source": [
        "# download a pretrained english acoustic model, and english lexicon\n",
        "# wget -c leaves an already complete file alone, so re-running the cell does not\n",
        "# pile up duplicates such as english.zip.1 next to the file the later cells read\n",
        "!wget -c -q --show-progress https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/english.zip\n",
        "# fetched over https rather than plain http\n",
        "!wget -c -q --show-progress https://www.openslr.org/resources/11/librispeech-lexicon.txt"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# see: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/480\n",
        "# rewrite the lexicon so every entry is `word<TAB>phonemes`, phonemes separated by single spaces\n",
        "import re\n",
        "\n",
        "# `with` closes the input handle once the lines are loaded\n",
        "with open(\"librispeech-lexicon.txt\", encoding=\"utf-8\") as lexicon_file:\n",
        "    lexicon = lexicon_file.readlines()\n",
        "\n",
        "# raw string: \"\\s\" in a normal string literal is an invalid escape sequence\n",
        "sp = re.compile(r\"\\s+\")\n",
        "with open(\"modified_librispeech-lexicon.txt\", \"w\", encoding=\"utf-8\") as f:\n",
        "    for line in lexicon:\n",
        "        entry = line.strip()\n",
        "        if not entry:\n",
        "            # a blank line would otherwise be written as an entry with an empty word\n",
        "            continue\n",
        "        word, *phonemes = sp.split(entry)\n",
        "        phonemes = \" \".join(phonemes)\n",
        "        f.write(f\"{word}\\t{phonemes}\\n\")"
      ],
      "metadata": {
        "id": "zf_ssMA8cbHw"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "HcZE4uxyMhXg"
      },
      "source": [
        "# FINALLY, align phonemes and speech\n",
        "# arguments after the options: corpus directory, lexicon, acoustic model, output directory;\n",
        "# `&&` starts the alignment only when the aligner environment was activated successfully\n",
        "!source {INSTALL_DIR}/miniconda3/bin/activate aligner && \\\n",
        "mfa align -t ./temp -j 4 ./wav modified_librispeech-lexicon.txt ./english.zip ./ljs_aligned\n",
        "# output files are at ./ljs_aligned\n",
        "!echo \"See output files at ./ljs_aligned\""
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [],
      "metadata": {
        "id": "0bWYNXRlLZ84"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "MFA LJSpeech.ipynb",
	"private_outputs": true,
	"provenance": [],
	"collapsed_sections": [],
	"machine_shape": "hm",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/NTT123/12264d15afad861cb897f7a20a01762e/mfa-ljspeech.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "bhYTF6XCOYAh"
	},
	"source": [
	"### Forced align LJSpeech dataset using Montreal Forced Aligner (MFA)\n",
	"\n",
	"\n",
	"Note: The notebook takes 20 minutes to finish.\n",
	"\n",
	"Expected results:\n",
	"\n",
	"<img src=\"https://i.imgur.com/5uehkba.png\"></img>\n"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "_twSwprjG_M9"
	},
	"source": [
	"%%writefile install_mfa.sh\n",
	"#!/bin/bash\n",
	"\n",
	"## a script to install Montreal Forced Aligner (MFA)\n",
	"\n",
	"root_dir=${1:-/tmp/mfa}\n",
	"mkdir -p $root_dir\n",
	"cd $root_dir\n",
	"\n",
	"# download miniconda3\n",
	"wget -q --show-progress https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh\n",
	"bash Miniconda3-latest-Linux-x86_64.sh -b -p $root_dir/miniconda3 -f\n",
	"\n",
	"#install MFA\n",
	"$root_dir/miniconda3/bin/conda create -n aligner -c conda-forge montreal-forced-aligner -y\n",
	"\n",
	"echo -e \"\\n======== DONE ==========\"\n",
	"echo -e \"\\nTo activate MFA, run: source $root_dir/miniconda3/bin/activate aligner\"\n",
	"echo -e \"\\nTo delete MFA, run: rm -rf $root_dir\"\n",
	"echo -e \"\\nSee: https://montreal-forced-aligner.readthedocs.io/en/latest/aligning.html to know how to use MFA\""
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "molbePbO8mlv"
	},
	"source": [
	"# download and install mfa\n",
	"INSTALL_DIR=\"/tmp/mfa\" # path to install directory\n",
	"\n",
	"!bash ./install_mfa.sh {INSTALL_DIR}\n",
	"!source {INSTALL_DIR}/miniconda3/bin/activate aligner; mfa align --help"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "ppEcCzZ2MZSp"
	},
	"source": [
	"# download and unpack ljs dataset\n",
	"!echo \"download and unpack ljs dataset\"\n",
	"!mkdir -p ./ljs; cd ./ljs; wget -q --show-progress https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
	"!cd ./ljs; tar xjf LJSpeech-1.1.tar.bz2"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "j6XLgf6aMbgo"
	},
	"source": [
	"# install sox tool\n",
	"!sudo apt install -q -y sox\n",
	"# convert to 16k audio clips\n",
	"!mkdir ./wav\n",
	"!echo \"normalize audio clips to sample rate of 16k\"\n",
	"!find ./ljs -name \"*.wav\" -type f -execdir sox --norm=-3 {} -r 16k -c 1 `pwd`/wav/{} \\;\n",
	"!echo \"Number of clips\" $(ls ./wav/ | wc -l)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "ZbNfEn5pMdOf"
	},
	"source": [
	"# create transcript files from metadata.csv\n",
	"lines = open('./ljs/LJSpeech-1.1/metadata.csv', 'r').readlines()\n",
	"from tqdm.auto import tqdm\n",
	"for line in tqdm(lines):\n",
	"  fn, _, transcript = line.strip().split('|')\n",
	"  ident = fn\n",
	"  open(f'./wav/{ident}.txt', 'w').write(transcript)\n",
	"\n",
	"# this is an example transcript for LJ001-0001.wav\n",
	"!cat ./wav/LJ001-0001.txt"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "obtWj9_wMghH"
	},
	"source": [
	"# download a pretrained english acoustic model, and english lexicon\n",
	"!wget -q --show-progress https://github.com/MontrealCorpusTools/mfa-models/raw/main/acoustic/english.zip\n",
	"!wget -q --show-progress http://www.openslr.org/resources/11/librispeech-lexicon.txt"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [
	"# see: https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/pull/480\n",
	"import re\n",
	"lexicon = open(\"librispeech-lexicon.txt\").readlines()\n",
	"sp = re.compile(\"\\s+\")\n",
	"with open(\"modified_librispeech-lexicon.txt\", \"w\") as f:\n",
	"    for line in lexicon:\n",
	"        word, *phonemes = sp.split(line.strip())\n",
	"        phonemes = \" \".join(phonemes)\n",
	"        f.write(f\"{word}\\t{phonemes}\\n\")"
	],
	"metadata": {
	"id": "zf_ssMA8cbHw"
	},
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "HcZE4uxyMhXg"
	},
	"source": [
	"# FINALLY, align phonemes and speech\n",
	"!source {INSTALL_DIR}/miniconda3/bin/activate aligner; \\\n",
	"mfa align -t ./temp -j 4 ./wav modified_librispeech-lexicon.txt ./english.zip ./ljs_aligned\n",
	"# output files are at ./ljs_aligned\n",
	"!echo \"See output files at ./ljs_aligned\""
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"source": [],
	"metadata": {
	"id": "0bWYNXRlLZ84"
	},
	"execution_count": null,
	"outputs": []
	}
	]
	}