erogol/tts_example.ipynb

## tts_example.ipynb
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "TTS_example.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/erogol/97516ad65b44dbddb8cd694953187c5b/tts_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "cjD0xW0cEMVT"
      },
      "source": [
        "# Hands-on example for 🐸 [Coqui TTS](https://github.com/coqui-ai/TTS)\n",
        "\n",
        "This notebook trains a Tacotron2 model with Dynamic Convolution Attention (DCA) on the LJSpeech dataset."
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Download LJSpeech"
      ],
      "metadata": {
        "id": "QPA2gbqRi9Wx"
      }
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "XGiNTMShZYvj"
      },
      "source": [
        "# download the LJSpeech dataset; -nc skips the download when the archive is already present\n",
        "!wget -nc http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
        "# decompress\n",
        "!tar -xjf LJSpeech-1.1.tar.bz2"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "__k0BrbfLQ-F"
      },
      "source": [
        "# create train-val splits\n",
        "!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv\n",
        "!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv\n",
        "!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Setup environment"
      ],
      "metadata": {
        "id": "ocmh66BqjLCF"
      }
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "pyJwcU9pDUE-"
      },
      "source": [
        "# %pip installs into the environment of the running kernel\n",
        "%pip install TTS"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "zV-vHTWyirQv"
      },
      "source": [
        "# install the espeak-ng backend, needed to train on phonemes instead of raw characters\n",
        "# (-y answers the confirmation prompt, which cannot be answered interactively from a notebook)\n",
        "!sudo apt-get install -y espeak-ng"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "## Train Tacotron DCA"
      ],
      "metadata": {
        "id": "2Af-yiyFjU-f"
      }
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "y7_Xao7uNOvX"
      },
      "source": [
        "import os\n",
        "\n",
        "from trainer import Trainer, TrainerArgs\n",
        "\n",
        "from TTS.config.shared_configs import BaseAudioConfig\n",
        "from TTS.tts.configs.shared_configs import BaseDatasetConfig\n",
        "from TTS.tts.configs.tacotron2_config import Tacotron2Config\n",
        "from TTS.tts.datasets import load_tts_samples\n",
        "from TTS.tts.models.tacotron2 import Tacotron2\n",
        "from TTS.tts.utils.text.tokenizer import TTSTokenizer\n",
        "from TTS.utils.audio import AudioProcessor\n",
        "\n",
        "# Directory used for the training output and the phoneme cache.\n",
        "output_path = \"./\"\n",
        "\n",
        "# init configs\n",
        "# Use the train/val metadata files written by the split cell (`metadata_train.csv` / `metadata_val.csv`).\n",
        "# The archive was extracted into the working directory, so the dataset path is built relative to it.\n",
        "dataset_config = BaseDatasetConfig(\n",
        "    name=\"ljspeech\",\n",
        "    meta_file_train=\"metadata_train.csv\",\n",
        "    meta_file_val=\"metadata_val.csv\",\n",
        "    path=os.path.join(output_path, \"LJSpeech-1.1\"),\n",
        ")\n",
        "\n",
        "audio_config = BaseAudioConfig(\n",
        "    sample_rate=22050,\n",
        "    do_trim_silence=True,\n",
        "    trim_db=60.0,\n",
        "    signal_norm=False,\n",
        "    mel_fmin=0.0,\n",
        "    mel_fmax=8000,\n",
        "    spec_gain=1.0,\n",
        "    log_func=\"np.log\",\n",
        "    ref_level_db=20,\n",
        "    preemphasis=0.0,\n",
        ")\n",
        "\n",
        "config = Tacotron2Config(  # This is the config that is saved for the future use\n",
        "    audio=audio_config,\n",
        "    batch_size=64,\n",
        "    eval_batch_size=16,\n",
        "    num_loader_workers=4,\n",
        "    num_eval_loader_workers=4,\n",
        "    run_eval=True,\n",
        "    test_delay_epochs=-1,\n",
        "    ga_alpha=0.0,\n",
        "    decoder_loss_alpha=0.25,\n",
        "    postnet_loss_alpha=0.25,\n",
        "    postnet_diff_spec_alpha=0,\n",
        "    decoder_diff_spec_alpha=0,\n",
        "    decoder_ssim_alpha=0,\n",
        "    postnet_ssim_alpha=0,\n",
        "    r=2,\n",
        "    attention_type=\"dynamic_convolution\",\n",
        "    double_decoder_consistency=False,\n",
        "    epochs=1000,\n",
        "    text_cleaner=\"phoneme_cleaners\",\n",
        "    use_phonemes=True,\n",
        "    phoneme_language=\"en-us\",\n",
        "    phoneme_cache_path=os.path.join(output_path, \"phoneme_cache\"),\n",
        "    print_step=25,\n",
        "    print_eval=True,\n",
        "    mixed_precision=False,\n",
        "    output_path=output_path,\n",
        "    datasets=[dataset_config],\n",
        ")\n",
        "\n",
        "# INITIALIZE THE AUDIO PROCESSOR\n",
        "# Audio processor is used for feature extraction and audio I/O.\n",
        "# It is mainly used by the dataloader and the training loggers.\n",
        "ap = AudioProcessor.init_from_config(config)\n",
        "\n",
        "# INITIALIZE THE TOKENIZER\n",
        "# Tokenizer is used to convert text to sequences of token IDs.\n",
        "# If characters are not defined in the config, default characters are passed to the config\n",
        "tokenizer, config = TTSTokenizer.init_from_config(config)\n",
        "\n",
        "# LOAD DATA SAMPLES\n",
        "# Each sample is a list of ```[text, audio_file_path, speaker_name]```\n",
        "# You can define your custom sample loader returning the list of samples.\n",
        "# Or define your custom formatter and pass it to the `load_tts_samples`.\n",
        "# Check `TTS.tts.datasets.load_tts_samples` for more details.\n",
        "train_samples, eval_samples = load_tts_samples(\n",
        "    dataset_config,\n",
        "    eval_split=True,\n",
        "    eval_split_max_size=config.eval_split_max_size,\n",
        "    eval_split_size=config.eval_split_size,\n",
        ")\n",
        "\n",
        "# INITIALIZE THE MODEL\n",
        "# The model takes the config, the audio processor and the tokenizer as input.\n",
        "# Config defines the details of the model like the number of layers, the size of the embedding, etc.\n",
        "# A speaker manager is used by multi-speaker models; none is passed here.\n",
        "model = Tacotron2(config, ap, tokenizer)\n",
        "\n",
        "# INITIALIZE THE TRAINER\n",
        "# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,\n",
        "# distributed training, etc.\n",
        "trainer = Trainer(\n",
        "    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples\n",
        ")\n",
        "\n",
        "# AND... 3,2,1... 🚀\n",
        "trainer.fit()"
      ],
      "execution_count": null,
      "outputs": []
    }
  ]
}
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "TTS_example.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	},
	"accelerator": "GPU"
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/erogol/97516ad65b44dbddb8cd694953187c5b/tts_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "cjD0xW0cEMVT"
	},
	"source": [
	"# Hands-on example for 🐸 [Coqui TTS](https://github.com/coqui-ai/TTS)\n",
	"\n",
	"This notebook trains a Tacotron2 model with Dynamic Convolution Attention (DCA) on the LJSpeech dataset."
	]
	},
	{
	"cell_type": "markdown",
	"source": [
	"## Download LJSpeech"
	],
	"metadata": {
	"id": "QPA2gbqRi9Wx"
	}
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "XGiNTMShZYvj"
	},
	"source": [
	"# download the LJSpeech dataset; -nc skips the download when the archive is already present\n",
	"!wget -nc http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
	"# decompress\n",
	"!tar -xjf LJSpeech-1.1.tar.bz2"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "__k0BrbfLQ-F"
	},
	"source": [
	"# create train-val splits\n",
	"!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv\n",
	"!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv\n",
	"!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"## Setup environment"
	],
	"metadata": {
	"id": "ocmh66BqjLCF"
	}
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "pyJwcU9pDUE-"
	},
	"source": [
	"# %pip installs into the environment of the running kernel\n",
	"%pip install TTS"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "zV-vHTWyirQv"
	},
	"source": [
	"# install the espeak-ng backend, needed to train on phonemes instead of raw characters\n",
	"# (-y answers the confirmation prompt, which cannot be answered interactively from a notebook)\n",
	"!sudo apt-get install -y espeak-ng"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "markdown",
	"source": [
	"## Train Tacotron DCA"
	],
	"metadata": {
	"id": "2Af-yiyFjU-f"
	}
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "y7_Xao7uNOvX"
	},
	"source": [
	"import os\n",
	"\n",
	"from trainer import Trainer, TrainerArgs\n",
	"\n",
	"from TTS.config.shared_configs import BaseAudioConfig\n",
	"from TTS.tts.configs.shared_configs import BaseDatasetConfig\n",
	"from TTS.tts.configs.tacotron2_config import Tacotron2Config\n",
	"from TTS.tts.datasets import load_tts_samples\n",
	"from TTS.tts.models.tacotron2 import Tacotron2\n",
	"from TTS.tts.utils.text.tokenizer import TTSTokenizer\n",
	"from TTS.utils.audio import AudioProcessor\n",
	"\n",
	"# Directory used for the training output and the phoneme cache.\n",
	"output_path = \"./\"\n",
	"\n",
	"# init configs\n",
	"# Use the train/val metadata files written by the split cell (`metadata_train.csv` / `metadata_val.csv`).\n",
	"# The archive was extracted into the working directory, so the dataset path is built relative to it.\n",
	"dataset_config = BaseDatasetConfig(\n",
	"    name=\"ljspeech\",\n",
	"    meta_file_train=\"metadata_train.csv\",\n",
	"    meta_file_val=\"metadata_val.csv\",\n",
	"    path=os.path.join(output_path, \"LJSpeech-1.1\"),\n",
	")\n",
	"\n",
	"audio_config = BaseAudioConfig(\n",
	"    sample_rate=22050,\n",
	"    do_trim_silence=True,\n",
	"    trim_db=60.0,\n",
	"    signal_norm=False,\n",
	"    mel_fmin=0.0,\n",
	"    mel_fmax=8000,\n",
	"    spec_gain=1.0,\n",
	"    log_func=\"np.log\",\n",
	"    ref_level_db=20,\n",
	"    preemphasis=0.0,\n",
	")\n",
	"\n",
	"config = Tacotron2Config(  # This is the config that is saved for the future use\n",
	"    audio=audio_config,\n",
	"    batch_size=64,\n",
	"    eval_batch_size=16,\n",
	"    num_loader_workers=4,\n",
	"    num_eval_loader_workers=4,\n",
	"    run_eval=True,\n",
	"    test_delay_epochs=-1,\n",
	"    ga_alpha=0.0,\n",
	"    decoder_loss_alpha=0.25,\n",
	"    postnet_loss_alpha=0.25,\n",
	"    postnet_diff_spec_alpha=0,\n",
	"    decoder_diff_spec_alpha=0,\n",
	"    decoder_ssim_alpha=0,\n",
	"    postnet_ssim_alpha=0,\n",
	"    r=2,\n",
	"    attention_type=\"dynamic_convolution\",\n",
	"    double_decoder_consistency=False,\n",
	"    epochs=1000,\n",
	"    text_cleaner=\"phoneme_cleaners\",\n",
	"    use_phonemes=True,\n",
	"    phoneme_language=\"en-us\",\n",
	"    phoneme_cache_path=os.path.join(output_path, \"phoneme_cache\"),\n",
	"    print_step=25,\n",
	"    print_eval=True,\n",
	"    mixed_precision=False,\n",
	"    output_path=output_path,\n",
	"    datasets=[dataset_config],\n",
	")\n",
	"\n",
	"# INITIALIZE THE AUDIO PROCESSOR\n",
	"# Audio processor is used for feature extraction and audio I/O.\n",
	"# It is mainly used by the dataloader and the training loggers.\n",
	"ap = AudioProcessor.init_from_config(config)\n",
	"\n",
	"# INITIALIZE THE TOKENIZER\n",
	"# Tokenizer is used to convert text to sequences of token IDs.\n",
	"# If characters are not defined in the config, default characters are passed to the config\n",
	"tokenizer, config = TTSTokenizer.init_from_config(config)\n",
	"\n",
	"# LOAD DATA SAMPLES\n",
	"# Each sample is a list of ```[text, audio_file_path, speaker_name]```\n",
	"# You can define your custom sample loader returning the list of samples.\n",
	"# Or define your custom formatter and pass it to the `load_tts_samples`.\n",
	"# Check `TTS.tts.datasets.load_tts_samples` for more details.\n",
	"train_samples, eval_samples = load_tts_samples(\n",
	"    dataset_config,\n",
	"    eval_split=True,\n",
	"    eval_split_max_size=config.eval_split_max_size,\n",
	"    eval_split_size=config.eval_split_size,\n",
	")\n",
	"\n",
	"# INITIALIZE THE MODEL\n",
	"# The model takes the config, the audio processor and the tokenizer as input.\n",
	"# Config defines the details of the model like the number of layers, the size of the embedding, etc.\n",
	"# A speaker manager is used by multi-speaker models; none is passed here.\n",
	"model = Tacotron2(config, ap, tokenizer)\n",
	"\n",
	"# INITIALIZE THE TRAINER\n",
	"# Trainer provides a generic API to train all the 🐸TTS models with all its perks like mixed-precision training,\n",
	"# distributed training, etc.\n",
	"trainer = Trainer(\n",
	"    TrainerArgs(), config, output_path, model=model, train_samples=train_samples, eval_samples=eval_samples\n",
	")\n",
	"\n",
	"# AND... 3,2,1... 🚀\n",
	"trainer.fit()"
	],
	"execution_count": null,
	"outputs": []
	}
	]
	}