Skip to content

Instantly share code, notes, and snippets.

@erogol
Created December 19, 2018 11:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save erogol/8f39174c3f0475221c8978aeb10d4fdc to your computer and use it in GitHub Desktop.
Save erogol/8f39174c3f0475221c8978aeb10d4fdc to your computer and use it in GitHub Desktop.
TTS_example.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "TTS_example.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/erogol/8f39174c3f0475221c8978aeb10d4fdc/tts_example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"metadata": {
"id": "6uMCom74Ft81",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# mount Google Drive at /content/drive (Colab prompts for authorization)\n",
"from google.colab import drive\n",
"drive.mount('/content/drive')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "9wqjz3lIGXZd",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# clone the TTS repository into the current working directory\n",
"!git clone https://github.com/mozilla/TTS"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "TzjnO4pjGePs",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# download the LJSpeech-1.1 dataset archive\n",
"!wget http://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2\n",
"# decompress the archive; this creates the LJSpeech-1.1/ folder\n",
"!tar -xvjf LJSpeech-1.1.tar.bz2"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "__k0BrbfLQ-F",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# create train-val splits from a shuffled copy of the metadata.\n",
"# LJSpeech-1.1 has 13,100 entries: the first 12,000 shuffled rows are used for\n",
"# training and the last 1,100 for validation, so the two splits do not overlap.\n",
"!shuf LJSpeech-1.1/metadata.csv > LJSpeech-1.1/metadata_shuf.csv\n",
"!head -n 12000 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_train.csv\n",
"!tail -n 1100 LJSpeech-1.1/metadata_shuf.csv > LJSpeech-1.1/metadata_val.csv"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "G1OnsNyJJtem",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# install TTS requirements\n",
"# use the %cd magic: `!cd` only changes directory inside a throwaway subshell,\n",
"# while %cd changes the notebook's working directory for this and later cells\n",
"%cd TTS\n",
"!pip install -r requirements.txt"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "J1XOWu_oKfdv",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# check the default TTS config.json. It is necessary for all your training \n",
"# settings\n",
"!cat config.json"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "y7_Xao7uNOvX",
"colab_type": "code",
"outputId": "6400488b-c1f4-45d1-dc40-a9a72543b1f8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 36
}
},
"cell_type": "code",
"source": [
"%%writefile config.json\n",
"// set data fields for LJSpeech\n",
"\n",
"{\n",
" \"model_name\": \"TTS-master\",\n",
" \"model_description\": \"Higher dropout rate for stopnet and disabled custom initialization, pull current mel prediction to stopnet.\",\n",
"\n",
" \"audio\":{\n",
" \"audio_processor\": \"audio\", // to use dictate different audio processors, if available.\n",
" // Audio processing parameters\n",
" \"num_mels\": 80, // size of the mel spec frame. \n",
" \"num_freq\": 1025, // number of stft frequency levels. Size of the linear spectrogram frame.\n",
" \"sample_rate\": 22050, // wav sample-rate. If different than the original data, it is resampled.\n",
" \"frame_length_ms\": 50, // stft window length in ms.\n",
" \"frame_shift_ms\": 12.5, // stft window hop-length in ms.\n",
" \"preemphasis\": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.\n",
" \"min_level_db\": -100, // normalization range\n",
" \"ref_level_db\": 20, // reference level db, theoretically 20db is the sound of air.\n",
" \"power\": 1.5, // value to sharpen wav signals after GL algorithm.\n",
" \"griffin_lim_iters\": 60,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.\n",
" // Normalization parameters\n",
" \"signal_norm\": true, // normalize the spec values in range [0, 1]\n",
" \"symmetric_norm\": false, // move normalization to range [-1, 1]\n",
" \"max_norm\": 1, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]\n",
" \"clip_norm\": true, // clip normalized values into the range.\n",
" \"mel_fmin\": null, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!\n",
" \"mel_fmax\": null, // maximum freq level for mel-spec. Tune for dataset!!\n",
" \"do_trim_silence\": true // enable trimming of silence from audio as you load it.\n",
" },\n",
"\n",
" \"embedding_size\": 256, \n",
" \"text_cleaner\": \"english_cleaners\",\n",
" \"epochs\": 1000,\n",
" \n",
" \"lr\": 0.0001,\n",
" \"lr_decay\": false,\n",
" \"warmup_steps\": 4000,\n",
"\n",
" \"batch_size\": 32,\n",
" \"eval_batch_size\":32,\n",
" \"r\": 5,\n",
" \"wd\": 0.000001,\n",
" \"checkpoint\": true,\n",
" \"save_step\": 5000,\n",
" \"print_step\": 10,\n",
"\n",
" \"run_eval\": true,\n",
" \"data_path\": \"../../Data/LJSpeech-1.1/\", // can be overwritten by the --data_path command-line argument\n",
" \"meta_file_train\": \"metadata_train.csv\", // metafile for training dataloader\n",
" \"meta_file_val\": \"metadata_val.csv\", // metafile for validation dataloader\n",
" \"data_loader\": \"TTSDataset\", // dataloader, [\"TTSDataset\", \"TTSDatasetCached\", \"TTSDatasetMemory\"]\n",
" \"dataset\": \"ljspeech\", // one of TTS.dataset.preprocessors, only valid if data_loader == \"TTSDataset\", rest uses \"tts_cache\" by default.\n",
" \"min_seq_len\": 0,\n",
" \"output_path\": \"../keep/\",\n",
" \"num_loader_workers\": 2,\n",
" \"num_val_loader_workers\": 2\n",
"}"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"Overwriting config.json\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "1Z7PR2pBLHxq",
"colab_type": "code",
"outputId": "e285d179-469c-445a-948f-dc706e51e1f4",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1123
}
},
"cell_type": "code",
"source": [
"# start training; console output is also saved to training.log\n",
"!python train.py --config_path config.json --data_path ../LJSpeech-1.1/ | tee training.log"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
" > Using CUDA: True\n",
" > Number of GPUs: 1\n",
" > Git Hash: 20ee7c0\n",
" > Experiment folder: /content/TTS/../keep/December-19-2018_11+32AM-TTS-master-20ee7c0\n",
" > Setting up Audio Processor...\n",
" | > fft size: 2048, hop length: 275, win length: 1102\n",
" | > Audio Processor attributes.\n",
" | > bits:None\n",
" | > sample_rate:22050\n",
" | > num_mels:80\n",
" | > min_level_db:-100\n",
" | > frame_shift_ms:12.5\n",
" | > frame_length_ms:50\n",
" | > ref_level_db:20\n",
" | > num_freq:1025\n",
" | > power:1.5\n",
" | > preemphasis:0.97\n",
" | > griffin_lim_iters:60\n",
" | > signal_norm:True\n",
" | > symmetric_norm:False\n",
" | > mel_fmin:0\n",
" | > mel_fmax:None\n",
" | > max_norm:1.0\n",
" | > clip_norm:True\n",
" | > do_trim_silence:True\n",
" | > n_fft:2048\n",
" | > hop_length:275\n",
" | > win_length:1102\n",
" | > Number of characters : 149\n",
" | > Num output units : 1025\n",
"\n",
" > Starting a new training\n",
" | > Model has 7083650 parameters\n",
" > Reading LJSpeech from - ../LJSpeech-1.1/\n",
" | > Number of instances : 12000\n",
" | > Max length sequence 187\n",
" | > Min length sequence 5\n",
" | > Avg length sequence 98.28775\n",
" | > 0 instances are ignored by min_seq_len (0)\n",
" | > Batch group shuffling is active.\n",
" | > Epoch 0/1000\n",
" | > Step:9/375 GlobalStep:10 TotalLoss:0.33402 LinearLoss:0.14762 MelLoss:0.18641 StopLoss:0.73977 GradNorm:0.04653 GradNormST:1.01440 AvgTextLen:36.2 AvgSpecLen:172.2 StepTime:2.21 LR:0.000100\n",
" | > Step:19/375 GlobalStep:20 TotalLoss:0.29765 LinearLoss:0.14310 MelLoss:0.15455 StopLoss:0.66492 GradNorm:0.06941 GradNormST:0.48356 AvgTextLen:44.8 AvgSpecLen:222.0 StepTime:2.41 LR:0.000100\n",
" | > Step:29/375 GlobalStep:30 TotalLoss:0.28474 LinearLoss:0.14459 MelLoss:0.14015 StopLoss:0.72310 GradNorm:0.06081 GradNormST:0.92077 AvgTextLen:48.8 AvgSpecLen:246.3 StepTime:3.07 LR:0.000100\n",
" | > Step:39/375 GlobalStep:40 TotalLoss:0.27690 LinearLoss:0.14139 MelLoss:0.13551 StopLoss:0.73683 GradNorm:0.05845 GradNormST:2.58913 AvgTextLen:51.6 AvgSpecLen:254.2 StepTime:3.22 LR:0.000100\n",
" | > Step:49/375 GlobalStep:50 TotalLoss:0.26876 LinearLoss:0.13891 MelLoss:0.12986 StopLoss:0.72490 GradNorm:0.03796 GradNormST:3.11640 AvgTextLen:60.6 AvgSpecLen:313.0 StepTime:3.56 LR:0.000100\n",
" | > Step:59/375 GlobalStep:60 TotalLoss:0.25832 LinearLoss:0.13163 MelLoss:0.12669 StopLoss:0.74020 GradNorm:0.05046 GradNormST:1.47773 AvgTextLen:65.6 AvgSpecLen:326.4 StepTime:3.71 LR:0.000100\n",
" | > Step:69/375 GlobalStep:70 TotalLoss:0.25436 LinearLoss:0.12792 MelLoss:0.12644 StopLoss:0.67741 GradNorm:0.06263 GradNormST:2.02652 AvgTextLen:66.6 AvgSpecLen:321.7 StepTime:3.73 LR:0.000100\n",
" | > Step:79/375 GlobalStep:80 TotalLoss:0.25033 LinearLoss:0.12247 MelLoss:0.12785 StopLoss:0.77042 GradNorm:0.05292 GradNormST:0.97249 AvgTextLen:72.3 AvgSpecLen:363.9 StepTime:4.67 LR:0.000100\n",
" | > Step:89/375 GlobalStep:90 TotalLoss:0.24469 LinearLoss:0.11755 MelLoss:0.12714 StopLoss:0.67230 GradNorm:0.08123 GradNormST:0.90303 AvgTextLen:79.5 AvgSpecLen:389.7 StepTime:4.24 LR:0.000100\n",
" | > Step:99/375 GlobalStep:100 TotalLoss:0.24387 LinearLoss:0.11588 MelLoss:0.12799 StopLoss:0.56327 GradNorm:0.05469 GradNormST:2.77339 AvgTextLen:78.1 AvgSpecLen:393.7 StepTime:3.59 LR:0.000100\n",
" | > Step:109/375 GlobalStep:110 TotalLoss:0.23841 LinearLoss:0.11188 MelLoss:0.12652 StopLoss:0.62725 GradNorm:0.11853 GradNormST:1.29791 AvgTextLen:81.9 AvgSpecLen:400.1 StepTime:4.18 LR:0.000100\n",
" | > Step:119/375 GlobalStep:120 TotalLoss:0.23700 LinearLoss:0.10967 MelLoss:0.12733 StopLoss:0.60695 GradNorm:0.02969 GradNormST:1.44895 AvgTextLen:82.7 AvgSpecLen:423.1 StepTime:4.22 LR:0.000100\n",
" | > Step:129/375 GlobalStep:130 TotalLoss:0.23633 LinearLoss:0.10984 MelLoss:0.12650 StopLoss:0.73151 GradNorm:0.10447 GradNormST:1.45647 AvgTextLen:91.1 AvgSpecLen:462.0 StepTime:5.03 LR:0.000100\n",
" | > Step:139/375 GlobalStep:140 TotalLoss:0.23326 LinearLoss:0.10804 MelLoss:0.12522 StopLoss:0.56982 GradNorm:0.04163 GradNormST:1.32453 AvgTextLen:95.1 AvgSpecLen:470.1 StepTime:4.71 LR:0.000100\n",
" | > Step:149/375 GlobalStep:150 TotalLoss:0.23332 LinearLoss:0.10898 MelLoss:0.12434 StopLoss:0.64671 GradNorm:0.05904 GradNormST:0.94457 AvgTextLen:93.3 AvgSpecLen:470.4 StepTime:4.87 LR:0.000100\n",
" | > Step:159/375 GlobalStep:160 TotalLoss:0.23035 LinearLoss:0.10733 MelLoss:0.12302 StopLoss:0.70120 GradNorm:0.03810 GradNormST:1.59297 AvgTextLen:97.2 AvgSpecLen:487.0 StepTime:5.40 LR:0.000100\n"
],
"name": "stdout"
}
]
}
]
}
@vitaly-zdanevich
Copy link

First command is invalid.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment