samehkamaleldin/build-model.sh

## build-model.sh
#!/bin/sh
# ------------------------------------------------------------------------------
# FILE        | build-model.sh
# AUTHOR      | sameh kamal
# DESCRIPTION | build a new concatenative model
# USAGE       | build-model.sh <db_name> <raw_data_dir>
# ------------------------------------------------------------------------------
# Creates $HOME/tts-workspace/<db_name> with text/wav/lab/language sub
# directories, copies the raw data files into it, builds the unique words
# file, replaces the SSIL/SIL labels with "_" and runs the phonemizer jar
# on the unique words file.

# Print a message on stderr.
#   $@ - message text
err() {
    printf '%s\n' "$*" >&2
}

# Delete the first line of a file when that line is empty or one character
# long (the blank entry and the ',' entry that sort to the top of the list).
#   $1 - file to edit in place
# Returns non-zero when the file cannot be read or edited.
drop_short_head() {
    first_line=$(head -n 1 "$1") || return 1
    if [ "${#first_line}" -le 1 ]; then
        sed -i '1d' "$1" || return 1
    fi
}

# Build the database directory for one model.
#   $1 - database name
#   $2 - raw data dir
# Returns 2 on bad usage, 1 when a preparation step fails, otherwise the exit
# status of the phonemizer run.
main() {
    if [ "$#" -lt 2 ]; then
        err "usage: build-model.sh <db_name> <raw_data_dir>"
        return 2
    fi

    db_name=$1                      # database name
    dt_dir=$2                       # raw data dir
    # $HOME is used instead of "~": a tilde inside quotes is never expanded
    ws_dir="$HOME/tts-workspace"    # workspace dir
    db_dir="$ws_dir/$db_name"       # database dir
    aux_dir="$ws_dir/aux"
    # shellcheck disable=SC2034 -- not referenced below; kept from the original
    phonemizer_continuous_dir="$HOME/univox/phonemizer-continuous"
    # shellcheck disable=SC2034 -- not referenced below; kept from the original
    marytts_base_dir="$HOME/git/marytts"

    # create the database directories (mkdir -p also creates the workspace dir)
    if ! mkdir -p "$db_dir/text" "$db_dir/wav" "$db_dir/lab" "$db_dir/language"; then
        err "> database dir doesn't exist. "
        return 1
    fi

    # copy raw data files to the database directory: text and wav files are
    # taken from two directory levels below the raw data dir, lab files from
    # one level below (/bin/sh has no recursive "**", it matches like "*")
    cp -- "$dt_dir"/*/*/*.txt "$db_dir/text" || return 1
    cp -- "$dt_dir"/*/*/*.wav "$db_dir/wav" || return 1
    cp -- "$dt_dir"/*/*.lab "$db_dir/lab" || return 1

    # -------------------------------------------------------------------
    # create unique words file
    # -------------------------------------------------------------------
    cd "$db_dir/text" || return 1
    # put all sentences in one file in language directory
    # (awk prints an empty line before the first line of every input file)
    awk 'FNR==1{print ""}1' ./*.txt > ../language/text_ar.txt || return 1
    cd ../language || return 1
    # one word per line, sorted, duplicates removed; the tr operands are quoted
    # so the shell cannot glob-expand them
    tr -s '[:space:]' '\n' < text_ar.txt | sort | uniq > unique_words.txt || return 1
    # two passes: the blank entry sorts first, a ',' entry may follow it
    drop_short_head unique_words.txt || return 1
    drop_short_head unique_words.txt || return 1

    # -------------------------------------------------------------------
    # replace SSIL and SIL with _ in labs files
    # -------------------------------------------------------------------
    cd ../lab || return 1
    # SSIL must be handled before SIL, otherwise "SSIL" would become "S_"
    sed -i -e 's/SSIL/_/g' -e 's/SIL/_/g' -- ./* || return 1

    # -------------------------------------------------------------------
    # run phonemizer on unique words
    # -------------------------------------------------------------------
    cd ../language || return 1
    java -jar "$aux_dir/phonemize_cont.jar" ./unique_words.txt ./ar_phon_dict.ph
}

main "$@"

## ilive-tts.md

      
    Raw
  

              ilive-tts.md
            
          
    iLive TTS model generation

This is a brief explanation of how to build a new model for iLive TTS.
input

The input of the model generation process is a set of directories containing the files required for model generation. These directories are the following:

text  dir contains a set of diacritized Arabic text sentences, each in a separate file.
wav    dir contains a set of wav files, each representing the Arabic pronunciation of the corresponding sentence in the text dir.
lab  dir contains the Arabic pronunciation time lapse of each speech segment for each corresponding text and wav file.
language dir is created by us and contains the pre-processing outputs.
phonemizer dir contains the set of pronunciation rules for the phonemes of the Arabic language.

Pre-Processing

the input acquired from linguistics team needs some pre-processing to generate intermediate files needed while generating the model.

gather all unique words from all sentences into one file and put it in the language folder.

# enter text directory
cd text
# put all sentences in one file in language directory
# (awk prints an empty line before the first line of every input file)
awk 'FNR==1{print ""}1' *.txt > ../language/text_ar.txt
# enter language directory
cd ../language
# get unique words from all sentences and put them in one file:
# one word per line, sorted, duplicates removed. The tr operands are quoted
# so the shell cannot glob-expand [:space:] against a file in this directory.
tr -s '[:space:]' '\n' < text_ar.txt | sort | uniq > unique_words.txt
# then manually remove the space ` ` and `,` entries if they exist in the unique words file

in lab directory, in all files, replace all SIL and SSIL with _.

# enter lab directory
cd lab
# in every file: first turn each SSIL into _, then each remaining SIL into _
sed -i -e 's/SSIL/_/g' -e 's/SIL/_/g' -- *

in project phonemizer-continuous resources [src/main/resources/com/univox], replace [allophones.ar_SA.xml, ArabicPhonemesMap, ArabicScript] with the files from the phonemizer directory, and do the same in both the jar file and the rar file inside it.
in project marytts: in marytts-runtime/src/main/resources and marytts-runtime/src/main/java/marytts/com/univox, replace [ArabicPhonemesMap, ArabicScript, allophones.ar_SA.xml] with the files from the previous step.
put the unique words file in the base folder of project phonemizer-continuous.
in project phonemizer-continuous: in src/main/java/com/univox/PhonemizerMain.java, make sure that the file name used is the same as your unique words file name in the base folder.

String filename="unique_words.txt";

run project phonemizer-continuous as a java application. [this will take a few minutes]
move the output file of the phonemizer-continuous project, named unique_words.ph, to the language directory and rename it to ar.txt.
in the language folder, in the ar.txt file, replace all __ with ` functional` (yes: with a leading space), then remove all the remaining _ from the file.

# enter language folder
cd language
# replace "__" with " functional" (edits ar.txt in place)
sed -i -- 's/__/ functional/g' ar.txt
# remove remaining '_' in the file; -i is required so that ar.txt itself is
# edited instead of the result only being printed to stdout
sed -i -- 's/_//g' ar.txt

in marytts project, delete target folder, then rebuild marytts

# enter marytts directory
cd marytts
# run the Maven install goal with tests skipped
mvn -Dmaven.test.skip=true install

in language folder run transcription.sh which results from marytts build in the previous step.

# enter language directory
cd language
# launch the transcription tool
# NOTE(review): called without a path, so transcription.sh must be on PATH — confirm
transcription.sh
this will open a GUI tool that will require few steps:-

asks for the allophones.ar_SA.xml file, select it from its location.
then from file menu select open and then select ar.txt file from language directory.
check all words: none should be in red; a red word indicates an error.
click train and predict button.
from file menu, select save. this will result in saving few files in language directory.
close the gui tool.
in marytts project, delete target folder.
in marytts project, in directory marytts-languages/marytts-lang-ar, delete target folder.
in marytts project, in directory marytts-languages/marytts-lang-ar/src/main/resources/marytts/language/ar/lexicon replace the files in it with output files from the transcription.sh tool.
in the directory from the previous step rename allophones.ar_SA.xml to allophones.ar.xml and also remove the _SA from the tag lang inside the file.
in project marytts: in dir marytts-languages/marytts-lang-ar/lib/modules/ar/lexicon, replace the two files [allophones.ar.xml, ar] with the modified allophones.ar.xml file from the previous step and the ar.txt file from the language directory after it has been renamed to just ar.
in project marytts: in dir marytts-languages/marytts-lang-ar, test that everything is okay.

# run the Maven test phase in the current directory
mvn test

in project marytts:dir user-dictionaries, replace userdict-ar.txt with the output of the phonemizer-continuous project which is unique_words.ph after it being renamed to userdict-ar.txt.
edit the file moved to user-dictionaries: remove every __ and replace every _ with `| ` (a pipe followed by a space).

# remove every "__" (edits userdict-ar.txt in place; without -i nothing is saved)
sed -i -- 's/__//g' userdict-ar.txt
# then replace each remaining "_" with "| ", again in place
sed -i -- 's/_/| /g' userdict-ar.txt

in marytts project, delete target folder, then build marytts

# enter marytts directory
cd marytts
# run the Maven install goal with tests skipped
mvn -Dmaven.test.skip=true install

open the marytts project - univox workspace - in eclipse and add build configurations for server and client as follows.
run server: run configuration with ~/git/marytts as mary base
run DatabaseImportMain: run configuration with db directory as database base folder.


## set-env-vals.sh
#!/bin/sh
# ------------------------------------------------------------------------------
# FILE        | set-env-vals.sh
# AUTHOR      | sameh kamal
# DESCRIPTION | set environment variables required for building tts models
# ------------------------------------------------------------------------------
# Assigns the two directory variables and marks both for export.
# When executed as a separate process the exports only affect that process;
# source the file to set them in the current shell.

MARY_BASE_DIR='/home/sameh/work/ist'
PHONEMIZER_SCRIPTS_DIR='/home/sameh/work/ist/phonemizer'
export MARY_BASE_DIR PHONEMIZER_SCRIPTS_DIR
	#!/bin/sh
	# ------------------------------------------------------------------------------
	# FILE        | build-model.sh
	# AUTHOR      | sameh kamal
	# DESCRIPTION | build a new concatenative model
	# USAGE       | build-model.sh <db_name> <raw_data_dir>
	# ------------------------------------------------------------------------------
	# Creates $HOME/tts-workspace/<db_name> with text/wav/lab/language sub
	# directories, copies the raw data files into it, builds the unique words
	# file, replaces the SSIL/SIL labels with "_" and runs the phonemizer jar
	# on the unique words file.

	# Print a message on stderr.
	#   $@ - message text
	err() {
	    printf '%s\n' "$*" >&2
	}

	# Delete the first line of a file when that line is empty or one character
	# long (the blank entry and the ',' entry that sort to the top of the list).
	#   $1 - file to edit in place
	# Returns non-zero when the file cannot be read or edited.
	drop_short_head() {
	    first_line=$(head -n 1 "$1") || return 1
	    if [ "${#first_line}" -le 1 ]; then
	        sed -i '1d' "$1" || return 1
	    fi
	}

	# Build the database directory for one model.
	#   $1 - database name
	#   $2 - raw data dir
	# Returns 2 on bad usage, 1 when a preparation step fails, otherwise the exit
	# status of the phonemizer run.
	main() {
	    if [ "$#" -lt 2 ]; then
	        err "usage: build-model.sh <db_name> <raw_data_dir>"
	        return 2
	    fi

	    db_name=$1                      # database name
	    dt_dir=$2                       # raw data dir
	    # $HOME is used instead of "~": a tilde inside quotes is never expanded
	    ws_dir="$HOME/tts-workspace"    # workspace dir
	    db_dir="$ws_dir/$db_name"       # database dir
	    aux_dir="$ws_dir/aux"
	    # shellcheck disable=SC2034 -- not referenced below; kept from the original
	    phonemizer_continuous_dir="$HOME/univox/phonemizer-continuous"
	    # shellcheck disable=SC2034 -- not referenced below; kept from the original
	    marytts_base_dir="$HOME/git/marytts"

	    # create the database directories (mkdir -p also creates the workspace dir)
	    if ! mkdir -p "$db_dir/text" "$db_dir/wav" "$db_dir/lab" "$db_dir/language"; then
	        err "> database dir doesn't exist. "
	        return 1
	    fi

	    # copy raw data files to the database directory: text and wav files are
	    # taken from two directory levels below the raw data dir, lab files from
	    # one level below (/bin/sh has no recursive "**", it matches like "*")
	    cp -- "$dt_dir"/*/*/*.txt "$db_dir/text" || return 1
	    cp -- "$dt_dir"/*/*/*.wav "$db_dir/wav" || return 1
	    cp -- "$dt_dir"/*/*.lab "$db_dir/lab" || return 1

	    # -------------------------------------------------------------------
	    # create unique words file
	    # -------------------------------------------------------------------
	    cd "$db_dir/text" || return 1
	    # put all sentences in one file in language directory
	    # (awk prints an empty line before the first line of every input file)
	    awk 'FNR==1{print ""}1' ./*.txt > ../language/text_ar.txt || return 1
	    cd ../language || return 1
	    # one word per line, sorted, duplicates removed; the tr operands are quoted
	    # so the shell cannot glob-expand them
	    tr -s '[:space:]' '\n' < text_ar.txt | sort | uniq > unique_words.txt || return 1
	    # two passes: the blank entry sorts first, a ',' entry may follow it
	    drop_short_head unique_words.txt || return 1
	    drop_short_head unique_words.txt || return 1

	    # -------------------------------------------------------------------
	    # replace SSIL and SIL with _ in labs files
	    # -------------------------------------------------------------------
	    cd ../lab || return 1
	    # SSIL must be handled before SIL, otherwise "SSIL" would become "S_"
	    sed -i -e 's/SSIL/_/g' -e 's/SIL/_/g' -- ./* || return 1

	    # -------------------------------------------------------------------
	    # run phonemizer on unique words
	    # -------------------------------------------------------------------
	    cd ../language || return 1
	    java -jar "$aux_dir/phonemize_cont.jar" ./unique_words.txt ./ar_phon_dict.ph
	}

	main "$@"
	#!/bin/sh
	# ------------------------------------------------------------------------------
	# FILE        | set-env-vals.sh
	# AUTHOR      | sameh kamal
	# DESCRIPTION | set environment variables required for building tts models
	# ------------------------------------------------------------------------------
	# Assigns the two directory variables and marks both for export.
	# When executed as a separate process the exports only affect that process;
	# source the file to set them in the current shell.

	MARY_BASE_DIR='/home/sameh/work/ist'
	PHONEMIZER_SCRIPTS_DIR='/home/sameh/work/ist/phonemizer'
	export MARY_BASE_DIR PHONEMIZER_SCRIPTS_DIR