sgallese/How to make a Rita Lexicon in Spanish from Aspell

## combinefiles.pl
#!/usr/bin/perl

#################################################################
####    Author: Sebastian Gallese
####
####    SCRIPT: Combine Aspell dictionary, CMU Pronunciation and
####            Penn Parts of Speech into a file readable by Rita
####    EMAIL:  SebastianGallese AT GMAIL DOT COM
##################################################################

use utf8;

# Combine three line-parallel text files into one Rita lexicon on STDOUT.
#
# Arguments (all required):
#   1. word list (one word per line)
#   2. CMU pronunciations (line i belongs to word i)
#   3. Penn part-of-speech tags (line i belongs to word i)
#
# Each output line has the form:  word:<TAB>pronunciation<TAB>| tags <NEWLINE>
my ($dictpath, $cmupath, $pennpath) = @ARGV;

if (grep { !defined $_ || $_ eq "" } $dictpath, $cmupath, $pennpath) {
    # Usage goes to STDERR: STDOUT is normally redirected into the lexicon
    # file, and the help text must not end up inside it.
    print STDERR "Please enter in a path to your Spanish Aspell dictionary txt\n";
    print STDERR "and a path to your CMU Pronunciation txt\n";
    print STDERR "and a path to your Penn Parts of Speech txt\n";
    print STDERR "Usage: perl combinefiles.pl path/to/aspelldictionary.txt path/to/cmupronunciation.txt path/to/pennpos.txt\n";
    print STDERR "E.g.: perl combinefiles.pl ~/aspellutf8.txt ~/aspellutf8cmu.txt ~/aspellutf8pos.txt\n";
    exit 1;
}
else {
    # Read a whole file and return its lines without trailing newlines.
    # Dies with the system error instead of silently producing no data.
    my $read_lines = sub {
        my ($path) = @_;
        open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";
        my @lines = <$fh>;
        close($fh);
        chomp @lines;
        return @lines;
    };

    my @dict = $read_lines->($dictpath);
    my @cmu  = $read_lines->($cmupath);
    my @penn = $read_lines->($pennpath);

    # The files are joined purely by line number, so a length mismatch means
    # at least some entries are paired with the wrong data.
    if (@cmu != @dict || @penn != @dict) {
        warn "Warning: line counts differ (dictionary: " . scalar(@dict)
            . ", pronunciation: " . scalar(@cmu)
            . ", parts of speech: " . scalar(@penn) . ")\n";
    }

    for my $i (0 .. $#dict) {
        # A missing line in a shorter file becomes an empty field.
        my $cmu  = defined $cmu[$i]  ? $cmu[$i]  : "";
        my $penn = defined $penn[$i] ? $penn[$i] : "";

        print $dict[$i] . ":\t" . $cmu . "\t| " . $penn . " \n";
    }
}

## How to make a Rita Lexicon in Spanish from Aspell
# How to make a Rita Lexicon in Spanish from Aspell
# by Sebastian Gallese
# gmail: sebastiangallese

# I assume you know a good bit about Rita and Java and the command line
# It might be helpful to know some Perl (I didn't know any before this)
# If you modify this file, you might be able to execute this as a
# bash script instead of copy-pasting the commands line by line.
# I sure as hell wouldn't.

# Prefix:
# In order for this to be incorporated into Rita officially,
# we must change or enhance a number of things:
# 1. See if aspell is the best source dictionary
# - why not use the RAE? (it's the OED of Spanish)
# - decide if we should use expanded forms (we currently expand all forms of a word)
# 2. Use a non-statistical POS tagger
# - OpenNLP Spanish POS tagger can fail miserably with rare words and different conjugations
# - in the SpanPosTagger.java source, you'll see problems getting more than one tag per word
# 3. Tweak the Cast3lB Corpus POS to Penn Treebank POS Conversion
# - if we stick to OpenNLP, need a Spanish linguist to verify SpanPosTagger.java settings
# 4. Use a non-automated Spanish IPA transcriptor
# - is there a database full of IPA transcription for Spanish words?
# 5. Tweak the automated IPA transcription
# - if we stick to spanishaspelltorita.pl, email Xavier and ask him about his algorithm
# - modify the algorithm's weaknesses with the help of a Spanish linguist or textbook
# 6. Tweak the Spanish IPA to English IPA Conversion
# - get Spanish linguist to verify spanishaspelltorita.pl conversions as most appropriate
# 7. Tweak the English IPA to CMU Pronunciation Conversion
# - get English linguist to verify spanishaspelltorita.pl conversions as most appropriate


# 1. Install aspell

# Prerequisite: MacPorts must already be installed, because the `port`
# command used below comes from it.
# https://trac.macports.org/wiki/InstallingMacPorts
# Install aspell itself, then its Spanish dictionary (aspell-dict-es).
sudo port install aspell
sudo port install aspell-dict-es

# References
# http://docs.moodle.org/en/Configuring_aspell_on_Mac_OS_X

# 2. Make word list

# go to your home directory
cd

# dump the Spanish master dictionary and expand every affix-compressed entry
# into its full word forms
aspell -l es dump master | aspell -l es expand > aspell.txt

# convert the dictionary to UTF-8, put every word on its own line,
# and sort it (case-insensitively, dropping duplicates)
iconv -f ISO8859-1 -t UTF-8 aspell.txt | tr ' ' '\n' | sort -uf > aspellutf8.txt

# aspellutf8.txt is now in your home directory (because of the `cd` above),
# so no move is needed; `mv aspellutf8.txt ~/` would only fail because the
# source and the destination are the same file.

# References
# http://www.pocketmagic.net/?p=782
# http://fileformat.wordpress.com/2007/11/25/how-to-make-a-word-list-from-an-aspell-dictionary/

# 3. Extract POS from dictionary file

# Download the parts-of-speech generator Tagger.jar
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/posTagger
# Move Tagger.jar to your home directory

# Download the Spanish POS model SpanishPOS.bin.gz
# DON'T UNZIP THIS FILE!
# http://opennlp.sourceforge.net/models/spanish/postag/
# Move SpanishPOS.bin.gz to your home directory

# go to your home directory
cd

# run the jar file on your aspell dictionary and output the POS
# (use `>` rather than `>>`: combinefiles.pl pairs the files line by line, so
# appending to the output of an earlier run would shift every entry)
java -jar Tagger.jar SpanishPOS.bin.gz < aspellutf8.txt > aspellutf8pos.txt

# References
# Source code and comments on the Tagger.jar file
# You will also find related links
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/posTagger

# 4. Extract CMU Pronunciation from dictionary file

# Download the Spanish CMU pronunciation converter spanishaspelltorita.pl
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary
# Move spanishaspelltorita.pl to your home directory

# go to your home directory
cd

# run the perl file on your aspell dictionary and output the CMU pronunciation
# (use `>` rather than `>>`: combinefiles.pl pairs the files line by line, so
# appending to the output of an earlier run would shift every entry)
perl spanishaspelltorita.pl aspellutf8.txt > aspellutf8cmu.txt

# References
# Source code and comments on the spanishaspelltorita.pl file
# You will also find related links
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary

# 5. Combine spanish dictionary, cmu pronunciation, and pos tags

# Download the combiner combinefiles.pl
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary
# Move combinefiles.pl to your home directory

# go to your home directory
cd

# run the perl file to combine all the previous files you've made
# (use `>` rather than `>>` so that a re-run replaces the lexicon instead of
# appending a second copy of every entry)
perl combinefiles.pl aspellutf8.txt aspellutf8cmu.txt aspellutf8pos.txt > aspellutf8combined.txt

# References
# Source code and comments on the combinefiles.pl file
# You will also find related links
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary

## spanishaspelltorita.pl
#!/usr/bin/perl

#################################################################
####    Author: Sebastian Gallese
####
####    SCRIPT: Convert a Spanish Aspell dictionary list to CMU Pronunciation
####    EMAIL:  SebastianGallese AT GMAIL DOT COM
##################################################################

#####################################################################
####	All of the subroutines below the Xavier López Morrás header
####	were made by Xavier with revisions by Sebastian Gallese
#####################################################################

use utf8;
# binmode STDOUT, "utf8";

# Used to run program from command line
# Usage: perl spanishaspelltorita.pl path/to/aspelldictionary.txt
# E.g.: perl spanishaspelltorita.pl aspelldict.txt


# Main program: read a word list (one UTF-8 word per line) and print, for each
# word, its CMU-style pronunciation: phonemes joined by "-" inside a syllable,
# syllables separated by a single space.
my $dictpath = $ARGV[0];

if (!defined $dictpath || $dictpath eq "") {
    # Usage goes to STDERR: STDOUT is normally redirected into the
    # pronunciation file, and the help text must not end up inside it.
    print STDERR "Please enter in a path to your Spanish Aspell dictionary txt!\n";
    print STDERR "Usage: perl spanishaspelltorita.pl path/to/aspelldictionary.txt\n";
}
else {

    open(my $dict_fh, '<', $dictpath) or die "Cannot open '$dictpath': $!\n";

    while (my $currentword = <$dict_fh>) {
        chomp $currentword;

        # The file is read as raw bytes; turn them into characters so that
        # accented letters are single characters for substr() and the regexes.
        utf8::decode($currentword);

        # Spelling -> Spanish phonetic transcription -> list of syllables.
        my $transcribed = transcribe($currentword);
        my @syllables   = proc_sil($transcribed);

        # Each syllable: Spanish IPA -> English IPA -> CMU phonemes.
        # NOTE: syllables the syllabifier could not resolve come back as "?";
        # convertcmupronounce() drops that character, leaving an empty syllable.
        my @cmu_syllables =
            map { convertcmupronounce(convertenglishipa($_)) } @syllables;

        # join() puts exactly one space between syllables. Comparing each
        # syllable with the last one to decide on the separator glued
        # repeated syllables together (e.g. "mama").
        print join(" ", @cmu_syllables), "\n";
    }

    close($dict_fh);
}


sub convertenglishipa {

# Convert one syllable from the Spanish phonetic notation produced by
# transcribe()/proc_sil() to the closest English IPA symbols.
#
# Parameter: the syllable as a character string.
# Returns:   the converted string.
#
# Resources used as guides:
# http://en.wikipedia.org/wiki/IPA_chart_for_Spanish
# http://en.wikipedia.org/wiki/IPA_for_English
# http://www.aucel.com/pln/ (see Símbolos no AFI)

    my $spanishipa = shift;

    # Replace symbols that don't exist in English IPA
    $spanishipa =~ s/ɾ/r/g;
    $spanishipa =~ s/β/b/g;
    $spanishipa =~ s/ɣ/g/g;
    $spanishipa =~ s/y/j/g;
    $spanishipa =~ s/ʝ/j/g;
    $spanishipa =~ s/ʎ/j/g;
    $spanishipa =~ s/M/n/g;     # M and N are the transcriber's non-IPA
    $spanishipa =~ s/N/n/g;     # nasal allophone markers
    $spanishipa =~ s/ñ/ŋ/g;
    $spanishipa =~ s/ɲ/ŋ/g;

    # Change vowel sequences according to English pronunciation.
    # Order matters: the two-letter rules must run before the single-vowel
    # rules below, which would otherwise consume their first letter.
    $spanishipa =~ s/ai/aɪ/g;
    $spanishipa =~ s/au/aʊ/g;
    $spanishipa =~ s/eu/eʊ/g;
    $spanishipa =~ s/ja/jɒ/g;
    $spanishipa =~ s/jo/oʊ/g;
    $spanishipa =~ s/oi/ɔɪ/g;
    $spanishipa =~ s/ou/oʊ/g;
    $spanishipa =~ s/wa/wɒ/g;
    $spanishipa =~ s/wo/wəʊ/g;
    $spanishipa =~ s/a/ɑ/g;
    # Skip an "o" that is already the first half of an "oʊ" created above;
    # expanding it again would yield "oʊʊ".
    $spanishipa =~ s/o(?!ʊ)/oʊ/g;
    $spanishipa =~ s/e/ɛ/g;

    return $spanishipa;

}

sub convertcmupronounce {

# Convert one syllable from English IPA to CMU (ARPAbet) phonemes.
#
# Parameter: the syllable as a character string; a stress mark (') is removed.
# Returns:   the phonemes joined by "-", e.g. "K-OW".
#
# Resources used as guides:
# http://www.speech.cs.cmu.edu/cgi-bin/cmudict
# http://en.wikipedia.org/wiki/Arpabet

    my $englishipa = shift;

    $englishipa =~ s/'//g;

    # Ordered rule table. Multi-symbol sequences come first so that a
    # diphthong or affricate is not broken up by the rule for one of its
    # parts (e.g. "oʊ" must become OW, not OW + UH). Replacements contain
    # only upper-case ASCII letters and "-", which no pattern matches, so an
    # earlier rule never feeds a later one.
    my @rules = (
        # three symbols
        [ 'aʊr', 'AW-R-' ],
        # two symbols: diphthongs
        [ 'eɪ', 'EY-' ],
        [ 'aɪ', 'AY-' ],
        [ 'oʊ', 'OW-' ],
        [ 'aʊ', 'AW-' ],
        [ 'ɔɪ', 'OY-' ],
        # two symbols: vowel + r
        [ 'ɛr', 'EH-R-' ],
        [ 'ʊr', 'UH-R-' ],
        [ 'ɔr', 'AO-R-' ],
        [ 'ɑr', 'AA-R-' ],
        [ 'ɪr', 'IH-R-' ],
        # two symbols: affricates
        [ 't∫', 'CH-' ],
        [ 'dʒ', 'JH-' ],
        # single vowels
        [ 'ɔ', 'AO-' ],
        [ 'ɒ', 'AO-' ],
        [ 'ɑ', 'AA-' ],
        [ 'i', 'IY-' ],
        [ 'u', 'UW-' ],
        [ 'ɛ', 'EH-' ],
        [ 'ɪ', 'IH-' ],
        [ 'ʊ', 'UH-' ],
        [ 'ʌ', 'AH-' ],
        [ 'ə', 'AH-' ],
        [ 'æ', 'AE-' ],
        [ 'o', 'OW-' ],
        [ 'ɝ', 'ER-' ],
        [ 'ɚ', 'ER-' ],
        # single consonants
        [ 'p', 'P-' ],
        [ 'b', 'B-' ],
        [ 't', 'T-' ],
        [ 'd', 'D-' ],
        [ 'k', 'K-' ],
        [ 'g', 'G-' ],
        [ 'f', 'F-' ],
        [ 'v', 'V-' ],
        [ 'θ', 'TH-' ],
        [ 'ð', 'DH-' ],
        [ 's', 'S-' ],
        [ 'z', 'Z-' ],
        [ '∫', 'SH-' ],
        [ 'ʒ', 'ZH-' ],
        [ 'h', 'HH-' ],
        [ 'm', 'M-' ],
        [ 'n', 'N-' ],
        [ 'ŋ', 'NG-' ],
        [ 'l', 'L-' ],
        [ 'r', 'R-' ],
        [ 'j', 'Y-' ],
        [ 'w', 'W-' ],
        [ 'x', 'HH-' ],
    );

    for my $rule (@rules) {
        my ($ipa, $arpabet) = @{$rule};
        $englishipa =~ s/\Q$ipa\E/$arpabet/g;
    }

    # Remove the final character, normally the "-" left by the last rule.
    # NOTE: this also removes a trailing symbol that no rule matched, such as
    # the "?" placeholder for an unresolved syllable.
    chop($englishipa);

    return $englishipa;

}

#################################################################
####    AUTOR: Xavier López Morrás
####
####    SCRIPT: Transcriptor fonético automático del español
####    EMAIL:  lopezx@gmail.com
##################################################################


#####################################################################
####    Puedes hacer uso libre del script y código a nivel personal.
####    Para otras finalidades consultar al autor.
#####################################################################

## INPUT: escritura ordinaria. OUTPUT: transcripcion fonetica.


sub caracteres {

    # Escape the accented letters and punctuation marks listed below as
    # "%XX" codes; every other character is passed through unchanged.
    # transcribe() reverses these codes.
    #
    # Parameter: the text to escape.
    # Returns:   the escaped copy.
    my ($texto) = @_;

    my %codigo_de = (
        'ñ' => '%F1',
        'á' => '%E1',
        'é' => '%E9',
        'í' => '%ED',
        'ó' => '%F3',
        'ú' => '%FA',
        ',' => '%2C',
        '!' => '%21',
        '¿' => '%BF',
        '?' => '%3F',
        'ü' => '%FC',
        'Ü' => '%DC',
        'Í' => '%CD',
        'Ú' => '%DA',
        'Á' => '%C1',
        'É' => '%C9',
        'Ó' => '%D3',
        ':' => '%3A',
        '"' => '%22',
    );

    # No code contains a character that is itself escaped, so one pass over
    # the text gives the same result as replacing each character in turn.
    my $patron = join '|', map { quotemeta } sort keys %codigo_de;
    $texto =~ s/($patron)/$codigo_de{$1}/g;

    return $texto;
}

sub transcribe {

    # Automatic phonetic transcription of Spanish spelling.
    #
    # Parameter: a word or phrase in ordinary spelling, as a character
    #            string. "%XX" codes as produced by caracteres() are accepted.
    # Returns:   the transcription as one string. Stressed (accented) vowels
    #            are upper-case A E I O U; several sounds are written as HTML
    #            entities (&beta; &gamma; &theta; &lambda; &#331; &#345;
    #            t&int;), which proc_sil() converts afterwards.
    #
    # The word is scanned left to right. Working state:
    #   $rasgo  label of the previous sound ("vocal" for a vowel, otherwise
    #           mostly the consonant letter)
    #   $voc    the previous vowel, or 0 when the previous sound was no vowel
    #   $esp    1 after a space has been seen, until a later letter resets it
    #   $esps   1 when the next character is a space

    my $oracion = shift;

    # Undo the %XX escapes.
    $oracion =~ s/%F1/ñ/g;
    $oracion =~ s/%E1/á/g;
    $oracion =~ s/%E9/é/g;
    $oracion =~ s/%ED/í/g;
    $oracion =~ s/%F3/ó/g;
    $oracion =~ s/%FA/ú/g;
    $oracion =~ s/%3A/|/g;
    $oracion =~ s/%22//g;
    $oracion =~ s/%FC/w/g;
    $oracion =~ s/%DC/w/g;
    $oracion =~ s/%CD/í/g;
    $oracion =~ s/%DA/ú/g;
    $oracion =~ s/%C1/á/g;
    $oracion =~ s/%C9/é/g;
    $oracion =~ s/%D3/ó/g;
    $oracion =~ s/%2C/,/g;
    $oracion =~ s/%21/,/g;
    $oracion =~ s/%BF/,/g;
    $oracion =~ s/%3F/,/g;
    $oracion =~ tr/+/ /;
    $oracion =~ tr/ABCDEFGHIJKLMNÑOPQ/abcdefghijklmnñopq/;
    $oracion =~ tr/RSTUVWXYZ/rstuvwxyz/;

    # Raw (unescaped) input gets the same normalisation as the %XX forms
    # above: accented capitals are lower-cased and u-diaeresis becomes "w".
    # Without this those characters match no rule below and are dropped.
    $oracion =~ s/Á/á/g;
    $oracion =~ s/É/é/g;
    $oracion =~ s/Í/í/g;
    $oracion =~ s/Ó/ó/g;
    $oracion =~ s/Ú/ú/g;
    $oracion =~ s/ü/w/g;
    $oracion =~ s/Ü/w/g;

    my $medida = length($oracion);
    my $esp    = 0;
    my $esps;
    my $n      = 0;
    my $voc    = 0;
    my $rasgo  = 0;
    my $transcripcion = "";

    while ($n < $medida) {

        my $c    = substr($oracion, $n, 1);        # current character
        my $vsig = substr($oracion, $n + 1, 1);    # next character
        # Previous character. At $n == 0 the offset is -1, so this is the
        # LAST character of the string.
        my $vant = substr($oracion, $n - 1, 1);

        if ($vsig eq " ") {
            $esps = 1;
        }
        elsif ($vsig ne " ") {
            $esps = 0;
        }

        my $vsigg  = substr($oracion, $n + 2, 1);  # character after next
        my $impres = "";                            # output for this character

        # Look past a single space when peeking ahead.
        if ($vsigg eq " " || $vsig eq " ") {
            $vsigg = substr($oracion, $n + 3, 1);
        }

        if ($vsig eq " ") {
            $vsig = substr($oracion, $n + 2, 1);
        }

        # "h" is silent (except in "ch"), so look through it.
        if ($vsig eq "h" && $c ne "c") {
            $vsig = $vsigg;
        }

        if ($c eq "." || $c eq "," || $c eq ";") {
            $impres = "";
            $rasgo  = 0;
        }

        if ($c eq " ") {
            $impres = " ";
            $esp    = 1;
        }

        if ($c eq "a") {
            $impres = "a";
            $rasgo  = "vocal";
            $voc    = "a";
            $esp    = 0;
        }

        if ($c eq "á") {
            $impres = "A";
            $rasgo  = "vocal";
            $voc    = "a";
            $esp    = 0;
        }

        if ($c eq "é") {
            $impres = "E";
            $rasgo  = "vocal";
            $voc    = "e";
            $esp    = 0;
        }

        if ($c eq "í") {
            $impres = "I";
            $rasgo  = "vocal";
            $voc    = "ii";
        }

        if ($c eq "ó") {
            $impres = "O";
            $rasgo  = "vocal";
            $voc    = "o";
            $esp    = 0;
        }

        if ($c eq "ú") {
            $impres = "U";
            $rasgo  = "vocal";
            $voc    = "uu";
        }

        # remaining letters
        if ($c eq "b") {
            if ($rasgo eq "vocal" || $rasgo eq "l" || $rasgo eq "r") {
                if ($vsig =~ /[aeiouáéíóúrl]/) {
                    $impres = "&beta;";
                }
                else {
                    $impres = "b";
                }
            }
            else {
                $impres = "b";
            }

            $rasgo = "b";
        }

        if ($c eq "c") {
            if ($vsig eq "h") {
                $impres = "t&int;";
                $n++;               # consume the "h" as well
                $rasgo = "tS";
            }
            elsif ($vsig eq "e" || $vsig eq "i" || $vsig eq "í") {
                $impres = "&theta;";
                $rasgo  = "Z";
            }
            else {
                $impres = "k";
                $rasgo  = "k";
            }
        }

        if ($c eq "d") {
            if ($rasgo eq "vocal" || $rasgo eq "r") {
                if ($vsig =~ /[aeiouáéíóúrl]/) {
                    $impres = "ð";
                }
                else {
                    $impres = "d";
                }
            }
            else {
                $impres = "d";
            }
            $rasgo = "d";
        }

        if ($c eq "e") {
            $impres = "e";
            $rasgo  = "vocal";
            $voc    = "e";
            $esp    = 0;
        }

        if ($c eq "f") {
            $impres = "f";
            $rasgo  = "f";
        }

        if ($c eq "g") {
            if ($vsig eq "a" || $vsig eq "w") {
                if ($rasgo eq "vocal" || $rasgo eq "s" || $rasgo eq "r" || $rasgo eq "l") {
                    $impres = "&gamma;";
                }
                else {
                    $impres = "g";
                }
            }
            elsif ($vsig eq "e") {
                $impres = "x";
            }
            elsif ($vsig eq "i") {
                $impres = "x";
            }
            elsif ($vsig eq "o") {
                if ($rasgo eq "vocal" || $rasgo eq "s" || $rasgo eq "r" || $rasgo eq "l") {
                    $impres = "&gamma;";
                }
                else {
                    $impres = "g";
                }
            }
            elsif ($vsig eq "u" || $vsig eq "ú") {
                if ($vsigg eq "e" || $vsigg eq "i") {
                    $n++;           # "gue"/"gui": the "u" is silent, skip it
                    if ($rasgo eq "vocal" || $rasgo eq "s" || $rasgo eq "r" || $rasgo eq "l") {
                        $impres = "&gamma;";
                    }
                    else {
                        $impres = "g";
                    }
                }
                else {
                    if ($rasgo eq "vocal" || $rasgo eq "s" || $rasgo eq "r" || $rasgo eq "l") {
                        $impres = "&gamma;";
                    }
                    else {
                        $impres = "g";
                    }
                }
            }
            elsif ($vsig eq "r" || $vsig eq "l") {
                # The original also had no-op comparisons here
                # (`$rasgo eq "G";` / `$rasgo eq "g";`); $rasgo is set to "g"
                # below in every case, so they are dropped.
                if ($rasgo eq "vocal") {
                    $impres = "&gamma;";
                }
                else {
                    $impres = "g";
                }
            }
            else {
                $impres = "g";
            }
            $rasgo = "g";
        }

        if ($c eq "h") {
            # silent: nothing is output
        }

        if ($c eq "i") {
            if (($rasgo eq "vocal") || ($vsig =~ /[aeiouáéíó]/)) {
                unless ($vant =~ / /) {
                    $impres = "j";
                    $rasgo  = "vocal";
                    $voc    = "ï";
                }
            }
            else {
                $impres = "i";
                $rasgo  = "vocal";
                $voc    = "i";
            }
            if ($vant =~ / /) {
                $impres = "i";
                $esp    = 0;
            }
        }

        if ($c eq "j") {
            $impres = "x";
            $rasgo  = "x";
        }

        if ($c eq "k") {
            $impres = "k";
            $rasgo  = "k";
        }

        if ($c eq "l") {
            if ($vsig eq "l" && $vsigg ne "l" && $esps ne 1) {
                # "ll" inside a word
                $impres = "&lambda;";
                $n++;
            }
            elsif ($vsig eq "l" && $vsigg ne "l" && $esps eq 1) {
                # "l" at the end of one word, "l" at the start of the next
                $impres = "l l";
                $n      = $n + 2;
                $esps   = 0;
            }
            elsif ($vsig eq "l" && $vsigg eq "l" && $esps eq 1) {
                $impres = "&lambda; &lambda;";
                $n      = $n + 3;
                $esps   = 0;
            }
            else {
                $impres = "l";
                $rasgo  = "l";
                $esp    = 0;
            }
            $rasgo = "l";
        }

        if ($c eq "m") {
            if ($vsig eq "f") {
                $impres = "M";
            }
            else {
                $impres = "m";
            }
            $rasgo = "m";
        }

        if ($c eq "n") {
            if ($vsig eq "t" || $vsig eq "d" || $vsig eq "z") {
                $impres = "N";
            }
            elsif (($vsig eq "c" || $vsig eq "q") && ($vsigg eq "a" || $vsigg eq "o" || $vsigg eq "u")) {
                $impres = "&#331;";
            }
            elsif ($vsig eq "b" || $vsig eq "v" || $vsig eq "p" || $vsig eq "m") {
                $impres = "m";
            }
            elsif ($vsig eq "g" || $vsig eq "j") {
                $impres = "&#331;";
            }
            elsif ($vsig eq "f") {
                $impres = "M";
            }
            elsif (($vsig eq "c") && ($vsigg eq "e" || $vsigg eq "i")) {
                $impres = "N";
            }
            elsif ((($vsig eq "y") && ($vsigg =~ /a|e|i|o|u/)) || ($vsig eq "l" && $vsigg eq "l")) {
                $impres = "ñ";
            }
            else {
                $impres = "n";
            }
            $rasgo = "n";
        }

        if ($c eq "ñ") {
            $impres = "ñ";
            $rasgo  = "ñ";
        }

        if ($c eq "o") {
            $impres = "o";
            $rasgo  = "vocal";
            $voc    = "o";
            $esp    = 0;
        }

        if ($c eq "p") {
            $impres = "p";
            $rasgo  = "p";
        }

        if ($c eq "q") {
            $impres = "k";
            $n++;                   # skip the character after "q"
            $rasgo = "q";
        }

        if ($c eq "r") {
            if ($rasgo eq "t" || $rasgo eq "d" || $rasgo eq "p" || $rasgo eq "b" || $rasgo eq "k" || $rasgo eq "g" || $rasgo eq "f") {
                $impres = "r";
                $rasgo  = "r";
            }
            elsif ($vsig eq "r") {
                # first "r" of "rr": output nothing, the second one decides
                $rasgo = "r";
            }
            elsif ($vsig ne "r" && $rasgo eq "r" && $esp ne 1) {
                $impres = "&#345;";
                $rasgo  = "R";
            }
            elsif ($rasgo eq "vocal" && $vsig ne "r" && $esp ne 1) {
                $impres = "r";
                $rasgo  = "r";
            }
            elsif ($rasgo ne "vocal" && $esp eq 0) {
                $impres = "&#345;";
                $rasgo  = "r";
            }
            elsif ($esp eq 1 && $rasgo ne "R") {
                $impres = "&#345;";
                $rasgo  = "R";
            }
            elsif ($esp eq 1 && $rasgo eq "R") {
                $impres = "r";
                $rasgo  = "R";
                $esp    = 0;
            }
            else {
                $impres = "*";
            }
        }

        if ($c eq "s") {
            if ($vsig eq "b" || $vsig eq "v" || $vsig eq "d" || ($vsig eq "g" && ($vsigg ne "e" && $vsigg ne "i")) || $vsig eq "l" || $vsig eq "m" || $vsig eq "n") {
                $impres = "z";
                $rasgo  = "vocal";
            }
            else {
                $impres = "s";
                $rasgo  = "s";
            }
        }

        if ($c eq "t") {
            $impres = "t";
            $rasgo  = "t";
        }

        if ($c eq "u") {
            if ($rasgo eq "vocal" && $voc ne "ï") {
                $impres = "w";
                $rasgo  = "vocal";
                $voc    = "w";
            }
            elsif ($vsig =~ /[aeouáéó]/) {
                $impres = "w";
            }
            else {
                $impres = "u";
                $rasgo  = "vocal";
                $voc    = "u";
            }
            $esp = 0;
        }

        if ($c eq "v") {
            if ($rasgo eq "vocal" || $rasgo eq "l" || $rasgo eq "r") {
                if ($vsig =~ /[aeiouáéíóúrl]/) {
                    $impres = "&beta;";
                    $rasgo  = "B";
                }
                else {
                    $impres = "b";
                }
            }
            else {
                $impres = "b";
            }

            $rasgo = "b";
        }

        if ($c eq "w") {
            $impres = "w";
            $rasgo  = "w";
        }

        if ($c eq "x") {
            $impres = "ks";
            $rasgo  = "x";
        }

        if ($c eq "y") {
            if ($vsig =~ /[aeiouáéíóú]/ && $esps eq 0) {
                $impres = "y";
                $rasgo  = "vocal";
                $voc    = "ï";
            }
            elsif ($rasgo eq "vocal" || $esps eq 1) {
                if ($rasgo eq "vocal") {
                    $impres = "i";    ## approximant
                    $rasgo  = "vocal";
                    $voc    = "ï";
                }
                else {
                    $impres = "i";
                    $rasgo  = "vocal";
                    $voc    = "i";
                }
            }
            else {
                $impres = "y";
            }

            $esps = 0;
        }

        if ($c eq "z") {
            $impres = "&theta;";
            $rasgo  = "Z";
        }

        if ($rasgo ne "vocal") {
            $voc = 0;
        }
        $n++;
        $transcripcion = $transcripcion . $impres;
    }

    return $transcripcion;
}


sub proc_sil {

    # Split a phonetic transcription into syllables and mark the stress.
    #
    # Parameter: the transcription produced by transcribe(), with no syllable
    #            boundaries; words are separated by single spaces.
    # Returns:   a flat list of syllables for all words, in order. The
    #            stressed syllable of a word is prefixed with "'". A stretch
    #            that matches no syllable pattern is returned as "?".
    #
    # Each word is scanned from its LAST character backwards ($n2 is a
    # negative offset). All patterns are tested against the letters read at
    # the start of the step; more than one pattern may match, in which case
    # the last match decides the syllable and every match moves $n2.

    my $transcripcion = shift;
    my @phrase_sil    = ();

    # Replace multi-character symbols by single placeholder letters so that
    # every sound is exactly one character.
    $transcripcion =~ s/&gamma;/G/g;
    $transcripcion =~ s/&theta;/Z/g;
    $transcripcion =~ s/&beta;/B/g;
    $transcripcion =~ s/&#345;/R/g;
    $transcripcion =~ s/t&int;/X/g;
    $transcripcion =~ s/ð/D/g;
    $transcripcion =~ s/&lambda;/L/g;
    $transcripcion =~ s/&#331;/ç/g;
    $transcripcion =~ s/^ +//g;

    # Character classes shared by the patterns below.
    my $consonante = qr/[bBcdDfgGhklLmnNñprRstvxXyzZ]/;
    my $vocal      = qr/[aeiouAEIOU]/;        # any vowel, stressed or not
    my $vocal_min  = qr/[aeiou]/;             # unstressed vowel only
    my $semivocal  = qr/[jw]/;                # glide
    my $coda       = qr/[nszrNpkmMlLZçdDgb]/; # syllable-final consonant
    my $coda_dip   = qr/[cksznNplrçm]/;       # coda after a rising diphthong
    my $oclusiva   = qr/[tkpgGdDfbB]/;        # first member of an onset cluster
    my $liquida    = qr/[rl]/;                # second member of an onset cluster

    # one array element per word
    my @palabras = split(/ /, $transcripcion);

    ##########################################################################
    ## processing of one word
    ##########################################################################

    for my $palabra (@palabras) {
        my @word              = ();
        my $tonica_encontrada = 0;
        my $n2                = -1;

        ######################################################################
        ##    processing of one syllable
        ######################################################################
        # substr() yields undef once $n2 points before the start of the
        # word, which ends the loop.
        while (substr($palabra, $n2, 1) =~ /./) {
            my $letra0 = substr($palabra, $n2, 1);
            my $letra1 = substr($palabra, $n2 - 1, 1);
            my $letra2 = substr($palabra, $n2 - 2, 1);
            my $letra3 = substr($palabra, $n2 - 3, 1);
            my $letra4 = substr($palabra, $n2 - 4, 1);
            my $silaba = "8xz";     # sentinel: no pattern has matched yet

            # case CV
            if ($letra0 =~ $vocal) {
                if ($letra1 =~ $consonante) {
                    unless (($letra2 =~ $oclusiva) && ($letra1 =~ $liquida)) {
                        $silaba = $letra1 . $letra0;
                        $n2--;
                    }
                }
            }

            # case CVC
            if ($letra0 =~ $coda) {
                if ($letra1 =~ $vocal) {
                    if ($letra2 =~ $consonante) {
                        unless (($letra3 =~ $oclusiva) && ($letra2 =~ $liquida)) {
                            $silaba = "$letra2" . "$letra1" . "$letra0";
                            $n2 = $n2 - 2;
                        }
                    }
                }
            }

            # case V
            if ($letra0 =~ $vocal) {
                unless (($letra1 =~ $consonante) || ($letra1 =~ $semivocal)) {
                    $silaba = "$letra0";
                }
            }

            # case VC
            if (($letra0 =~ $coda) && ($letra1 =~ /[aeiouAEIOUwj]/)) {
                unless ($letra2 =~ $consonante || ($letra2 =~ $semivocal)) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }

            # case VCC
            if (($letra0 =~ /[s]/) && ($letra1 =~ /[kn]/) && ($letra2 =~ $vocal)) {
                $silaba = $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 2;
            }

            # case CCV
            if (($letra0 =~ $vocal) && ($letra1 =~ $liquida) && ($letra2 =~ $oclusiva)) {
                $silaba = $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 2;
            }

            # case CCVC
            if (($letra0 =~ $coda) && ($letra1 =~ $vocal) && ($letra2 =~ $liquida) && ($letra3 =~ $oclusiva)) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 3;
            }

            # case CVCC
            if (($letra0 =~ /[s]/) && ($letra1 =~ /[b]/) && ($letra2 =~ $vocal) && ($letra3 =~ /[s]/)) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 3;
            }

            # diphthongs
            # cases CVv and Vv (vowel followed by a glide)
            if (($letra0 =~ $semivocal) && ($letra1 =~ $vocal_min)) {
                if ($letra2 =~ $consonante) {
                    $silaba = $letra2 . $letra1 . $letra0;
                    $n2 = $n2 - 2;
                }
                else {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }

            # case CvV (glide followed by a vowel)
            if (($letra0 =~ $vocal) && ($letra1 =~ $semivocal)) {
                if ($letra2 =~ $consonante) {
                    # NOTE(review): $encontrada is not assigned anywhere in
                    # this file, so this guard always passes here; confirm it
                    # is not set by other code before removing it.
                    unless ($encontrada == 1) {
                        $silaba = $letra2 . $letra1 . $letra0;
                        $n2 = $n2 - 2;
                    }
                }
                elsif (($letra0 =~ $vocal_min) && ($letra1 =~ $semivocal)) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }

            # case CvVC
            if (($letra0 =~ $coda_dip) && ($letra1 =~ $vocal) && ($letra2 =~ /[wj]/)
                && ($letra3 =~ $consonante)) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 3;
            }

            # case CCvV
            if (($letra0 =~ $vocal_min) && ($letra1 =~ $semivocal) && ($letra2 =~ $liquida) && ($letra3 =~ $oclusiva)) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 3;
            }

            # case CCVv
            if (($letra0 =~ $semivocal) && ($letra1 =~ $vocal_min) && ($letra2 =~ $liquida) && ($letra3 =~ $oclusiva)) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 3;
            }

            # case CCvVC
            if (($letra0 =~ $coda_dip) && ($letra1 =~ $vocal_min) && ($letra2 =~ $semivocal) && ($letra3 =~ $liquida)
                && ($letra4 =~ $oclusiva)) {
                $silaba = $letra4 . $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 4;
            }

            ########################## end of diphthongs #####################

            # case CCVCC
            if (($letra0 =~ /s/) && ($letra1 =~ /[n]/) && ($letra2 =~ $vocal) && ($letra3 =~ /[r]/)
                && ($letra4 =~ $oclusiva)) {
                $silaba = $letra4 . $letra3 . $letra2 . $letra1 . $letra0;
                $n2 = $n2 - 4;
            }

            # other cases: a glide not preceded by a vowel
            if ($letra0 =~ $semivocal) {
                unless ($letra1 =~ /[aeioubBcdDfgGhklLmnNñprRstvxXyzZ]/) {
                    $silaba = $letra0;
                }
                if ($letra1 =~ $consonante) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }

            # Nothing matched: mark the stretch as unresolved. This is a
            # string comparison (eq); the original used numeric ==, which
            # only worked because "8xz" evaluates to 8 as a number.
            if ($silaba eq "8xz") {
                $silaba = "?";
            }

            # The word is scanned backwards, so prepend.
            unshift(@word, $silaba);

            $n2--;
        }

        ######################################################################
        ###################### end of syllable processing ####################

        #### mark the stress on syllables containing a written accent
        for my $silaba_marcada (@word) {
            if ($silaba_marcada =~ /[AEIOU]/) {
                $silaba_marcada    = "'" . $silaba_marcada;
                $tonica_encontrada = 1;
            }
        }

        ####### default stress when no accent is written #######

        unless ($tonica_encontrada == 1) {
            if (@word > 1) {                        # not a monosyllable
                if ($word[-1] =~ /[rlZdD]$/) {      # last syllable ends in one of these consonants
                    $word[-1] = "'" . "$word[-1]";
                }
                else {
                    $word[-2] = "'" . "$word[-2]";
                }
            }

            if ((@word == 1) && ($word[0] =~ /ir|ba|Ba/)) {
                $word[0] = "'" . "$word[0]";
            }
            elsif ((@word == 1) && ($word[0] =~ /[^']..+/)) {   # monosyllable with 3 or more letters
                unless ($word[0] =~ /^lo[sz]$|^la[sz]$/) {      # except 'los' and 'las'
                    $word[0] = "'" . "$word[0]";
                }
            }
        }

        # append this word's syllables to the phrase
        push(@phrase_sil, @word);
    }

    ############ convert the placeholder letters back to phonetic symbols #####
    for my $silaba_final (@phrase_sil) {
        $silaba_final =~ s/G/ɣ/g;
        $silaba_final =~ s/Z/θ/g;
        $silaba_final =~ s/B/β/g;
        $silaba_final =~ s/R/ɾ/g;
        $silaba_final =~ s/X/t∫/g;
        $silaba_final =~ s/D/ð/g;
        # "ç" stands for &#331; (the velar nasal). The original replaced it
        # with itself, so the placeholder leaked into the output.
        $silaba_final =~ s/ç/ŋ/g;
        $silaba_final =~ tr/AEIOU/aeiou/;
        $silaba_final =~ s/L/ʎ/g;
    }

    ##########################################################################
    ######################## end of word processing ##########################
    return @phrase_sil;
}
############################### fin separador de silabas ##################################################


1;
	#!/usr/bin/perl

	#################################################################
	#### Author: Sebastian Gallese
	####
	#### SCRIPT: Combine Aspell dictionary, CMU Pronunciation and
	#### Penn Parts of Speech into a file readable by Rita
	#### EMAIL: SebastianGallese AT GMAIL DOT COM
	##################################################################

	use utf8;

	# Combine three line-parallel text files into one Rita lexicon on STDOUT.
	#
	# Arguments (all required):
	#   1. word list (one word per line)
	#   2. CMU pronunciations (line i belongs to word i)
	#   3. Penn part-of-speech tags (line i belongs to word i)
	#
	# Each output line has the form:  word:<TAB>pronunciation<TAB>| tags <NEWLINE>
	my ($dictpath, $cmupath, $pennpath) = @ARGV;

	if (grep { !defined $_ || $_ eq "" } $dictpath, $cmupath, $pennpath) {
	    # Usage goes to STDERR: STDOUT is normally redirected into the lexicon
	    # file, and the help text must not end up inside it.
	    print STDERR "Please enter in a path to your Spanish Aspell dictionary txt\n";
	    print STDERR "and a path to your CMU Pronunciation txt\n";
	    print STDERR "and a path to your Penn Parts of Speech txt\n";
	    print STDERR "Usage: perl combinefiles.pl path/to/aspelldictionary.txt path/to/cmupronunciation.txt path/to/pennpos.txt\n";
	    print STDERR "E.g.: perl combinefiles.pl ~/aspellutf8.txt ~/aspellutf8cmu.txt ~/aspellutf8pos.txt\n";
	    exit 1;
	}
	else {
	    # Read a whole file and return its lines without trailing newlines.
	    # Dies with the system error instead of silently producing no data.
	    my $read_lines = sub {
	        my ($path) = @_;
	        open(my $fh, '<', $path) or die "Cannot open '$path': $!\n";
	        my @lines = <$fh>;
	        close($fh);
	        chomp @lines;
	        return @lines;
	    };

	    my @dict = $read_lines->($dictpath);
	    my @cmu  = $read_lines->($cmupath);
	    my @penn = $read_lines->($pennpath);

	    # The files are joined purely by line number, so a length mismatch
	    # means at least some entries are paired with the wrong data.
	    if (@cmu != @dict || @penn != @dict) {
	        warn "Warning: line counts differ (dictionary: " . scalar(@dict)
	            . ", pronunciation: " . scalar(@cmu)
	            . ", parts of speech: " . scalar(@penn) . ")\n";
	    }

	    for my $i (0 .. $#dict) {
	        # A missing line in a shorter file becomes an empty field.
	        my $cmu  = defined $cmu[$i]  ? $cmu[$i]  : "";
	        my $penn = defined $penn[$i] ? $penn[$i] : "";

	        # Plain "|" separator: the escaped form emitted a literal backslash.
	        print $dict[$i] . ":\t" . $cmu . "\t| " . $penn . " \n";
	    }
	}
	# How to make a Rita Lexicon in Spanish from Aspell
	# by Sebastian Gallese
	# gmail: sebastiangallese

	# I assume you know a good bit about Rita and Java and the command line
	# It might be helpful to know some Perl (I didn't know any before this)
	# If you modify this file, you might be able to execute this as a
	# bash script instead of copy-pasting the commands line by line.
	# I sure as hell wouldn't.

	# Prefix:
	# In order for this to be incorporated into Rita officially,
	# we must change or enhance a number of things:
	# 1. See if aspell is the best source dictionary
	# - why not use the RAE? (it's the OED of Spanish)
	# - decide if we should use expanded forms (we currently expand all forms of a word)
	# 2. Use a non-statistical POS tagger
	# - OpenNLP Spanish POS tagger can fail miserably with rare words and different conjugations
	# - in the SpanPosTagger.java source, you'll see problems getting more than one tag per word
	# 3. Tweak the Cast3lB Corpus POS to Penn Treebank POS Conversion
	# - if we stick to OpenNLP, need a Spanish linguist to verify SpanPosTagger.java settings
	# 4. Use a non-automated Spanish IPA transcriptor
	# - is there a database full of IPA transcription for Spanish words?
	# 5. Tweak the automated IPA transcription
	# - if we stick to spanishaspelltorita.pl, email Xavier and ask him about his algorithm
	# - modify the algorithm's weaknesses with the help of a Spanish linguist or textbook
	# 6. Tweak the Spanish IPA to English IPA Conversion
	# - get Spanish linguist to verify spanishaspelltorita.pl conversions as most appropriate
	# 7. Tweak the English IPA to CMU Pronunciation Conversion
	# - get English linguist to verify spanishaspelltorita.pl conversions as most appropriate


	# 1. Install aspell

	# Prerequisite: MacPorts must already be installed, because the `port`
	# command used below comes from it.
	# https://trac.macports.org/wiki/InstallingMacPorts
	# Install aspell itself, then its Spanish dictionary (aspell-dict-es).
	sudo port install aspell
	sudo port install aspell-dict-es

	# References
	# http://docs.moodle.org/en/Configuring_aspell_on_Mac_OS_X

	# 2. Make word list

	# go to your home directory
	cd

	# dump the Spanish master dictionary and expand every affix-compressed entry
	# into its full word forms
	aspell -l es dump master | aspell -l es expand > aspell.txt

	# convert the dictionary to UTF-8, put every word on its own line,
	# and sort it (case-insensitively, dropping duplicates)
	iconv -f ISO8859-1 -t UTF-8 aspell.txt | tr ' ' '\n' | sort -uf > aspellutf8.txt

	# aspellutf8.txt is now in your home directory (because of the `cd` above),
	# so no move is needed; `mv aspellutf8.txt ~/` would only fail because the
	# source and the destination are the same file.

	# References
	# http://www.pocketmagic.net/?p=782
	# http://fileformat.wordpress.com/2007/11/25/how-to-make-a-word-list-from-an-aspell-dictionary/

	# 3. Extract POS from dictionary file

	# Download the parts-of-speech generator Tagger.jar
	# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/posTagger
	# Move Tagger.jar to your home directory

	# Download the Spanish POS model SpanishPOS.bin.gz
	# DON'T UNZIP THIS FILE!
	# http://opennlp.sourceforge.net/models/spanish/postag/
	# Move SpanishPOS.bin.gz to your home directory

	# go to your home directory
	cd

	# run the jar file on your aspell dictionary and output the POS
	# (use `>` rather than `>>`: combinefiles.pl pairs the files line by line, so
	# appending to the output of an earlier run would shift every entry)
	java -jar Tagger.jar SpanishPOS.bin.gz < aspellutf8.txt > aspellutf8pos.txt

	# References
	# Source code and comments on the Tagger.jar file
	# You will also find related links
	# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/posTagger

	# 4. Extract CMU Pronunciation from dictionary file

	# Download the Spanish CMU pronunciation converter spanishaspelltorita.pl
	# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary
	# Move spanishaspelltorita.pl to your home directory

	# go to your home directory
	cd

	# run the perl file on your aspell dictionary and output the CMU pronunciation
	# (use `>` rather than `>>`: combinefiles.pl pairs the files line by line, so
	# appending to the output of an earlier run would shift every entry)
	perl spanishaspelltorita.pl aspellutf8.txt > aspellutf8cmu.txt

	# References
	# Source code and comments on the spanishaspelltorita.pl file
	# You will also find related links
	# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary

	# 5. Combine spanish dictionary, cmu pronunciation, and pos tags

	# Download the combiner combinefiles.pl
	# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary
	# Move combinefiles.pl to your home directory

	# go to your home directory
	cd

	# run the perl file to combine all the previous files you've made
	# (use `>` rather than `>>` so that a re-run replaces the lexicon instead of
	# appending a second copy of every entry)
	perl combinefiles.pl aspellutf8.txt aspellutf8cmu.txt aspellutf8pos.txt > aspellutf8combined.txt

	# References
	# Source code and comments on the combinefiles.pl file
	# You will also find related links
	# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary