Created
May 10, 2010 04:24
-
-
Save sgallese/395672 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
################################################################# | |
#### Author: Sebastian Gallese | |
#### | |
#### SCRIPT: Combine Aspell dictionary, CMU Pronunciation and | |
#### Penn Parts of Speech into a file readable by Rita | |
#### EMAIL: SebastianGallese AT GMAIL DOT COM | |
################################################################## | |
use strict;
use warnings;
use utf8;

# Merge three line-aligned text files into one lexicon on STDOUT.
#
# Arguments (all required):
#   1. word list (one word per line)
#   2. CMU pronunciations (line N belongs to word N)
#   3. part-of-speech tags (line N belongs to word N)
#
# Each output line has the form:  word:<TAB>pronunciation<TAB>| tags<SPACE>
# The word list drives the loop: one output line is written per word.  If the
# pronunciation or tag file is shorter, the missing field is left empty and a
# warning is printed on STDERR at the end.
my ($dictpath, $cmupath, $pennpath) = @ARGV;

if (grep { !defined $_ || $_ eq "" } ($dictpath, $cmupath, $pennpath)) {
    # Usage goes to STDERR so it can never end up inside a redirected lexicon.
    print STDERR "Please enter in a path to your Spanish Aspell dictionary txt\n";
    print STDERR "and a path to your CMU Pronunciation txt\n";
    print STDERR "and a path to your Penn Parts of Speech txt\n";
    print STDERR "Usage: perl combinefiles.pl path/to/aspelldictionary.txt path/to/cmupronunciation.txt path/to/pennpos.txt\n";
    print STDERR "E.g.: perl combinefiles.pl ~/aspellutf8.txt ~/aspellutf8cmu.txt ~/aspellutf8pos.txt\n";
    exit 1;
}

open(my $dict_fh, '<', $dictpath) or die "Cannot open dictionary file '$dictpath': $!\n";
open(my $cmu_fh,  '<', $cmupath)  or die "Cannot open pronunciation file '$cmupath': $!\n";
open(my $penn_fh, '<', $pennpath) or die "Cannot open part-of-speech file '$pennpath': $!\n";

# The three files are read in lockstep instead of being slurped into memory.
my $incomplete = 0;
while (defined(my $word = <$dict_fh>)) {
    my $cmu  = <$cmu_fh>;
    my $penn = <$penn_fh>;

    if (!defined $cmu || !defined $penn) {
        $incomplete++;
        $cmu  = "" unless defined $cmu;
        $penn = "" unless defined $penn;
    }

    chomp($word, $cmu, $penn);
    print $word . ":\t" . $cmu . "\t| " . $penn . " \n";
}

if ($incomplete) {
    warn "Warning: $incomplete word(s) had no pronunciation and/or part-of-speech line\n";
}
if (!eof($cmu_fh) || !eof($penn_fh)) {
    warn "Warning: the pronunciation and/or part-of-speech file has more lines than the word list\n";
}

close($dict_fh);
close($cmu_fh);
close($penn_fh);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# How to make a Rita Lexicon in Spanish from Aspell | |
# by Sebastian Gallese | |
# gmail: sebastiangallese | |
# I assume you know a good bit about Rita and Java and the command line | |
# It might be helpful to know some Perl (I didn't know any before this) | |
# If you modify this file, you might be able to execute this as a | |
# bash script instead of copy-pasting the commands line by line. | |
# I sure as hell wouldn't. | |
# Prefix: | |
# In order for this to be incorporated into Rita officially, | |
# we must change or enhance a number of things: | |
# 1. See if aspell is the best source dictionary | |
# - why not use the RAE? (it's the OED of Spanish) | |
# - decide if we should use expanded forms (we currently expand all forms of a word) | |
# 2. Use a non-statistical POS tagger | |
# - OpenNLP Spanish POS tagger can fail miserably with rare words and different conjugations | |
# - in the SpanPosTagger.java source, you'll see problems getting more than one tag per word | |
# 3. Tweak the Cast3lB Corpus POS to Penn Treebank POS Conversion | |
# - if we stick to OpenNLP, need a Spanish linguist to verify SpanPosTagger.java settings | |
# 4. Use a non-automated Spanish IPA transcriptor | |
# - is there a database full of IPA transcription for Spanish words? | |
# 5. Tweak the automated IPA transcription | |
# - if we stick to spanishaspelltorita.pl, email Xavier and ask him about his algorithm
# - modify the algorithm's weaknesses with the help of a Spanish linguist or textbook | |
# 6. Tweak the Spanish IPA to English IPA Conversion | |
# - get a Spanish linguist to verify that the spanishaspelltorita.pl conversions are the most appropriate
# 7. Tweak the English IPA to CMU Pronunciation Conversion | |
# - get an English linguist to verify that the spanishaspelltorita.pl conversions are the most appropriate
# 1. Install aspell | |
# Make sure you've installed Macports! | |
# https://trac.macports.org/wiki/InstallingMacPorts | |
# install aspell and the spanish dictionary | |
sudo port install aspell | |
sudo port install aspell-dict-es | |
# References | |
# http://docs.moodle.org/en/Configuring_aspell_on_Mac_OS_X | |
# 2. Make word list | |
# go to your home directory | |
cd | |
# expand an aspell dictionary | |
aspell -l es dump master | aspell -l es expand > aspell.txt | |
# convert the dictionary to UTF-8, put every word on its own line, and sort it (ignoring case, dropping duplicates)
iconv -f ISO8859-1 -t UTF-8 aspell.txt | tr ' ' '\n' | sort -uf > aspellutf8.txt | |
# move the dictionary to your home folder | |
mv aspellutf8.txt ~/ | |
# References | |
# http://www.pocketmagic.net/?p=782 | |
# http://fileformat.wordpress.com/2007/11/25/how-to-make-a-word-list-from-an-aspell-dictionary/ | |
# 3. Extract POS from dictionary file | |
# Download the parts-of-speech generator Tagger.jar | |
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/posTagger | |
# Move Tagger.jar to your home directory | |
# Download the Spanish POS model SpanishPOS.bin.gz | |
# DON'T UNZIP THIS FILE! | |
# http://opennlp.sourceforge.net/models/spanish/postag/ | |
# Move SpanishPOS.bin.gz to your home directory | |
# go to your home directory | |
cd | |
# run the jar file on your aspell dictionary and output the POS | |
java -jar Tagger.jar SpanishPOS.bin.gz < aspellutf8.txt >> aspellutf8pos.txt | |
# References | |
# Source code and comments on the Tagger.jar file | |
# You will also find related links | |
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/posTagger | |
# 4. Extract CMU Pronunciation from dictionary file | |
# Download the Spanish CMU pronunciation converter spanishaspelltorita.pl | |
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary | |
# Move spanishaspelltorita.pl to your home directory | |
# go to your home directory | |
cd | |
# run the perl file on your aspell dictionary and output the CMU pronunciation | |
perl spanishaspelltorita.pl aspellutf8.txt >> aspellutf8cmu.txt | |
# References | |
# Source code and comments on the spanishaspelltorita.pl file | |
# You will also find related links | |
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary | |
# 5. Combine spanish dictionary, cmu pronunciation, and pos tags | |
# Download the combiner combinefiles.pl | |
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary | |
# Move combinefiles.pl to your home directory | |
# go to your home directory | |
cd | |
# run the perl file to combine all the previous files you've made
perl combinefiles.pl aspellutf8.txt aspellutf8cmu.txt aspellutf8pos.txt >> aspellutf8combined.txt | |
# References | |
# Source code and comments on the combinefiles.pl file | |
# You will also find related links | |
# http://code.google.com/p/cognate-translation-tool/source/browse/#svn/trunk/src/ritaspanishdictonary |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
################################################################# | |
#### Author: Sebastian Gallese | |
#### | |
#### SCRIPT: Convert a Spanish Aspell dictionary list to CMU Pronunciation | |
#### EMAIL: SebastianGallese AT GMAIL DOT COM | |
################################################################## | |
##################################################################### | |
#### All of the subroutines below the Xavier López Morrás header | |
#### were made by Xavier with revisions by Sebastian Gallese | |
##################################################################### | |
use utf8;

# Convert a Spanish word list to CMU (Arpabet) pronunciations, one per line.
#
# Usage: perl spanishaspelltorita.pl path/to/aspelldictionary.txt
# E.g.:  perl spanishaspelltorita.pl aspelldict.txt
#
# For every input line the word is transcribed to Spanish IPA (transcribe),
# split into syllables (proc_sil), and each syllable is mapped to English IPA
# (convertenglishipa) and then to CMU phones (convertcmupronounce).  Exactly
# one output line is written per input line, so the result stays line-aligned
# with the word list.  Syllables are separated by a single space and the
# phones inside a syllable by "-".
{
    my $dictpath = $ARGV[0];

    if (!defined $dictpath || $dictpath eq "") {
        # Usage goes to STDERR so it cannot end up in a redirected output file.
        print STDERR "Please enter in a path to your Spanish Aspell dictionary txt!\n";
        print STDERR "Usage: perl spanishaspelltorita.pl path/to/aspelldictionary.txt\n";
        exit 1;
    }

    # Any symbol that survives the conversion (e.g. an unmapped IPA character)
    # is written as UTF-8 rather than as raw Latin-1 bytes.
    binmode(STDOUT, ':encoding(UTF-8)');

    open(my $dict_fh, '<', $dictpath)
        or die "Cannot open dictionary file '$dictpath': $!\n";

    while (my $currentword = <$dict_fh>) {
        chomp $currentword;
        utf8::decode($currentword);    # the word list is expected to be UTF-8

        my @syllables = map { convertcmupronounce(convertenglishipa($_)) }
                        proc_sil(transcribe($currentword));

        # join() puts the separator between syllables only.  Comparing each
        # syllable with the last one, as was done before, lost the space
        # whenever an earlier syllable equalled the last ("P-AA P-AA").
        print join(" ", @syllables), "\n";
    }

    close($dict_fh);
}
sub convertenglishipa {
    # Convert one syllable from Spanish IPA to an English-IPA approximation.
    #
    # Resources used as guides:
    #   http://en.wikipedia.org/wiki/IPA_chart_for_Spanish
    #   http://en.wikipedia.org/wiki/IPA_for_English
    #   http://www.aucel.com/pln/ (see "Símbolos no AFI")
    #
    # Parameter: a character string (not bytes) of Spanish IPA symbols.
    # Returns:   the converted string.
    #
    # The rules are applied in order; each one sees the result of the
    # previous ones, so the order of this table matters.
    my $spanishipa = shift;

    my @rules = (
        # Symbols that don't exist in English IPA
        [ qr/ɾ/ => 'r' ],
        [ qr/β/ => 'b' ],
        [ qr/ɣ/ => 'g' ],
        [ qr/y/ => 'j' ],
        [ qr/ʝ/ => 'j' ],
        [ qr/ʎ/ => 'j' ],
        [ qr/M/ => 'n' ],
        [ qr/N/ => 'n' ],
        [ qr/ñ/ => 'ŋ' ],
        [ qr/ɲ/ => 'ŋ' ],

        # Vowel sequences, changed according to English pronunciation
        [ qr/ai/ => 'aɪ'  ],
        [ qr/au/ => 'aʊ'  ],
        [ qr/eu/ => 'eʊ'  ],
        [ qr/ja/ => 'jɒ'  ],
        [ qr/jo/ => 'oʊ'  ],
        [ qr/oi/ => 'ɔɪ'  ],
        [ qr/ou/ => 'oʊ'  ],
        [ qr/wa/ => 'wɒ'  ],
        [ qr/wo/ => 'wəʊ' ],

        # Remaining plain vowels
        [ qr/a/ => 'ɑ' ],
        # Skip an "o" that is already the first half of an "oʊ" written by
        # the rules above; expanding it again produced "oʊʊ".
        [ qr/o(?!ʊ)/ => 'oʊ' ],
        [ qr/e/ => 'ɛ' ],
    );

    for my $rule (@rules) {
        my ($pattern, $replacement) = @{$rule};
        $spanishipa =~ s/$pattern/$replacement/g;
    }

    return $spanishipa;
}
sub convertcmupronounce {
    # Convert one syllable from English IPA to CMU (Arpabet) phones.
    #
    # Resources used as guides:
    #   http://www.speech.cs.cmu.edu/cgi-bin/cmudict
    #   http://en.wikipedia.org/wiki/Arpabet
    #
    # Parameter: a character string of English IPA symbols; stress marks (')
    #            are removed.
    # Returns:   the phones joined by "-", e.g. "N-OW".
    #
    # Two-symbol sequences (diphthongs and affricates) are converted before
    # single symbols.  When the single-symbol rules ran first they consumed
    # the components, so the two-symbol rules could never match: "oʊ" came
    # out as "OW-UH" and "t∫" as "T-SH".  The r-coloured rules that used to
    # be listed (ɛr, ʊr, ɔr, ɑr, ɪr, aʊr) produced exactly what the vowel
    # rule followed by the r rule produces, so they are not listed again.
    my $englishipa = shift;

    $englishipa =~ s/'//g;

    my @rules = (
        # Diphthongs and affricates first
        [ qr/eɪ/     => 'EY' ],
        [ qr/aɪ/     => 'AY' ],
        [ qr/oʊ/     => 'OW' ],
        [ qr/aʊ/     => 'AW' ],
        [ qr/ɔɪ/     => 'OY' ],
        [ qr/t[∫ʃ]/  => 'CH' ],    # this file writes U+222B; accept IPA esh too
        [ qr/dʒ/     => 'JH' ],

        # Single vowels
        [ qr/ɔ/ => 'AO' ],
        [ qr/ɒ/ => 'AO' ],
        [ qr/ɑ/ => 'AA' ],
        [ qr/i/ => 'IY' ],
        [ qr/u/ => 'UW' ],
        [ qr/ɛ/ => 'EH' ],
        [ qr/ɪ/ => 'IH' ],
        [ qr/ʊ/ => 'UH' ],
        [ qr/ʌ/ => 'AH' ],
        [ qr/ə/ => 'AH' ],
        [ qr/æ/ => 'AE' ],
        [ qr/o/ => 'OW' ],
        [ qr/ɝ/ => 'ER' ],
        [ qr/ɚ/ => 'ER' ],

        # Consonants
        [ qr/p/    => 'P'  ],
        [ qr/b/    => 'B'  ],
        [ qr/t/    => 'T'  ],
        [ qr/d/    => 'D'  ],
        [ qr/k/    => 'K'  ],
        [ qr/g/    => 'G'  ],
        [ qr/f/    => 'F'  ],
        [ qr/v/    => 'V'  ],
        [ qr/θ/    => 'TH' ],
        [ qr/ð/    => 'DH' ],
        [ qr/s/    => 'S'  ],
        [ qr/z/    => 'Z'  ],
        [ qr/[∫ʃ]/ => 'SH' ],
        [ qr/ʒ/    => 'ZH' ],
        [ qr/h/    => 'HH' ],
        [ qr/m/    => 'M'  ],
        [ qr/n/    => 'N'  ],
        [ qr/ŋ/    => 'NG' ],
        [ qr/l/    => 'L'  ],
        [ qr/r/    => 'R'  ],
        [ qr/j/    => 'Y'  ],
        [ qr/w/    => 'W'  ],
        [ qr/x/    => 'HH' ],
    );

    # Every phone is written in upper case followed by "-", and every pattern
    # is a lower-case or IPA symbol, so a later rule never re-matches the
    # output of an earlier one.
    for my $rule (@rules) {
        my ($pattern, $phone) = @{$rule};
        $englishipa =~ s/$pattern/${phone}-/g;
    }

    # Remove the final character, normally the trailing "-".  As before, a
    # syllable that ends in an unconverted character (such as the "?" that
    # proc_sil uses for an unrecognised syllable) loses that character.
    chop($englishipa);

    return $englishipa;
}
################################################################# | |
#### AUTOR: Xavier López Morrás | |
#### | |
#### SCRIPT: Transcriptor fonético automático del español | |
#### EMAIL: lopezx@gmail.com | |
################################################################## | |
##################################################################### | |
#### Puedes hacer uso libre del script y código a nivel personal. | |
#### Para otras finalidades consultar al autor. | |
##################################################################### | |
## INPUT: escritura ordinaria. OUTPUT: transcripcion fonetica. | |
sub caracteres {
    # Percent-encode the accented letters and punctuation marks listed below
    # using their ISO-8859-1 codes; every other character is left untouched.
    # Parameter: the text to encode.  Returns: the encoded text.
    my $Frasev = shift;

    my %codigo = (
        'ñ' => '%F1', 'á' => '%E1', 'é' => '%E9', 'í' => '%ED',
        'ó' => '%F3', 'ú' => '%FA', ',' => '%2C', '!' => '%21',
        '¿' => '%BF', '?' => '%3F', 'ü' => '%FC', 'Ü' => '%DC',
        'Í' => '%CD', 'Ú' => '%DA', 'Á' => '%C1', 'É' => '%C9',
        'Ó' => '%D3', ':' => '%3A', '"' => '%22',
    );

    # None of the replacement texts contains a character that is itself
    # encoded, so one pass gives the same result as one pass per character.
    $Frasev =~ s/([ñáéíóú,!¿?üÜÍÚÁÉÓ:"])/$codigo{$1}/g;

    return $Frasev;
}
sub transcribe {
    # Transcribe Spanish text in ordinary spelling into a broad phonetic
    # string (no syllable boundaries, no stress marks).
    #
    # Parameter: a character string; words may be separated by spaces or "+".
    # Returns:   the transcription.  Besides plain letters it uses
    #              A E I O U     vowels written with an accent mark
    #              β ð γ         weakened b/v, d and g
    #              θ  t∫  x      c/z before e,i  -  ch  -  j (and g before e,i)
    #              λ  ñ  ŋ  M N  ll, ñ, and nasals assimilated to the next consonant
    #              ř  r          the two r sounds
    #            Characters with no rule (digits, hyphens, ...) are dropped.
    #
    # Changes with respect to the previous version:
    #   * a raw "ü"/"Ü" becomes "w" and accented capitals are lower-cased;
    #     before, only the %XX forms were handled and the raw characters were
    #     silently dropped ("pingüino" lost its w, "África" its first vowel);
    #   * "c" before "é" and "g" before "é"/"í" get the same sound as before
    #     the unaccented vowel, and the "u" of "gué"/"guí" is silent;
    #   * all working variables are lexical instead of package globals.
    my $oracion = shift;

    # Percent-escapes left over from the web form this code was written for.
    # The order is significant and kept as it was.
    my @escapes = (
        [ '%F1' => 'ñ' ], [ '%E1' => 'á' ], [ '%E9' => 'é' ], [ '%ED' => 'í' ],
        [ '%F3' => 'ó' ], [ '%FA' => 'ú' ], [ '%3A' => '|' ], [ '%22' => ''  ],
        [ '%FC' => 'w' ], [ '%DC' => 'w' ], [ '%CD' => 'í' ], [ '%DA' => 'ú' ],
        [ '%C1' => 'á' ], [ '%C9' => 'é' ], [ '%D3' => 'ó' ], [ '%2C' => ',' ],
        [ '%21' => ',' ], [ '%BF' => ',' ], [ '%3F' => ',' ],
    );
    for my $escape (@escapes) {
        my ($code, $char) = @{$escape};
        $oracion =~ s/$code/$char/g;
    }

    $oracion =~ tr/+/ /;
    $oracion =~ tr/ABCDEFGHIJKLMNÑOPQRSTUVWXYZÁÉÍÓÚ/abcdefghijklmnñopqrstuvwxyzáéíóú/;
    $oracion =~ s/[üÜ]/w/g;    # same mapping as %FC / %DC above

    # Letters whose output depends on nothing else:
    #   letter => [ output, value for $voc, whether to clear the space flag ]
    my %plain_vowel = (
        'a' => [ 'a', 'a',  1 ],
        'e' => [ 'e', 'e',  1 ],
        'o' => [ 'o', 'o',  1 ],
        'á' => [ 'A', 'a',  1 ],
        'é' => [ 'E', 'e',  1 ],
        'í' => [ 'I', 'ii', 0 ],
        'ó' => [ 'O', 'o',  1 ],
        'ú' => [ 'U', 'uu', 0 ],
    );
    #   letter => [ output, value for $rasgo ]
    my %plain_consonant = (
        'f' => [ 'f',  'f' ],
        'j' => [ 'x',  'x' ],
        'k' => [ 'k',  'k' ],
        'ñ' => [ 'ñ',  'ñ' ],
        'p' => [ 'p',  'p' ],
        't' => [ 't',  't' ],
        'w' => [ 'w',  'w' ],
        'x' => [ 'ks', 'x' ],
        'z' => [ 'θ',  'Z' ],
    );

    my $medida = length $oracion;

    # Character at a forward position, or '' past the end of the text.
    my $char_at = sub {
        my ($index) = @_;
        return $index < $medida ? substr($oracion, $index, 1) : '';
    };

    my $vowel_or_liquid = qr/[aeiouáéíóúrl]/;
    my $front_vowel     = qr/\A[eiéí]\z/;

    my $esp   = 0;    # 1 after a space, until a letter clears it
    my $voc   = 0;    # kind of the previous vowel, 0 after a non-vowel
    my $rasgo = 0;    # class of the previous sound ("vocal", "r", "l", ...)
    my $transcripcion = '';
    my $n = 0;

    while ($n < $medida) {
        my $c      = substr($oracion, $n, 1);
        my $vsig   = $char_at->($n + 1);            # next character
        my $vant   = substr($oracion, $n - 1, 1);   # previous one (the last character when $n is 0)
        my $esps   = $vsig eq ' ' ? 1 : 0;          # a space follows
        my $vsigg  = $char_at->($n + 2);            # character after the next
        my $impres = '';                            # output for this character

        # Look through a following space and through a silent "h".
        if ($vsigg eq ' ' || $vsig eq ' ') {
            $vsigg = $char_at->($n + 3);
        }
        if ($vsig eq ' ') {
            $vsig = $char_at->($n + 2);
        }
        if ($vsig eq 'h' && $c ne 'c') {
            $vsig = $vsigg;
        }

        # Previous sounds after which "g" is weakened.
        my $soft_g = ($rasgo eq 'vocal' || $rasgo eq 's' || $rasgo eq 'r' || $rasgo eq 'l');

        if (my $vowel = $plain_vowel{$c}) {
            ($impres, $voc) = @{$vowel}[0, 1];
            $rasgo = 'vocal';
            $esp = 0 if $vowel->[2];
        }
        elsif (my $consonant = $plain_consonant{$c}) {
            ($impres, $rasgo) = @{$consonant};
        }
        elsif ($c eq '.' || $c eq ',' || $c eq ';') {
            $rasgo = 0;
        }
        elsif ($c eq ' ') {
            $impres = ' ';
            $esp = 1;
        }
        elsif ($c eq 'b' || $c eq 'v') {
            my $weak = ($rasgo eq 'vocal' || $rasgo eq 'l' || $rasgo eq 'r')
                    && $vsig =~ $vowel_or_liquid;
            $impres = $weak ? 'β' : 'b';
            $rasgo = 'b';
        }
        elsif ($c eq 'c') {
            if ($vsig eq 'h') {
                $impres = 't∫';
                $n++;                        # the "h" is consumed too
                $rasgo = 'tS';
            }
            elsif ($vsig =~ $front_vowel) {
                $impres = 'θ';
                $rasgo = 'Z';
            }
            else {
                $impres = 'k';
                $rasgo = 'k';
            }
        }
        elsif ($c eq 'd') {
            my $weak = ($rasgo eq 'vocal' || $rasgo eq 'r')
                    && $vsig =~ $vowel_or_liquid;
            $impres = $weak ? 'ð' : 'd';
            $rasgo = 'd';
        }
        elsif ($c eq 'g') {
            if ($vsig eq 'a' || $vsig eq 'w' || $vsig eq 'o') {
                $impres = $soft_g ? 'γ' : 'g';
            }
            elsif ($vsig =~ $front_vowel) {
                $impres = 'x';
            }
            elsif ($vsig eq 'u' || $vsig eq 'ú') {
                $n++ if $vsigg =~ $front_vowel;    # silent "u" of gue/gui
                $impres = $soft_g ? 'γ' : 'g';
            }
            elsif ($vsig eq 'r' || $vsig eq 'l') {
                $impres = $rasgo eq 'vocal' ? 'γ' : 'g';
            }
            else {
                $impres = 'g';
            }
            $rasgo = 'g';
        }
        elsif ($c eq 'h') {
            # silent: no output, state unchanged
        }
        elsif ($c eq 'i') {
            if ($rasgo eq 'vocal' || $vsig =~ /[aeiouáéíó]/) {
                if ($vant !~ / /) {
                    $impres = 'j';           # glide next to another vowel
                    $rasgo = 'vocal';
                    $voc = 'ï';
                }
            }
            else {
                $impres = 'i';
                $rasgo = 'vocal';
                $voc = 'i';
            }
            if ($vant =~ / /) {              # word-initial "i" is a full vowel
                $impres = 'i';
                $esp = 0;
            }
        }
        elsif ($c eq 'l') {
            if ($vsig eq 'l' && $vsigg ne 'l' && !$esps) {
                $impres = 'λ';               # "ll" inside a word
                $n++;
            }
            elsif ($vsig eq 'l' && $vsigg ne 'l' && $esps) {
                $impres = 'l l';             # "l" + space + "l"
                $n += 2;
            }
            elsif ($vsig eq 'l' && $vsigg eq 'l' && $esps) {
                $impres = 'λ λ';
                $n += 3;
            }
            else {
                $impres = 'l';
                $esp = 0;
            }
            $rasgo = 'l';
        }
        elsif ($c eq 'm') {
            $impres = $vsig eq 'f' ? 'M' : 'm';
            $rasgo = 'm';
        }
        elsif ($c eq 'n') {
            if ($vsig eq 't' || $vsig eq 'd' || $vsig eq 'z') {
                $impres = 'N';
            }
            elsif (($vsig eq 'c' || $vsig eq 'q')
                && ($vsigg eq 'a' || $vsigg eq 'o' || $vsigg eq 'u')) {
                $impres = 'ŋ';
            }
            elsif ($vsig eq 'b' || $vsig eq 'v' || $vsig eq 'p' || $vsig eq 'm') {
                $impres = 'm';
            }
            elsif ($vsig eq 'g' || $vsig eq 'j') {
                $impres = 'ŋ';
            }
            elsif ($vsig eq 'f') {
                $impres = 'M';
            }
            elsif ($vsig eq 'c' && ($vsigg eq 'e' || $vsigg eq 'i')) {
                $impres = 'N';
            }
            elsif (($vsig eq 'y' && $vsigg =~ /a|e|i|o|u/)
                || ($vsig eq 'l' && $vsigg eq 'l')) {
                $impres = 'ñ';
            }
            else {
                $impres = 'n';
            }
            $rasgo = 'n';
        }
        elsif ($c eq 'q') {
            $impres = 'k';
            $n++;                            # skip the "u" of "qu"
            $rasgo = 'q';
        }
        elsif ($c eq 'r') {
            if ($rasgo =~ /\A[tdpbkgf]\z/) {
                $impres = 'r';
                $rasgo = 'r';
            }
            elsif ($vsig eq 'r') {
                $rasgo = 'r';                # first half of "rr": nothing yet
            }
            elsif ($rasgo eq 'r' && $esp != 1) {
                $impres = 'ř';               # second half of "rr"
                $rasgo = 'R';
            }
            elsif ($rasgo eq 'vocal' && $esp != 1) {
                $impres = 'r';
                $rasgo = 'r';
            }
            elsif ($rasgo ne 'vocal' && $esp == 0) {
                $impres = 'ř';
                $rasgo = 'r';
            }
            elsif ($esp == 1 && $rasgo ne 'R') {
                $impres = 'ř';               # word-initial r
                $rasgo = 'R';
            }
            elsif ($esp == 1 && $rasgo eq 'R') {
                $impres = 'r';
                $rasgo = 'R';
                $esp = 0;
            }
            else {
                $impres = '*';
            }
        }
        elsif ($c eq 's') {
            if (   $vsig eq 'b' || $vsig eq 'v' || $vsig eq 'd'
                || ($vsig eq 'g' && $vsigg ne 'e' && $vsigg ne 'i')
                || $vsig eq 'l' || $vsig eq 'm' || $vsig eq 'n') {
                $impres = 'z';
                $rasgo = 'vocal';            # as before: a voiced s counts as a vowel for the next sound
            }
            else {
                $impres = 's';
                $rasgo = 's';
            }
        }
        elsif ($c eq 'u') {
            if ($rasgo eq 'vocal' && $voc ne 'ï') {
                $impres = 'w';
                $rasgo = 'vocal';
                $voc = 'w';
            }
            elsif ($vsig =~ /[aeouáéó]/) {
                $impres = 'w';
            }
            else {
                $impres = 'u';
                $rasgo = 'vocal';
                $voc = 'u';
            }
            $esp = 0;
        }
        elsif ($c eq 'y') {
            if ($vsig =~ /[aeiouáéíóú]/ && !$esps) {
                $impres = 'y';
                $rasgo = 'vocal';
                $voc = 'ï';
            }
            elsif ($rasgo eq 'vocal' || $esps) {
                $impres = 'i';
                $voc = $rasgo eq 'vocal' ? 'ï' : 'i';
                $rasgo = 'vocal';
            }
            else {
                $impres = 'y';
            }
        }

        $voc = 0 if $rasgo ne 'vocal';
        $n++;
        $transcripcion .= $impres;
    }

    return $transcripcion;
}
sub proc_sil {
    # Split a phonetic transcription (as returned by transcribe) into
    # syllables and mark the stressed syllable of every word.
    #
    # Parameter: the transcription; words are separated by single spaces.
    # Returns:   the list of syllables of all words, in order.  A stressed
    #            syllable starts with "'".  Vowels are returned in lower case
    #            and the sounds are written ɣ θ β ɾ t∫ ð ʎ ŋ.  A stretch that
    #            matches no syllable pattern is returned as "?".
    #
    # Changes with respect to the previous version:
    #   * the placeholder used internally for "ŋ" is converted back to "ŋ"
    #     (the old statement replaced it with itself, so it leaked out);
    #   * "no pattern matched" is tracked with undef instead of a numeric
    #     comparison against the string "8xz";
    #   * the test of $encontrada, which was never set, is gone;
    #   * all working variables are lexical instead of package globals.
    my $transcripcion = shift;

    # Use one-character placeholders so that every sound is one character.
    my @to_placeholder = (
        [ 'γ'  => 'G' ], [ 'θ' => 'Z' ], [ 'β' => 'B' ], [ 'ř' => 'R' ],
        [ 't∫' => 'X' ], [ 'ð' => 'D' ], [ 'λ' => 'L' ], [ 'ŋ' => 'ç' ],
    );
    for my $pair (@to_placeholder) {
        my ($sound, $placeholder) = @{$pair};
        $transcripcion =~ s/$sound/$placeholder/g;
    }
    $transcripcion =~ s/^ +//;

    my $vowel      = qr/[aeiouAEIOU]/;     # upper case = written accent
    my $weak_vowel = qr/[aeiou]/;
    my $consonant  = qr/[bBcdDfgGhklLmnNñprRstvxXyzZ]/;
    my $coda       = qr/[nszrNpkmMlLZçdDgb]/;
    my $glide_coda = qr/[cksznNplrçm]/;    # codas accepted after glide + vowel
    my $stop       = qr/[tkpgGdDfbB]/;     # first member of an onset cluster
    my $liquid     = qr/[rl]/;             # second member of an onset cluster
    my $glide      = qr/[jw]/;

    my @phrase_sil;

    for my $palabra (split / /, $transcripcion) {
        my $largo = length $palabra;

        # Letter at a negative offset from the end of the word, or '' when
        # the offset lies before the start of the word.
        my $letter_at = sub {
            my ($offset) = @_;
            return -$offset <= $largo ? substr($palabra, $offset, 1) : '';
        };

        my @word;
        my $n2 = -1;    # the word is consumed from its end towards its start

        while ($letter_at->($n2) =~ /./) {
            my ($letra0, $letra1, $letra2, $letra3, $letra4) =
                map { $letter_at->($n2 - $_) } 0 .. 4;
            my $silaba;    # stays undef when no pattern matches

            # The patterns are tried one after another and are NOT mutually
            # exclusive: a later match replaces $silaba, and the adjustments
            # of $n2 add up, exactly as before.

            # CV
            if ($letra0 =~ $vowel && $letra1 =~ $consonant) {
                unless ($letra2 =~ $stop && $letra1 =~ $liquid) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }
            # CVC
            if ($letra0 =~ $coda && $letra1 =~ $vowel && $letra2 =~ $consonant) {
                unless ($letra3 =~ $stop && $letra2 =~ $liquid) {
                    $silaba = $letra2 . $letra1 . $letra0;
                    $n2 -= 2;
                }
            }
            # V
            if ($letra0 =~ $vowel) {
                unless ($letra1 =~ $consonant || $letra1 =~ $glide) {
                    $silaba = $letra0;
                }
            }
            # VC
            if ($letra0 =~ $coda && $letra1 =~ /[aeiouAEIOUwj]/) {
                unless ($letra2 =~ $consonant || $letra2 =~ $glide) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }
            # VCC
            if ($letra0 =~ /[s]/ && $letra1 =~ /[kn]/ && $letra2 =~ $vowel) {
                $silaba = $letra2 . $letra1 . $letra0;
                $n2 -= 2;
            }
            # CCV
            if ($letra0 =~ $vowel && $letra1 =~ $liquid && $letra2 =~ $stop) {
                $silaba = $letra2 . $letra1 . $letra0;
                $n2 -= 2;
            }
            # CCVC
            if (   $letra0 =~ $coda && $letra1 =~ $vowel
                && $letra2 =~ $liquid && $letra3 =~ $stop) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 3;
            }
            # CVCC
            if (   $letra0 =~ /[s]/ && $letra1 =~ /[b]/
                && $letra2 =~ $vowel && $letra3 =~ /[s]/) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 3;
            }

            # --- diphthongs ---
            # CVv and Vv
            if ($letra0 =~ $glide && $letra1 =~ $weak_vowel) {
                if ($letra2 =~ $consonant) {
                    $silaba = $letra2 . $letra1 . $letra0;
                    $n2 -= 2;
                }
                else {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }
            # CvV and vV
            if ($letra0 =~ $vowel && $letra1 =~ $glide) {
                if ($letra2 =~ $consonant) {
                    $silaba = $letra2 . $letra1 . $letra0;
                    $n2 -= 2;
                }
                elsif ($letra0 =~ $weak_vowel) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }
            # CvVC
            if (   $letra0 =~ $glide_coda && $letra1 =~ $vowel
                && $letra2 =~ $glide && $letra3 =~ $consonant) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 3;
            }
            # CCvV
            if (   $letra0 =~ $weak_vowel && $letra1 =~ $glide
                && $letra2 =~ $liquid && $letra3 =~ $stop) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 3;
            }
            # CCVv
            if (   $letra0 =~ $glide && $letra1 =~ $weak_vowel
                && $letra2 =~ $liquid && $letra3 =~ $stop) {
                $silaba = $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 3;
            }
            # CCvVC
            if (   $letra0 =~ $glide_coda && $letra1 =~ $weak_vowel
                && $letra2 =~ $glide && $letra3 =~ $liquid && $letra4 =~ $stop) {
                $silaba = $letra4 . $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 4;
            }
            # --- end of diphthongs ---

            # CCVCC
            if (   $letra0 =~ /s/ && $letra1 =~ /[n]/ && $letra2 =~ $vowel
                && $letra3 =~ /[r]/ && $letra4 =~ $stop) {
                $silaba = $letra4 . $letra3 . $letra2 . $letra1 . $letra0;
                $n2 -= 4;
            }

            # Other cases: a glide with no vowel before it
            if ($letra0 =~ $glide) {
                unless ($letra1 =~ /[aeioubBcdDfgGhklLmnNñprRstvxXyzZ]/) {
                    $silaba = $letra0;
                }
                if ($letra1 =~ $consonant) {
                    $silaba = $letra1 . $letra0;
                    $n2--;
                }
            }

            $silaba = '?' unless defined $silaba;

            unshift @word, $silaba;
            $n2--;
        }

        # A syllable with a written accent (upper-case vowel) is stressed.
        my $tonica_encontrada = 0;
        for my $silaba (@word) {
            if ($silaba =~ /[AEIOU]/) {
                $silaba = "'" . $silaba;
                $tonica_encontrada = 1;
            }
        }

        # Default stress when the word has no written accent.
        unless ($tonica_encontrada) {
            if (@word > 1) {
                if ($word[-1] =~ /[rlZdD]$/) {
                    # last syllable ends in one of these consonants
                    $word[-1] = "'" . $word[-1];
                }
                else {
                    $word[-2] = "'" . $word[-2];
                }
            }
            if (@word == 1 && $word[0] =~ /ir|ba|Ba/) {
                $word[0] = "'" . $word[0];
            }
            elsif (@word == 1 && $word[0] =~ /[^']..+/) {
                # monosyllable of three or more letters, except "los"/"las"
                unless ($word[0] =~ /^lo[sz]$|^la[sz]$/) {
                    $word[0] = "'" . $word[0];
                }
            }
        }

        push @phrase_sil, @word;
    }

    # Replace the placeholders by the sounds they stand for.
    for my $silaba (@phrase_sil) {
        $silaba =~ s/G/ɣ/g;
        $silaba =~ s/Z/θ/g;
        $silaba =~ s/B/β/g;
        $silaba =~ s/R/ɾ/g;
        $silaba =~ s/X/t∫/g;
        $silaba =~ s/D/ð/g;
        $silaba =~ s/ç/ŋ/g;
        $silaba =~ tr/AEIOU/aeiou/;
        $silaba =~ s/L/ʎ/g;
    }

    return @phrase_sil;
}
############################### fin separador de silabas ################################################## | |
1; | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment