Created
April 30, 2010 01:32
-
-
Save jazzido/384576 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# very old (1999?) code for syllabification of Spanish words
# based on this thesis: http://www.fismat.umich.mx/~karina/tesisLicenciatura/capitulo2.html
# Author: Manuel Aristarán (jazzido.com)
# License: Creative Commons Atribución 2.5 Argentina -- http://creativecommons.org/licenses/by/2.5/ar/
# usage: determina_caso('cadena de entrada')
# Alphabet tables shared by every function in this module.
#
# Letters treated as vowels. 'y' is deliberately listed here (and therefore
# absent from `consonantes`), and the dieresis form 'ü' is included so that
# words such as "pingüino" are not cut in two at the 'ü'.
vocales = u'aeiouyáéíóúüÁÉÍÓÚÜAEIOUY'
# Plain vowels only (no 'y', no accents); used to detect a repeated vowel
# such as the "ee" in "leer".
vocalesny = u'aeiouAEIOU'
# Letters treated as consonants. Must stay disjoint from `vocales`: the
# upper-case half previously contained 'E', which made 'E' both a vowel and a
# consonant.
consonantes = u'qwrtpsdfghjklñzxcvbnmQWRTPSDFGHJKLÑZXCVBNM'
# Two-consonant groups that are never split across syllables.
consonantes_inseparables = (
    u'BR', u'BL', u'CR', u'CL', u'DR', u'FR', u'FL', u'GR', u'GL', u'KR',
    u'LL', u'PR', u'PL', u'TR', u'RR', u'CH',
    u'br', u'bl', u'cr', u'cl', u'dr', u'fr', u'fl', u'gr', u'gl', u'kr',
    u'll', u'pr', u'pl', u'tr', u'rr', u'ch',
)
def determina_caso(palabra):
    """Syllabify *palabra* and return its "jeringozo" transformation.

    The text is consumed from left to right. Characters that are neither in
    `vocales` nor in `consonantes` are copied through unchanged and mark a
    word boundary. Each run of letters is cut into syllables by
    `segmenta_caso_1` (starts with a vowel), `segmenta_caso_2` (consonant then
    vowel) or `segmenta_caso_3` (anything else); every syllable is passed
    through `jeringozar`, and syllables of one word are joined with '-'.
    For example "hola" becomes "hopo-lapa".

    If a segmenter reports failure (a falsy result), the rest of the current
    letter run is emitted unchanged instead of raising, so the function always
    makes progress and always returns a string.
    """
    if not palabra:
        return ''

    letras = vocales + consonantes  # built once instead of on every iteration
    partes = []
    inicio_de_palabra = True  # no '-' is written before a word's first syllable

    while palabra:
        primera = palabra[0]

        if primera not in letras:
            # Punctuation, digits, whitespace: copy through and start a new word.
            partes.append(primera)
            palabra = palabra[1:]
            inicio_de_palabra = True
            continue

        if primera in vocales:
            s = segmenta_caso_1(palabra)
        elif len(palabra) > 1 and palabra[1] in vocales:
            # The length guard replaces the old `l2` variable, which was
            # unbound (or left over from a previous syllable) when only one
            # letter remained.
            s = segmenta_caso_2(palabra)
        else:
            s = segmenta_caso_3(palabra)

        if s:
            salida = jeringozar(s)
            if salida is None:
                # Syllable without a vowel: keep it as it is.
                salida = s
        else:
            # Segmentation failed: pass the remaining letters of this word
            # through untouched.
            fin = 1
            while fin < len(palabra) and palabra[fin] in letras:
                fin += 1
            s = salida = palabra[:fin]

        if not inicio_de_palabra:
            partes.append('-')
        partes.append(salida)
        inicio_de_palabra = False
        palabra = palabra[len(s):]

    return ''.join(partes)
def segmenta_caso_1(palabra):
    """Return the first syllable of *palabra*, which starts with a vowel.

    Patterns tried, shortest first (V = vowel, C = consonant):
    V, VC, VV, VVC. A pattern is accepted when the text that follows it is
    the end of the word, a consonant-vowel pair, or an inseparable consonant
    group (see `es_fin_palabra`, `es_cv`, `es_ccv`).

    Returns the syllable as a prefix of *palabra*, or 0 when no pattern fits.
    Every failure path now returns 0; previously some returned 0 and others
    fell through and returned None.
    """
    def corta_en(n):
        # True when a syllable may end right before index n.
        resto = palabra[n:]
        return es_cv(resto) or es_ccv(resto) or es_fin_palabra(resto)

    # C1,1  V
    if corta_en(1):
        return palabra[:1]

    # corta_en(1) was false, so palabra[1] exists and is a letter.
    if palabra[1] in consonantes:
        # C1,2  VC
        if corta_en(2):
            return palabra[:2]
        return 0

    if palabra[1] in vocales:
        # C1,3  VV
        if corta_en(2):
            return palabra[:2]
        # C1,4  VVC
        if palabra[2] in consonantes and corta_en(3):
            return palabra[:3]

    return 0
def segmenta_caso_2(palabra):
    """Return the first syllable of *palabra*, which starts consonant + vowel.

    Patterns tried (C = consonant, V = vowel): CV, CVC, CVCC, CVV, CVVC,
    CVVV, CVVVC. A pattern is accepted when the text after it is the end of
    the word, a consonant-vowel pair, or an inseparable consonant group
    (CVCC does not accept the consonant group, as in the original rules).

    In the CVV and CVVC cases two identical plain vowels are split
    ("leer" -> "le" + "er"): only the CV prefix is returned.
    NOTE: other hiatuses (e.g. "po-e-ta") are not detected by this rule.

    Returns the syllable as a prefix of *palabra*, or 0 when no pattern fits.
    Every failure path now returns 0; previously several fell through and
    returned None.
    """
    def corta_en(n):
        # True when a syllable may end right before index n.
        resto = palabra[n:]
        return es_cv(resto) or es_ccv(resto) or es_fin_palabra(resto)

    def vocal_repetida():
        # Consonant followed by the same plain vowel twice, e.g. "lee".
        return (palabra[0] not in vocalesny
                and palabra[1] in vocalesny
                and palabra[2] in vocalesny
                and palabra[1] == palabra[2])

    # C2,1  CV
    if corta_en(2):
        return palabra[:2]

    # corta_en(2) was false, so palabra[2] exists and is a letter.
    if palabra[2] in consonantes:
        # C2,2  CVC
        if corta_en(3):
            return palabra[:3]
        # C2,3  CVCC
        if palabra[3] in consonantes:
            resto = palabra[4:]
            if es_fin_palabra(resto) or es_cv(resto):
                return palabra[:4]
        return 0

    # C2,4  CVV
    if corta_en(3):
        if vocal_repetida():
            return palabra[:2]
        return palabra[:3]

    if palabra[3] in consonantes:
        # C2,5  CVVC
        if corta_en(4):
            if vocal_repetida() and palabra[3] not in vocalesny:
                return palabra[:2]
            return palabra[:4]
        return 0

    # C2,6  CVVV
    if corta_en(4):
        return palabra[:4]

    # C2,7  CVVVC
    if palabra[4] in consonantes and corta_en(5):
        return palabra[:5]

    return 0
def segmenta_caso_3(palabra):
    """Return the first syllable of *palabra*, which starts with two consonants.

    Patterns tried (C = consonant, V = vowel): CCV, CCVC, CCVCC, CCVV, CCVVC.
    A pattern is accepted when the text after it is the end of the word, a
    consonant-vowel pair, or an inseparable consonant group (CCVCC does not
    accept the consonant group, as in the original rules).

    In the CCVV and CCVVC cases two identical plain vowels are split
    ("creen" -> "cre" + "en"): only the CCV prefix is returned. That rule was
    contributed by Santiago Bruno.

    Two defects of the previous version are fixed here:
    * CCVCC was tested before CCVC and never checked that the fifth character
      is a consonant, so "flor de" produced "flor " (with the space) and
      "plancha" produced "planc". CCVC is now tried first and CCVCC requires
      a consonant at index 4.
    * The CCVVC branch inspected palabra[5] instead of the closing consonant
      palabra[4], raising IndexError for five-letter words such as "creen".

    Returns the syllable as a prefix of *palabra*, or 0 when no pattern fits.
    """
    def corta_en(n):
        # True when a syllable may end right before index n.
        resto = palabra[n:]
        return es_cv(resto) or es_ccv(resto) or es_fin_palabra(resto)

    def vocal_repetida():
        # Two consonants followed by the same plain vowel twice, e.g. "cree".
        return (palabra[0] not in vocalesny
                and palabra[1] not in vocalesny
                and palabra[2] in vocalesny
                and palabra[3] in vocalesny
                and palabra[2] == palabra[3])

    # C3,1  CCV
    if corta_en(3):
        return palabra[:3]

    # corta_en(3) was false, so palabra[3] exists and is a letter.
    if palabra[3] in consonantes:
        # C3,2  CCVC
        if corta_en(4):
            return palabra[:4]
        # C3,3  CCVCC -- corta_en(4) was false, so palabra[4] exists.
        if palabra[4] in consonantes:
            resto = palabra[5:]
            if es_cv(resto) or es_fin_palabra(resto):
                return palabra[:5]
        return 0

    # C3,4  CCVV
    if corta_en(4):
        if vocal_repetida():
            return palabra[:3]
        return palabra[:4]

    # C3,5  CCVVC -- corta_en(4) was false, so palabra[4] exists.
    if corta_en(5):
        if vocal_repetida() and palabra[4] not in vocalesny:
            return palabra[:3]
        return palabra[:5]

    return 0
# es_ccv() and es_cv() are prefix tests used by the segmenters: es_ccv() looks
# for a leading inseparable consonant pair (the onset of a
# consonant-consonant-vowel syllable) and es_cv() for a leading
# consonant-vowel pair.
def es_ccv(palabra):
    """Tell whether *palabra* starts with consonant-consonant-vowel.

    The two consonants must form one of the groups listed in
    `consonantes_inseparables` and must be followed by a letter from
    `vocales`. The vowel was not checked before, so a trailing "ch"
    (as in "Zurich") was mistaken for the start of a new syllable.

    Returns a bool. The old try/except was dropped: slicing never raises, so
    the handler could not run.
    """
    if len(palabra) < 3:
        return False
    return palabra[:2] in consonantes_inseparables and palabra[2] in vocales
def es_cv(palabra):
    """Tell whether *palabra* starts with a consonant followed by a vowel.

    Returns False when fewer than two characters are available. The previous
    version relied on a bare ``except`` to turn the IndexError into None,
    which also hid any unrelated error; the explicit length check keeps the
    same falsy answer without swallowing exceptions.
    """
    if len(palabra) < 2:
        return False
    return palabra[0] in consonantes and palabra[1] in vocales
def es_fin_palabra(palabra):
    """Tell whether *palabra* is positioned at the end of a word.

    That is the case when nothing is left, or when the next character is not
    a letter (it appears in neither `vocales` nor `consonantes`).

    Returns True/False. The earlier version returned 1/None through three
    branches, one of which (``len(palabra) == 0``) could never be reached;
    True compares equal to 1, so truthiness-based callers are unaffected.
    """
    if not palabra:
        return True
    return palabra[0] not in vocales + consonantes
def jeringozar(silaba):
    """Return *silaba* with the "jeringozo" infix applied.

    The general rule appends 'p' plus the last vowel of the syllable
    ("la" -> "lapa"). A three-letter syllable ending in 'y' after a plain
    vowel puts the infix before the 'y' instead ("soy" -> "sopoy",
    "hoy" -> "hopoy").

    A syllable that contains no vowel (including the empty string) is
    returned unchanged. It used to yield None, which made the caller's string
    concatenation fail.

    The scan uses ``reversed()`` rather than ``range(...).reverse()``, which
    only works on Python 2 where ``range`` returns a list.
    """
    # Special case "soy" -> "sopoy"; it does not depend on the scan below, so
    # it is checked once up front.
    if len(silaba) == 3 and silaba[2] == "y" and silaba[1] in vocalesny:
        return silaba[0] + silaba[1] + 'p' + silaba[1] + silaba[2]

    # Walk from the end so the last vowel is the one that gets echoed.
    for letra in reversed(silaba):
        if letra in vocales:
            return silaba + 'p' + letra

    return silaba
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment