Skip to content

Instantly share code, notes, and snippets.

@jazzido
Created April 30, 2010 01:32
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jazzido/384576 to your computer and use it in GitHub Desktop.
Save jazzido/384576 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
# very old (1999?) code for syllabification of Spanish words
# based on this thesis: http://www.fismat.umich.mx/~karina/tesisLicenciatura/capitulo2.html
# Author: Manuel Aristarán (jazzido.com)
# License: Creative Commons Atribución 2.5 Argentina -- http://creativecommons.org/licenses/by/2.5/ar/
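# usage: determina_caso(u'cadena de entrada') -- returns the text split into syllables joined by '-', each syllable carrying its jeringozo suffix ('p' + vowel)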
# Characters treated as vowels.  'y' always counts as a vowel here, and the
# dieresis u (as in "pinguino" written with "gü") is included so it is not
# mistaken for a non-letter that ends the word.
vocales = u'aeiouyáéíóúüÁÉÍÓÚÜAEIOUY'

# Plain vowels only: no 'y' and no accented forms.  Used when checking for two
# identical adjacent vowels and for the "-Vy" special case in jeringozar().
vocalesny = u'aeiouAEIOU'

# Characters treated as consonants.  The upper-case half mirrors the
# lower-case half exactly; it must not contain any vowel, otherwise that
# letter would match both the vowel and the consonant tests.
consonantes = u'qwrtpsdfghjklñzxcvbnmQWRTPSDFGHJKLÑZXCVBNM'

# Two-consonant clusters that are never split between syllables.
consonantes_inseparables = (
    u'BR', u'BL', u'CR', u'CL', u'DR', u'FR', u'FL', u'GR',
    u'GL', u'KR', u'LL', u'PR', u'PL', u'TR', u'RR', u'CH',
    u'br', u'bl', u'cr', u'cl', u'dr', u'fr', u'fl', u'gr',
    u'gl', u'kr', u'll', u'pr', u'pl', u'tr', u'rr', u'ch',
)
def determina_caso(palabra):
    """Split ``palabra`` into syllables and return its jeringozo encoding.

    The text is consumed left to right.  Characters that are neither in
    ``vocales`` nor in ``consonantes`` are copied through unchanged and start
    a new word.  Each run of letters is cut into syllables by
    ``segmenta_caso_1`` (starts with a vowel), ``segmenta_caso_2`` (consonant
    followed by a vowel) or ``segmenta_caso_3`` (anything else).  Every
    syllable is passed through ``jeringozar`` and the syllables of one word
    are joined with '-'.

    Returns the encoded text; an empty input yields an empty string.
    """
    letras = vocales + consonantes
    piezas = []
    inicio_de_palabra = True

    while palabra:
        primera = palabra[0]

        if primera not in letras:
            # Punctuation, spaces, digits...: copy verbatim and reset the
            # separator so the next syllable is not prefixed with '-'.
            piezas.append(primera)
            palabra = palabra[1:]
            inicio_de_palabra = True
            continue

        if primera in vocales:
            # case 1: the word starts with a vowel
            silaba = segmenta_caso_1(palabra)
        elif len(palabra) > 1 and palabra[1] in vocales:
            # case 2: consonant + vowel.  The length guard matters: a lone
            # trailing consonant has no second letter to inspect.
            silaba = segmenta_caso_2(palabra)
        else:
            # case 3: two leading consonants (or a lone consonant)
            silaba = segmenta_caso_3(palabra)

        if not silaba:
            # No pattern matched (the segmenters report that with a falsy
            # value).  Emit the rest of this run of letters as one unsplit
            # chunk so the loop always advances instead of failing.
            corte = 1
            while corte < len(palabra) and palabra[corte] in letras:
                corte += 1
            silaba = palabra[:corte]

        codificada = jeringozar(silaba)
        if codificada is None:
            # jeringozar() found no vowel to repeat: keep the chunk as is.
            codificada = silaba

        if not inicio_de_palabra:
            piezas.append('-')
        piezas.append(codificada)
        inicio_de_palabra = False
        palabra = palabra[len(silaba):]

    return ''.join(piezas)
def segmenta_caso_1(palabra):
    """Return the first syllable of a word that starts with a vowel.

    Patterns are tried in this order (V = vowel, C = consonant):
    V, VC, VV, VVC.  A pattern is accepted only when the text after it either
    ends the word, starts with consonant + vowel, or starts with an
    inseparable consonant pair.

    Returns the leading slice of ``palabra`` that forms the syllable, or 0
    when none of the patterns applies.
    """
    def puede_cortar(resto):
        # True-ish when a syllable boundary may be placed right before resto.
        return es_fin_palabra(resto) or es_cv(resto) or es_ccv(resto)

    # C1,1: V
    if puede_cortar(palabra[1:]):
        return palabra[:1]

    # The word does not end after the first letter, so palabra[1] is a letter.
    if palabra[1] in consonantes:
        # C1,2: VC
        if puede_cortar(palabra[2:]):
            return palabra[:2]
        return 0

    if palabra[1] in vocales:
        # C1,3: VV
        if puede_cortar(palabra[2:]):
            return palabra[:2]
        # C1,4: VVC
        if palabra[2] in consonantes and puede_cortar(palabra[3:]):
            return palabra[:3]

    # No pattern matched.
    return 0
def segmenta_caso_2(palabra):
    """Return the first syllable of a word that starts with consonant + vowel.

    Patterns are tried in this order (V = vowel, C = consonant):
    CV, CVC, CVCC, CVV, CVVC, CVVV, CVVVC.  Except for CVCC, a pattern is
    accepted when the text after it ends the word, starts with consonant +
    vowel, or starts with an inseparable consonant pair; CVCC is accepted only
    before consonant + vowel or at the end of the word.

    When the two vowels of a CVV / CVVC match are the same plain vowel
    (e.g. "leer"), only the CV part is returned so the vowels end up in
    different syllables.

    Returns the leading slice of ``palabra`` that forms the syllable, or 0
    when none of the patterns applies.
    """
    def puede_cortar(resto):
        # True-ish when a syllable boundary may be placed right before resto.
        return es_cv(resto) or es_ccv(resto) or es_fin_palabra(resto)

    def vocal_repetida():
        # Consonant followed by the same plain vowel twice.
        return (palabra[0] not in vocalesny
                and palabra[1] in vocalesny
                and palabra[2] in vocalesny
                and palabra[1] == palabra[2])

    # C2,1: CV
    if puede_cortar(palabra[2:]):
        return palabra[:2]

    if palabra[2] in consonantes:
        # C2,2: CVC
        if puede_cortar(palabra[3:]):
            return palabra[:3]
        # C2,3: CVCC
        if palabra[3] in consonantes and (
                es_fin_palabra(palabra[4:]) or es_cv(palabra[4:])):
            return palabra[:4]
        return 0

    # From here on the third letter is a vowel: the word did not end after
    # two letters and palabra[2] is not a consonant.

    # C2,4: CVV
    if puede_cortar(palabra[3:]):
        if vocal_repetida():
            return palabra[:2]
        return palabra[:3]

    if palabra[3] in consonantes:
        # C2,5: CVVC
        if puede_cortar(palabra[4:]):
            if vocal_repetida() and palabra[3] not in vocalesny:
                return palabra[:2]
            return palabra[:4]
        return 0

    # C2,6: CVVV
    if puede_cortar(palabra[4:]):
        return palabra[:4]

    # C2,7: CVVVC
    if palabra[4] in consonantes and puede_cortar(palabra[5:]):
        return palabra[:5]

    # No pattern matched.
    return 0
def segmenta_caso_3(palabra):
    """Return the first syllable of a word that starts with two consonants.

    This is also the fallback used when the word matches neither the
    "starts with a vowel" nor the "consonant + vowel" case.

    Patterns are tried in this order (V = vowel, C = consonant):
    CCV, CCVCC, CCVC, CCVV, CCVVC.  CCVCC is accepted only before consonant +
    vowel or at the end of the word; the others are also accepted before an
    inseparable consonant pair.

    When the two vowels of a CCVV / CCVVC match are the same plain vowel
    (e.g. "creen" -> "cre" + "en"), only the CCV part is returned so the
    vowels end up in different syllables.  This rule was added to the
    original code by Santiago Bruno (bananabruno@hotmail.com).

    Returns the leading slice of ``palabra`` that forms the syllable, or 0
    when none of the patterns applies.
    """
    def puede_cortar(resto):
        # True-ish when a syllable boundary may be placed right before resto.
        return es_cv(resto) or es_ccv(resto) or es_fin_palabra(resto)

    # C3,1: CCV
    if puede_cortar(palabra[3:]):
        return palabra[:3]

    if palabra[3] in consonantes:
        # C3,3: CCVCC (tried before CCVC).  The fifth character must really
        # be a consonant; otherwise a punctuation mark right after a CCVC
        # syllable would be swallowed into it.
        if (len(palabra) > 4 and palabra[4] in consonantes
                and (es_cv(palabra[5:]) or es_fin_palabra(palabra[5:]))):
            return palabra[:5]
        # C3,2: CCVC
        if puede_cortar(palabra[4:]):
            return palabra[:4]
        return 0

    # From here on the fourth letter is a vowel: the word did not end after
    # three letters and palabra[3] is not a consonant.
    vocal_repetida = (palabra[0] not in vocalesny
                      and palabra[1] not in vocalesny
                      and palabra[2] in vocalesny
                      and palabra[3] in vocalesny
                      and palabra[2] == palabra[3])

    # C3,4: CCVV
    if puede_cortar(palabra[4:]):
        if vocal_repetida:
            return palabra[:3]
        return palabra[:4]

    # C3,5: CCVVC.  The closing consonant of the pattern is palabra[4]
    # (palabra[4:] is known to be non-empty here because the CCVV check above
    # rejected "end of word" at that position).
    if puede_cortar(palabra[5:]):
        if vocal_repetida and palabra[4] not in vocalesny:
            return palabra[:3]
        return palabra[:5]

    # No pattern matched.
    return 0
# es_ccv() and es_cv() check that the word has the form
# consonant-consonant-vowel or consonant-vowel respectively
def es_ccv(palabra):
    """Tell whether ``palabra`` starts with an inseparable consonant pair.

    Only the first two characters are looked up in
    ``consonantes_inseparables``; the character after the pair is not
    inspected.

    Returns True or False, or None when ``palabra`` cannot be sliced
    (for example when it is None).
    """
    try:
        return palabra[:2] in consonantes_inseparables
    except (TypeError, LookupError):
        # Not a sliceable sequence: report "no match" instead of failing.
        return None
def es_cv(palabra):
    """Tell whether ``palabra`` starts with a consonant followed by a vowel.

    Returns True or False when the test can be evaluated, or None when
    ``palabra`` is too short (or is not an indexable sequence of characters).
    """
    try:
        return (palabra[0] in consonantes) and (palabra[1] in vocales)
    except (LookupError, TypeError):
        # Fewer than two characters available (or not a string at all).
        return None
def es_fin_palabra(palabra):
    """Tell whether a word ends right before ``palabra``.

    That is the case when nothing is left, or when the next character is
    neither in ``vocales`` nor in ``consonantes``.

    Returns 1 at the end of a word and None otherwise.
    """
    if not palabra or palabra[0] not in vocales + consonantes:
        return 1
    return None
def jeringozar(silaba):
    """Return ``silaba`` with its jeringozo suffix ('p' + vowel) added.

    Special case: a three-character syllable ending in "y" whose middle
    character is a plain vowel gets the 'p' + vowel inserted before the "y"
    (soy -> sopoy, hoy -> hopoy).

    Otherwise the last character of the syllable that is in ``vocales`` is
    repeated after a 'p' at the end (ca -> capa, tren -> trenpe).

    Returns the encoded syllable, or None when ``silaba`` is empty or
    contains no vowel.
    """
    # Checked once up front; it does not depend on the scan position.
    if len(silaba) == 3 and silaba[2] == "y" and silaba[1] in vocalesny:
        return silaba[0] + silaba[1] + 'p' + silaba[1] + silaba[2]

    # Scan from the end so the *last* vowel is the one repeated.  reversed()
    # works on both Python 2 and 3, unlike range(...).reverse().
    for letra in reversed(silaba):
        if letra in vocales:
            return silaba + 'p' + letra
    return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment