Skip to content

Instantly share code, notes, and snippets.

@derlin
Last active July 8, 2019 08:01
Show Gist options
  • Save derlin/36ee30b78a27cac9fa4e66e7d94f4ed7 to your computer and use it in GitHub Desktop.
Save derlin/36ee30b78a27cac9fa4e66e7d94f4ed7 to your computer and use it in GitHub Desktop.
GSW utils
import unicodedata
# Code points of the combining marks that are paired with each vowel below.
SG_DIACRITICS = [
    0x0300,  # COMBINING GRAVE ACCENT
    0x0301,  # COMBINING ACUTE ACCENT
    0x0302,  # COMBINING CIRCUMFLEX ACCENT
    0x0308,  # COMBINING DIAERESIS
    # TODO: what about ǜ (U+01DC) and ß ?
]

# One string holding every vowel + diacritic pairing (lower case), ordered
# diacritic-major (all grave forms first, then acute, ...).  The whole string
# is NFC-normalized so each base-letter/combining-mark pair is stored in its
# composed form.
SG_ACCENTED_CHARS = unicodedata.normalize(
    'NFC',
    ''.join(vowel + chr(mark) for mark in SG_DIACRITICS for vowel in 'aeiou'),
)
import re
# Matches a single "out-of-charset" character: a word character (``\w``) that
# is neither a digit, a letter in ``a-z``, nor one of SG_ACCENTED_CHARS.  The
# IGNORECASE flag makes the lower-case members of the class cover their
# upper-case counterparts too (presumably why the earlier hard-coded
# ``äÄöÖüÜa-zA-Z`` class was dropped).
# NOTE(review): ``_`` is a word character and is not excluded, so it is
# matched as well -- confirm that is intended.
# The literal is a *raw* f-string: ``\W`` and ``\d`` are not valid string
# escapes, and a non-raw literal emits an invalid-escape warning
# (DeprecationWarning since Python 3.6, SyntaxWarning since 3.12).
_pattern = re.compile(rf'[^\W\da-z{SG_ACCENTED_CHARS}]', re.IGNORECASE)
def is_sg_charset(sentence, upper_limit=1):
    """Tell whether *sentence* stays within the accepted character set.

    Every character matched by the module-level ``_pattern`` counts as an
    out-of-charset character.

    Args:
        sentence: The text to inspect.
        upper_limit: Highest number of out-of-charset characters that is
            still tolerated (inclusive).

    Returns:
        ``True`` when the number of matches is less than or equal to
        ``upper_limit``, ``False`` otherwise.
    """
    out_of_charset = sum(1 for _ in _pattern.finditer(sentence))
    return out_of_charset <= upper_limit
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment