Created
January 26, 2011 15:18
-
-
Save paolo-losi/796823 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf8 -*- | |
import sys | |
import unicodedata | |
import re | |
def compress_whitespace(string):
    """Collapse every run of whitespace in *string* into one space.

    Leading and trailing whitespace disappears as well, because ``split()``
    without arguments yields no empty fields at either end.
    """
    return " ".join(string.split())
# Spacing (non-combining) characters that this module treats like accents.
pseudo_accents = (
    u'\N{APOSTROPHE}',
    u'\N{ACUTE ACCENT}',
    u'\N{GRAVE ACCENT}',
)

# Combining marks that this module recognises as accents.
combining_accents = (
    u'\N{COMBINING ACUTE ACCENT}',
    u'\N{COMBINING GRAVE ACCENT}',
    u'\N{COMBINING CIRCUMFLEX ACCENT}',
)
def convert_accents(s):
    """Return *s* with accent marks rewritten as ASCII apostrophes.

    The text is NFKD-normalized, so a precomposed accented letter becomes
    its base letter followed by a combining mark; every character listed in
    ``combining_accents`` or ``pseudo_accents`` is then replaced by an
    apostrophe (for example, e-acute becomes ``e'``).

    :param s: text to convert; must be a unicode string.
    :returns: the converted text, in NFKD form.
    :raises AssertionError: if *s* is not a unicode string.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(s, type(u''))

    # Pseudo-accents have to be replaced BEFORE normalizing: NFKD rewrites
    # ACUTE ACCENT (U+00B4) as SPACE + COMBINING ACUTE ACCENT, which would
    # otherwise leave a stray space in front of the resulting apostrophe.
    for accent in pseudo_accents:
        s = s.replace(accent, u"'")

    s = unicodedata.normalize('NFKD', s)

    # Pseudo-accents are included again here because normalization can
    # itself produce them (e.g. GREEK VARIA decomposes to GRAVE ACCENT).
    for accent in combining_accents + pseudo_accents:
        s = s.replace(accent, u"'")
    return s
#TODO remove
def remove_accents(s):
    """Deprecated alias of ``convert_accents``.

    Emits a warning attributed to the caller (``stacklevel=2``) and then
    returns ``convert_accents(s)`` unchanged.
    """
    from warnings import warn

    warn('replace with convert_accents', stacklevel=2)
    return convert_accents(s)
# Matches one ASCII vowel (captured as group 1) immediately followed by a
# single accent character from combining_accents or pseudo_accents.
# A plain u'' literal is used instead of ur'': the pattern contains no
# backslashes, and the ur prefix is a syntax error on Python 3.
strip_re = re.compile(
    u'([aeiouAEIOU])['
    + u''.join(combining_accents + pseudo_accents)
    + u']'
)
def strip_accents(s):
    """Return *s* with the accent that follows each ASCII vowel removed.

    The text is NFKD-normalized, so a precomposed accented vowel becomes the
    plain vowel plus a combining mark; ``strip_re`` then drops one accent
    character (combining or pseudo) found directly after a vowel. Accents
    that do not directly follow one of ``aeiouAEIOU`` are left in place.

    :param s: text to process; must be a unicode string.
    :returns: the stripped text, in NFKD form except for spacing acute
        accents, which are kept as they were.
    :raises AssertionError: if *s* is not a unicode string.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3.
    assert isinstance(s, type(u''))

    # Normalize piecewise around ACUTE ACCENT (U+00B4). NFKD would rewrite it
    # as SPACE + COMBINING ACUTE ACCENT, so a vowel followed by a spacing
    # acute could never match strip_re even though the pattern lists it.
    acute = u'\N{ACUTE ACCENT}'
    s = acute.join(
        unicodedata.normalize('NFKD', piece) for piece in s.split(acute)
    )
    return strip_re.sub(r'\1', s)
# Typographic single quotes that cleanup_strange_chars() turns into a plain
# apostrophe.
strange_chars = (
    u'\N{RIGHT SINGLE QUOTATION MARK}',
    u'\N{LEFT SINGLE QUOTATION MARK}',
)
def cleanup_strange_chars(s):
    """Return *s* with every character in ``strange_chars`` replaced by ``'``.

    :param s: text to clean; must be a unicode string.
    :returns: the cleaned unicode string.
    :raises AssertionError: if *s* is not a unicode string.
    """
    # type(u'') is ``unicode`` on Python 2 and ``str`` on Python 3, so the
    # check no longer depends on the Python-2-only ``unicode`` builtin.
    assert isinstance(s, type(u''))
    for quote in strange_chars:
        s = s.replace(quote, u"'")
    return s
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment