Skip to content

Instantly share code, notes, and snippets.

@paolo-losi
Created January 26, 2011 15:18
Show Gist options
  • Save paolo-losi/796823 to your computer and use it in GitHub Desktop.
Save paolo-losi/796823 to your computer and use it in GitHub Desktop.
# -*- coding: utf8 -*-
import sys
import unicodedata
import re
def compress_whitespace(string):
    """Collapse every run of whitespace in *string* into a single space.

    ``split()`` with no separator splits on any whitespace run and drops
    empty fields, so leading and trailing whitespace disappears as well.
    An empty or all-whitespace input yields an empty string.
    """
    elements = string.split()
    return " ".join(elements)
# Spacing characters that are typed after a letter in place of a real accent.
pseudo_accents = (u'\N{apostrophe}',
                  u'\N{acute accent}',
                  u'\N{grave accent}')

# Combining marks left behind when NFKD splits an accented letter.
combining_accents = (u'\N{combining acute accent}',
                     u'\N{combining grave accent}',
                     u'\N{combining circumflex accent}')


def convert_accents(s):
    """Return *s* in NFKD form with accent marks turned into apostrophes.

    Every character listed in ``combining_accents`` or ``pseudo_accents``
    is replaced by an ASCII apostrophe, so ``u'perch\\xe9'`` becomes
    ``u"perche'"``.

    *s* must be a text string (``unicode`` on Python 2, ``str`` on
    Python 3); anything else fails the assertion.
    """
    # type(u'') is the text type on both Python 2 and Python 3, where the
    # name ``unicode`` no longer exists.
    assert isinstance(s, type(u''))
    # Fold the pseudo-accents before normalising: NFKD rewrites U+00B4
    # (ACUTE ACCENT) as SPACE + U+0301, which would otherwise leave a stray
    # space in front of the resulting apostrophe.
    for a in pseudo_accents:
        s = s.replace(a, u"'")
    s = unicodedata.normalize('NFKD', s)
    # Second pass also covers pseudo-accents that only appear after
    # normalisation (e.g. fullwidth grave accent -> U+0060).
    for a in combining_accents + pseudo_accents:
        s = s.replace(a, u"'")
    return s
#TODO remove
def remove_accents(s):
    """Deprecated alias for ``convert_accents``.

    Emits a warning telling the caller to switch, then returns
    ``convert_accents(s)`` unchanged.
    """
    import warnings
    # stacklevel=2 attributes the warning to the caller's line rather than
    # to this wrapper.
    warnings.warn('replace with convert_accents', stacklevel=2)
    return convert_accents(s)
# An ASCII vowel followed by exactly one combining or pseudo accent.  The
# pattern contains no backslashes, so a plain u'' literal is equivalent to
# the old ur'' form, which Python 3 rejects as a syntax error.
strip_re = re.compile(u'([aeiouAEIOU])[' + u''.join(combining_accents)
                      + u''.join(pseudo_accents) + u']')


def strip_accents(s):
    """Return *s* decomposed with NFKD, minus one accent after each vowel.

    An accent is any character in ``combining_accents`` or
    ``pseudo_accents``; it is removed only when it directly follows an
    ASCII vowel, and only the first such mark after the vowel is removed.

    U+00B4 (ACUTE ACCENT) is deliberately kept out of the normalisation,
    so an occurrence that does not follow a vowel stays as U+00B4.

    *s* must be a text string (``unicode`` on Python 2, ``str`` on
    Python 3); anything else fails the assertion.
    """
    # type(u'') is the text type on both Python 2 and Python 3.
    assert isinstance(s, type(u''))
    # NFKD would rewrite U+00B4 as SPACE + U+0301, putting a space between
    # the vowel and the mark so strip_re could never match it.  Normalise
    # the text around each U+00B4 instead; it is a starter, so it does not
    # take part in combining-mark reordering across the split points.
    acute = u'\N{acute accent}'
    s = acute.join(unicodedata.normalize('NFKD', part)
                   for part in s.split(acute))
    return strip_re.sub(r'\1', s)
# Typographic single quotes that get folded to a plain ASCII apostrophe.
strange_chars = (u'\N{right single quotation mark}',
                 u'\N{left single quotation mark}')


def cleanup_strange_chars(s):
    """Return *s* with each character in ``strange_chars`` replaced by ``'``.

    *s* must be a text string (``unicode`` on Python 2, ``str`` on
    Python 3); anything else fails the assertion.
    """
    # type(u'') is the text type on both Python 2 and Python 3, where the
    # name ``unicode`` no longer exists.
    assert isinstance(s, type(u''))
    for c in strange_chars:
        s = s.replace(c, u"'")
    return s
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment