Skip to content

Instantly share code, notes, and snippets.

@ftfarias
Last active September 20, 2019 19:36
Show Gist options
  • Save ftfarias/a37a4a29c5bc4be87e97c572f6b8af12 to your computer and use it in GitHub Desktop.
Save ftfarias/a37a4a29c5bc4be87e97c572f6b8af12 to your computer and use it in GitHub Desktop.
import re
# Pre-compiled once at import time so repeated calls skip the pattern lookup.
# Matches any run of one or more literal space characters (U+0020 only).
DOUBLE_SPACES_REMOVER = re.compile(r'[ ]+')


def remove_double_spaces_re(text):
    """Collapse every run of consecutive spaces in ``text`` to one space.

    Only the plain space character is affected; tabs and newlines are left
    untouched. Leading and trailing runs are collapsed to a single space,
    not stripped.

    :param text: The input string.
    :returns: ``text`` with each run of spaces replaced by a single space.
    """
    return DOUBLE_SPACES_REMOVER.sub(' ', text)
%timeit remove_double_spaces_re('ads sadf scbvcxb ret h fdgh jj gh erty ')
4.3 µs ± 62.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
############################
def remove_double_spaces(text):
    """Collapse runs of spaces to a single space and trim both ends.

    Splitting on a literal ``' '`` produces an empty string for every extra
    space (including leading and trailing ones); dropping those and
    re-joining leaves exactly one space between the remaining pieces.
    Tabs and newlines are not treated as separators.

    :param text: The input string.
    :returns: The words of ``text`` joined by single spaces.
    """
    # str.join needs a real sequence, so building the list directly avoids
    # the generator-to-list conversion join would otherwise do internally.
    return ' '.join([piece for piece in text.split(' ') if piece])
%timeit remove_double_spaces('ads sadf scbvcxb ret h fdgh jj gh erty ')
3.22 µs ± 5.82 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
############################
import unicodedata
def strip_accents(s):
    """Remove accents from ``s`` while keeping every other character.

    The string is canonically decomposed (NFD) so that each accented letter
    becomes a base letter followed by combining marks; characters in the
    Unicode category ``Mn`` (nonspacing mark) are then dropped. Characters
    from other scripts pass through unchanged.

    :param s: The input string.
    :returns: ``s`` in NFD form with all nonspacing marks removed.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
Out[4]: 'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.'
############################
import unicodedata
def remove_accents(input_str):
    """Reduce ``input_str`` to plain ASCII, turning accented letters into
    their base letters.

    The string is compatibility-decomposed (NFKD) and then encoded to ASCII
    with ``errors='ignore'``, so combining marks are discarded. Note that
    every other non-ASCII character (CJK, Greek, ...) is discarded as well.

    :param input_str: The input string.
    :returns: An ASCII-only string.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    # Round-trip through bytes: 'ignore' silently drops anything non-ASCII.
    return nfkd_form.encode('ASCII', 'ignore').decode('ASCII')
remove_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
​ 'Formacao estrelar na Grande Nuvem de Magalhaes, uma galaxia irregular.'
import re
# Matches runs of characters that are NOT in the Portuguese whitelist:
# ASCII letters and digits, Portuguese accented letters in both cases,
# and the punctuation . , ! * ( ) | - plus the space character.
# Uppercase Ã and Õ are included so words such as "AÇÃO" survive intact,
# matching how every other accented letter is listed in both cases.
_PT_DISALLOWED = re.compile(
    r"[^a-zA-Z0-9ãÃõÕçÇáÁéÉíÍóÓúÚâÂêÊîÎôÔûÛàÀ.,!* \(\)|\-]+",
    flags=re.UNICODE,
)


def normalize_portugues(s):
    """Remove every character outside the Portuguese whitelist from ``s``.

    Kept characters are ASCII letters and digits, the accented letters used
    in Portuguese, spaces, and the punctuation ``. , ! * ( ) | -``.
    Everything else is deleted (not replaced), so removing a token that was
    surrounded by spaces leaves both spaces behind.

    :param s: The input string.
    :returns: ``s`` with all non-whitelisted characters removed.
    """
    # The flag is given at compile time. Passing re.UNICODE as the fourth
    # positional argument of re.sub sets ``count`` (to 32), which silently
    # stops the clean-up after the 32nd match.
    return _PT_DISALLOWED.sub("", s)
print(normalize_portugues('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.'))
Formação estrelar na Grande Nuvem de Magalhães, uma galáxia irregular.
#####################################
import unicodedata
def remove_accents_keep_other(input_str):
    """Remove accents from ``input_str`` but keep non-Latin characters.

    The string is compatibility-decomposed (NFKD) and every combining
    character (nonzero canonical combining class) is dropped. Unlike an
    ASCII round-trip, characters from other scripts are preserved. Because
    NFKD is used, compatibility characters such as ligatures are expanded.

    :param input_str: The input string.
    :returns: The NFKD form of ``input_str`` without combining characters.
    """
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return ''.join(c for c in nfkd_form if not unicodedata.combining(c))
remove_accents_keep_other('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.'
#####################################
!sudo python3 -m pip install unidecode
Collecting unidecode
Downloading Unidecode-0.04.21-py2.py3-none-any.whl (228kB)
100% |################################| 235kB 3.2MB/s ta 0:00:01
Installing collected packages: unidecode
Successfully installed unidecode-0.4.21
#####################################
from unidecode import unidecode
# Demo: transliterate a sample sentence to plain ASCII with the third-party
# `unidecode` package imported above.
s="Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular."
t=unidecode(s)
t.encode("ascii") # sanity check only (result is discarded): does not raise because unidecode already replaced every non-ASCII character in s
print(t)
Formacao estrelar na Grande Xing Shuo Dao ikl Nuvem de Magalhaes, uma galaxia irregular.
#####################################
import unicodedata as ud
def rmdiacritics(char):
    '''
    Return the base character(s) of char, by "removing" any
    diacritics like accents or curls and strokes and the like.

    Works from the Unicode character name: everything from " WITH "
    onwards is cut off and the remaining name is looked up again
    (e.g. "LATIN SMALL LETTER O WITH STROKE" -> "LATIN SMALL LETTER O").

    char may be a single character or a longer string; each character
    is converted independently and the results are joined, so the
    output has the same length as the input. Characters that have no
    Unicode name, or whose shortened name does not exist, are returned
    unchanged.
    '''
    return ''.join(_rmdiacritics_one(c) for c in char)


def _rmdiacritics_one(c):
    '''Return the base character of the single character c.'''
    try:
        desc = ud.name(c)
    except ValueError:
        # Unnamed code points (e.g. control characters such as "\n").
        return c
    cutoff = desc.find(' WITH ')
    if cutoff == -1:
        # No modifier in the name: the character is its own base.
        return c
    try:
        return ud.lookup(desc[:cutoff])
    except KeyError:
        # The shortened name is not a real character name; keep c as is.
        return c
rmdiacritics('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
#####################################
import re
import unicodedata
def strip_accents(text):
    """
    Strip accents from input String and reduce it to plain ASCII.

    Accented letters are replaced by their base letters; any other
    non-ASCII character (CJK, Greek, ...) is dropped entirely.

    :param text: The input string. UTF-8 encoded bytes are also accepted.
    :type text: String.
    :returns: The processed String, containing ASCII characters only.
    :rtype: String.
    """
    # Byte input (the default string type on Python 2) is decoded first;
    # text that is already unicode is used as is.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    # NFD splits each accented letter into base letter + combining marks,
    # which the ASCII encode with 'ignore' then discards.
    decomposed = unicodedata.normalize('NFD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return str(ascii_bytes.decode('utf-8'))
strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
'Formacao estrelar na Grande Nuvem de Magalhaes, uma galaxia irregular.'
def text_to_slug(text):
    """
    Convert input text to a slug.

    The text is lower-cased and passed through ``strip_accents``, each run
    of spaces becomes a single underscore, and every remaining character
    that is not an ASCII letter, digit, underscore or hyphen is removed.

    :param text: The input string.
    :type text: String.
    :returns: The processed String.
    :rtype: String.
    """
    slug = strip_accents(text.lower())
    # Spaces are converted before the clean-up pass so that word
    # boundaries survive as underscores instead of being deleted.
    slug = re.sub(r'[ ]+', '_', slug)
    return re.sub(r'[^0-9a-zA-Z_-]', '', slug)
text_to_slug('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
Out[21]:
'formacao_estrelar_na_grande_nuvem_de_magalhaes_uma_galaxia_irregular'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment