Last active
September 20, 2019 19:36
-
-
Save ftfarias/a37a4a29c5bc4be87e97c572f6b8af12 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Matches any run of one or more space characters (U+0020 only).
DOUBLE_SPACES_REMOVER = re.compile(r'[ ]+')


def remove_double_spaces_re(text):
    """Collapse every run of spaces in *text* into a single space.

    Only the plain space character is affected; tabs and newlines are left
    untouched. Leading and trailing runs are collapsed too, not removed.
    """
    collapsed = DOUBLE_SPACES_REMOVER.sub(' ', text)
    return collapsed
%timeit remove_double_spaces_re('ads sadf scbvcxb ret h fdgh jj gh erty ') | |
4.3 µs ± 62.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) | |
############################ | |
def remove_double_spaces(text):
    """Join the space-separated words of *text* with exactly one space.

    Splitting on a literal space yields empty strings for consecutive,
    leading and trailing spaces; dropping them also strips the ends.
    """
    words = filter(None, text.split(' '))
    return " ".join(words)
%timeit remove_double_spaces('ads sadf scbvcxb ret h fdgh jj gh erty ') | |
3.22 µs ± 5.82 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each) | |
############################ | |
import unicodedata | |
def strip_accents(s):
    """Return *s* with non-spacing marks (accents) removed.

    The text is decomposed with NFD so that each accent becomes a separate
    code point of category ``Mn``; those code points are then dropped.
    Characters from other scripts are kept as they are.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.') | |
Out[4]: 'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.' | |
############################ | |
import unicodedata | |
def remove_accents(input_str):
    """Return an ASCII-only version of *input_str*.

    The text is decomposed with NFKD, then every code point that cannot be
    encoded as ASCII (combining accents, but also any non-Latin character)
    is silently discarded.
    """
    ascii_bytes = unicodedata.normalize('NFKD', input_str).encode('ASCII', 'ignore')
    return ascii_bytes.decode('ASCII')
remove_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.') | |
'Formacao estrelar na Grande Nuvem de Magalhaes, uma galaxia irregular.' | |
import re | |
# Anything that is NOT an ASCII letter/digit, an accented Portuguese letter,
# a space, or one of . , ! * ( ) | - is removed.
_NON_PORTUGUESE_CHARS = re.compile(
    r"[^a-zA-Z0-9ãÃõÕçÇáÁéÉíÍóÓúÚâÂêÊîÎôÔûÛàÀ.,!* \(\)|\-]+",
    flags=re.UNICODE,
)


def normalize_portugues(s):
    """Remove every character that is not plain Portuguese text.

    :param s: The input string.
    :returns: *s* with all characters outside the whitelist deleted
        (nothing is inserted in their place).

    Fixes over the previous version:

    * ``re.UNICODE`` used to be passed as the fourth positional argument of
      ``re.sub``, which is ``count``; that capped the number of replacements
      at 32 instead of setting a flag. The flag is now given to the compiled
      pattern.
    * Uppercase ``Ã`` and ``Õ`` are now whitelisted like their lowercase
      forms.
    """
    return _NON_PORTUGUESE_CHARS.sub("", s)
print(normalize_portugues('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')) | |
Formação estrelar na Grande Nuvem de Magalhães, uma galáxia irregular. | |
##################################### | |
import unicodedata | |
def remove_accents_keep_other(input_str):
    """Strip combining marks from *input_str*, keeping all other characters.

    The text is decomposed with NFKD and every code point whose canonical
    combining class is non-zero is dropped, so non-Latin scripts survive.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return u"".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)
remove_accents_keep_other('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.') | |
'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.' | |
##################################### | |
!sudo python3 -m pip install unidecode | |
Collecting unidecode | |
Downloading Unidecode-0.04.21-py2.py3-none-any.whl (228kB) | |
100% |################################| 235kB 3.2MB/s ta 0:00:01 | |
Installing collected packages: unidecode | |
Successfully installed unidecode-0.4.21 | |
##################################### | |
from unidecode import unidecode | |
s = "Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular."
t = unidecode(s)
# Sanity check: encoding to ASCII raises UnicodeEncodeError if any non-ASCII
# character is left, so passing here means unidecode replaced all of them.
t.encode("ascii")
print(t)
Formacao estrelar na Grande Xing Shuo Dao ikl Nuvem de Magalhaes, uma galaxia irregular. | |
##################################### | |
import unicodedata as ud | |
def _base_char(ch):
    """Return the base character of the single character *ch*."""
    # An empty default avoids ValueError for characters without a name
    # (e.g. control characters); they then fall through unchanged.
    head, sep, _ = ud.name(ch, '').partition(' WITH ')
    if not sep:
        return ch
    try:
        return ud.lookup(head)
    except KeyError:
        # The truncated name does not denote a character of its own.
        return ch


def rmdiacritics(char):
    '''
    Return the base character(s) of char, by "removing" any
    diacritics like accents or curls and strokes and the like.

    Each character's Unicode name is cut at " WITH " and the remaining
    name is looked up again (e.g. "LATIN SMALL LETTER C WITH CEDILLA"
    becomes "LATIN SMALL LETTER C").

    char may be a single character or a string of any length; every
    character is processed independently. Characters that have no
    Unicode name, or whose shortened name matches no character, are
    returned unchanged.
    '''
    return ''.join(_base_char(ch) for ch in char)
rmdiacritics('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.') | |
##################################### | |
import re | |
import unicodedata | |
def strip_accents(text):
    """
    Strip accents from input String.

    The text is decomposed with NFD and then reduced to ASCII, so accents
    are removed and any character with no ASCII base form is dropped.

    :param text: The input string. UTF-8 encoded bytes are accepted too.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    # Decode byte input explicitly. This replaces the old
    # ``unicode(text, 'utf-8')`` / ``except NameError`` trick, which raised
    # TypeError for unicode input on Python 2 and left bytes undecoded on
    # Python 3.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    decomposed = unicodedata.normalize('NFD', text)
    ascii_only = decomposed.encode('ascii', 'ignore').decode('ascii')
    return str(ascii_only)
strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.') | |
'Formacao estrelar na Grande Nuvem de Magalhaes, uma galaxia irregular.' | |
def text_to_slug(text):
    """
    Convert input text to a slug.

    The text is lower-cased and passed through ``strip_accents``, each run
    of spaces becomes one underscore, and every remaining character other
    than ASCII letters, digits, ``_`` and ``-`` is removed.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    plain = strip_accents(text.lower())
    underscored = re.sub('[ ]+', '_', plain)
    return re.sub('[^0-9a-zA-Z_-]', '', underscored)
| |
# Demo call; the function is named text_to_slug (text_to_id is undefined).
text_to_slug('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
Out[21]: | |
'formacao_estrelar_na_grande_nuvem_de_magalhaes_uma_galaxia_irregular' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment