ftfarias/clean_data_unicode.py

## clean_data_unicode.py
import re
# Pre-compiled pattern matching any run of one or more space characters.
DOUBLE_SPACES_REMOVER = re.compile(r'[ ]+')


def remove_double_spaces_re(text):
    """Collapse every run of spaces in ``text`` into a single space.

    Only the space character is matched; tabs and newlines are left alone.
    Leading and trailing runs are reduced to one space, not removed.
    """
    collapsed = DOUBLE_SPACES_REMOVER.sub(' ', text)
    return collapsed

%timeit remove_double_spaces_re('ads   sadf   scbvcxb  ret h fdgh jj gh  erty  ')
4.3 µs ± 62.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

############################
def remove_double_spaces(text):
    """Join the non-empty space-separated pieces of ``text`` with one space.

    Splitting on ' ' yields empty strings wherever spaces are adjacent or sit
    at either end of the text; dropping those pieces collapses runs of spaces
    and also strips leading and trailing spaces.
    """
    pieces = filter(None, text.split(' '))
    return " ".join(pieces)

%timeit remove_double_spaces('ads   sadf   scbvcxb  ret h fdgh jj gh  erty  ')
3.22 µs ± 5.82 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

############################

import unicodedata

def strip_accents(s):
    """Return ``s`` with nonspacing combining marks removed.

    The string is decomposed with NFD, then every character whose Unicode
    category is 'Mn' is dropped; all other characters are kept.
    """
    decomposed = unicodedata.normalize('NFD', s)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(base_chars)
strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')

Out[4]: 'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.'


############################

import unicodedata
def remove_accents(input_str):
    """Return an ASCII-only version of ``input_str``.

    The text is decomposed with NFKD and encoded to ASCII with errors
    ignored, so combining marks and every other non-ASCII character are
    dropped; the surviving bytes are decoded back to text.
    """
    return (
        unicodedata.normalize('NFKD', input_str)
        .encode('ASCII', 'ignore')
        .decode('ASCII')
    )

remove_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
 'Formacao estrelar na Grande  Nuvem de Magalhaes, uma galaxia irregular.'


import re
def normalize_portugues(s):
    """Remove every character outside a Portuguese-oriented whitelist.

    Kept characters: ASCII letters and digits, the accented letters listed in
    the pattern, space, and the punctuation ``. , ! * ( ) | -``. Each run of
    any other character is deleted.

    :param s: The input string.
    :returns: ``s`` with all non-whitelisted characters removed.
    """
    # ``flags`` must be passed by keyword: the fourth positional argument of
    # re.sub is ``count``, so passing re.UNICODE (32) there limited the
    # substitution to the first 32 matches and applied no flag at all.
    # NOTE(review): uppercase "Ã" and "Õ" are not in the whitelist although
    # their lowercase forms are, so they get stripped -- confirm intent.
    return re.sub(
        r"[^a-zA-Z0-9ãõçÇáÁéÉíÍóÓúÚâÂêÊîÎôÔûÛàÀ.,!* \(\)|\-]+",
        r"",
        s,
        flags=re.UNICODE,
    )

print(normalize_portugues('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.'))
Formação estrelar na Grande  Nuvem de Magalhães, uma galáxia irregular.

#####################################


import unicodedata
def remove_accents_keep_other(input_str):
    """Drop combining characters from ``input_str`` and keep everything else.

    The text is decomposed with NFKD and each character with a non-zero
    canonical combining class is removed. Other characters, including
    non-Latin scripts, are retained in their NFKD form.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return u"".join(ch for ch in decomposed if unicodedata.combining(ch) == 0)

remove_accents_keep_other('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.'

#####################################

!sudo python3 -m pip install unidecode
Collecting unidecode
  Downloading Unidecode-0.04.21-py2.py3-none-any.whl (228kB)
    100% |################################| 235kB 3.2MB/s ta 0:00:01
Installing collected packages: unidecode
Successfully installed unidecode-0.4.21

#####################################

from unidecode import unidecode
# Sample sentence mixing accented Latin, CJK and Greek characters.
s="Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular."
# Transliterate the sample with the third-party unidecode package.
t=unidecode(s)
# Sanity check only (the result is discarded): encoding to ASCII would raise
# UnicodeEncodeError if any non-ASCII character were still present in t.
t.encode("ascii")  #works fine, because all non-ASCII from s are replaced with their equivalents
print(t)
Formacao estrelar na Grande Xing Shuo Dao ikl Nuvem de Magalhaes, uma galaxia irregular.

#####################################

import unicodedata as ud
def _rmdiacritics_char(char):
    """Return the base character of one character, or the input unchanged."""
    try:
        desc = ud.name(char)
    except ValueError:
        # Code points without a Unicode name (e.g. control characters such
        # as '\n') cannot be mapped to a base character.
        return char
    # Everything before the first ' WITH ' is taken as the base name,
    # e.g. 'LATIN SMALL LETTER A WITH TILDE' -> 'LATIN SMALL LETTER A'.
    base_name, _, _ = desc.partition(' WITH ')
    try:
        return ud.lookup(base_name)
    except KeyError:
        # The truncated name is not itself a character name; keep the input.
        return char


def rmdiacritics(char):
    '''
    Return the base character of char, by "removing" any
    diacritics like accents or curls and strokes and the like.

    The base is found through the Unicode character name: the part of the
    name before ' WITH ' is looked up again. Characters that have no name,
    or whose truncated name does not exist, are returned unchanged.

    :param char: A single character, or a longer string, in which case each
        character is processed independently.
    :returns: The string with every character replaced by its base character.
    '''
    # The Python 2 ``unicode()`` conversion was dropped: it raises NameError
    # on Python 3, where str is already Unicode text.
    return ''.join(_rmdiacritics_char(c) for c in char)
rmdiacritics('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')

#####################################

import re
import unicodedata

def strip_accents(text):
    """
    Strip accents from input String.

    The text is decomposed with NFD and then reduced to ASCII, so combining
    accents are dropped, and so is every other non-ASCII character.

    :param text: The input string; UTF-8 encoded bytes are decoded first.
    :type text: String.
    :returns: The processed String.
    :rtype: String.
    """
    # Decode byte input explicitly instead of probing for the Python 2
    # ``unicode`` builtin via NameError: on Python 3 that probe left bytes
    # undecoded and unicodedata.normalize() then raised TypeError.
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
    # str() is a no-op on Python 3; kept so the native str type is returned.
    return str(ascii_bytes.decode('ascii'))

strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
'Formacao estrelar na Grande  Nuvem de Magalhaes, uma galaxia irregular.'

def text_to_slug(text):
    """
    Build a slug from the input text.

    The text is lower-cased and passed through ``strip_accents``; each run of
    spaces then becomes a single underscore, and every remaining character
    other than ASCII letters, digits, ``_`` and ``-`` is removed.

    :param text: The input string.
    :type text: String.

    :returns: The processed String.
    :rtype: String.
    """
    lowered = strip_accents(text.lower())
    underscored = re.sub('[ ]+', '_', lowered)
    return re.sub('[^0-9a-zA-Z_-]', '', underscored)

# Demo call, using the name the slug helper is defined under (text_to_slug).
text_to_slug('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')

Out[21]:
'formacao_estrelar_na_grande_nuvem_de_magalhaes_uma_galaxia_irregular'
	import re
	# Pre-compiled pattern matching any run of one or more space characters.
	DOUBLE_SPACES_REMOVER = re.compile(r'[ ]+')

	def remove_double_spaces_re(text):
	    """Collapse every run of spaces in ``text`` into a single space.

	    Only the space character is matched; tabs and newlines are left alone.
	    Leading and trailing runs are reduced to one space, not removed.
	    """
	    # Body indentation restored: it had been flattened to the ``def`` column.
	    return DOUBLE_SPACES_REMOVER.sub(' ', text)

	%timeit remove_double_spaces_re('ads sadf scbvcxb ret h fdgh jj gh erty ')
	4.3 µs ± 62.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

	############################
	def remove_double_spaces(text):
	    """Join the non-empty space-separated pieces of ``text`` with one space.

	    Splitting on ' ' yields empty strings wherever spaces are adjacent or
	    sit at either end of the text; dropping those pieces collapses runs of
	    spaces and also strips leading and trailing spaces.
	    """
	    # Body indentation restored: it had been flattened to the ``def`` column.
	    return " ".join(x for x in text.split(' ') if x)

	%timeit remove_double_spaces('ads sadf scbvcxb ret h fdgh jj gh erty ')
	3.22 µs ± 5.82 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)

	############################

	import unicodedata

	def strip_accents(s):
	    """Return ``s`` with nonspacing combining marks removed.

	    The string is decomposed with NFD, then every character whose Unicode
	    category is 'Mn' is dropped; all other characters are kept.
	    """
	    # Body indentation restored: it had been flattened to the ``def`` column.
	    return ''.join(c for c in unicodedata.normalize('NFD', s)
	                   if unicodedata.category(c) != 'Mn')
	strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')

	Out[4]: 'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.'


	############################

	import unicodedata
	def remove_accents(input_str):
	    """Return an ASCII-only version of ``input_str``.

	    The text is decomposed with NFKD and encoded to ASCII with errors
	    ignored, so combining marks and every other non-ASCII character are
	    dropped; the surviving bytes are decoded back to text.
	    """
	    # Body indentation restored: it had been flattened to the ``def`` column.
	    nfkd_form = unicodedata.normalize('NFKD', input_str)
	    only_ascii = nfkd_form.encode('ASCII', 'ignore')
	    return only_ascii.decode('ASCII')

	remove_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
	'Formacao estrelar na Grande Nuvem de Magalhaes, uma galaxia irregular.'


	import re
	def normalize_portugues(s):
	    """Remove every character outside a Portuguese-oriented whitelist.

	    Kept characters: ASCII letters and digits, the accented letters listed
	    in the pattern, space, and the punctuation ``. , ! * ( ) | -``. Each
	    run of any other character is deleted.

	    :param s: The input string.
	    :returns: ``s`` with all non-whitelisted characters removed.
	    """
	    # Body indentation restored (it had been flattened to the ``def``
	    # column). ``flags`` is passed by keyword because the fourth positional
	    # argument of re.sub is ``count``: passing re.UNICODE (32) there
	    # limited the substitution to the first 32 matches.
	    return re.sub(
	        r"[^a-zA-Z0-9ãõçÇáÁéÉíÍóÓúÚâÂêÊîÎôÔûÛàÀ.,!* \(\)\|\-]+",
	        r"",
	        s,
	        flags=re.UNICODE,
	    )

	print(normalize_portugues('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.'))
	Formação estrelar na Grande Nuvem de Magalhães, uma galáxia irregular.

	#####################################


	import unicodedata
	def remove_accents_keep_other(input_str):
	    """Drop combining characters from ``input_str`` and keep everything else.

	    The text is decomposed with NFKD and each character with a non-zero
	    canonical combining class is removed. Other characters, including
	    non-Latin scripts, are retained in their NFKD form.
	    """
	    # Body indentation restored: it had been flattened to the ``def`` column.
	    nfkd_form = unicodedata.normalize('NFKD', input_str)
	    return u"".join(c for c in nfkd_form if not unicodedata.combining(c))

	remove_accents_keep_other('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
	'Formacao estrelar na Grande 行説道ικλ Nuvem de Magalhaes, uma galaxia irregular.'

	#####################################

	!sudo python3 -m pip install unidecode
	Collecting unidecode
	Downloading Unidecode-0.04.21-py2.py3-none-any.whl (228kB)
	100% \|################################\| 235kB 3.2MB/s ta 0:00:01
	Installing collected packages: unidecode
	Successfully installed unidecode-0.4.21

	#####################################

	from unidecode import unidecode
	# Sample sentence mixing accented Latin, CJK and Greek characters.
	s="Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular."
	# Transliterate the sample with the third-party unidecode package.
	t=unidecode(s)
	# Sanity check only (the result is discarded): encoding to ASCII would raise
	# UnicodeEncodeError if any non-ASCII character were still present in t.
	t.encode("ascii") #works fine, because all non-ASCII from s are replaced with their equivalents
	print(t)
	Formacao estrelar na Grande Xing Shuo Dao ikl Nuvem de Magalhaes, uma galaxia irregular.

	#####################################

	import unicodedata as ud
	def _rmdiacritics_char(char):
	    """Return the base character of one character, or the input unchanged."""
	    try:
	        desc = ud.name(char)
	    except ValueError:
	        # Code points without a Unicode name (e.g. control characters)
	        # cannot be mapped to a base character.
	        return char
	    # Everything before the first ' WITH ' is taken as the base name,
	    # e.g. 'LATIN SMALL LETTER A WITH TILDE' -> 'LATIN SMALL LETTER A'.
	    base_name, _, _ = desc.partition(' WITH ')
	    try:
	        return ud.lookup(base_name)
	    except KeyError:
	        # The truncated name is not itself a character name; keep the input.
	        return char

	def rmdiacritics(char):
	    '''
	    Return the base character of char, by "removing" any
	    diacritics like accents or curls and strokes and the like.

	    The base is found through the Unicode character name: the part of the
	    name before ' WITH ' is looked up again. Characters that have no name,
	    or whose truncated name does not exist, are returned unchanged.

	    :param char: A single character, or a longer string, in which case
	        each character is processed independently.
	    :returns: The string with every character replaced by its base
	        character.
	    '''
	    # Indentation restored, and the Python 2 ``unicode()`` conversion
	    # dropped: it raises NameError on Python 3, where str is already text.
	    return ''.join(_rmdiacritics_char(c) for c in char)
	rmdiacritics('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')

	#####################################

	import re
	import unicodedata

	def strip_accents(text):
	    """
	    Strip accents from input String.

	    The text is decomposed with NFD and then reduced to ASCII, so combining
	    accents are dropped, and so is every other non-ASCII character.

	    :param text: The input string; UTF-8 encoded bytes are decoded first.
	    :type text: String.
	    :returns: The processed String.
	    :rtype: String.
	    """
	    # Indentation restored. Byte input is decoded explicitly instead of
	    # probing for the Python 2 ``unicode`` builtin via NameError: on
	    # Python 3 that probe left bytes undecoded and normalize() then failed.
	    if isinstance(text, bytes):
	        text = text.decode('utf-8')
	    ascii_bytes = unicodedata.normalize('NFD', text).encode('ascii', 'ignore')
	    # str() is a no-op on Python 3; kept so the native str type is returned.
	    return str(ascii_bytes.decode('ascii'))

	strip_accents('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')
	'Formacao estrelar na Grande Nuvem de Magalhaes, uma galaxia irregular.'

	def text_to_slug(text):
	    """
	    Build a slug from the input text.

	    The text is lower-cased and passed through ``strip_accents``; each run
	    of spaces then becomes a single underscore, and every remaining
	    character other than ASCII letters, digits, ``_`` and ``-`` is removed.

	    :param text: The input string.
	    :type text: String.

	    :returns: The processed String.
	    :rtype: String.
	    """
	    # Body indentation restored: it had been flattened to the ``def`` column.
	    text = strip_accents(text.lower())
	    text = re.sub('[ ]+', '_', text)
	    text = re.sub('[^0-9a-zA-Z_-]', '', text)
	    return text

	# Demo call, using the name the slug helper is defined under (text_to_slug).
	text_to_slug('Formação estrelar na Grande 行説道ικλ Nuvem de Magalhães, uma galáxia irregular.')

	Out[21]:
	'formacao_estrelar_na_grande_nuvem_de_magalhaes_uma_galaxia_irregular'