# fclesio/remove_ptbr_chars_pandas.py

## remove_ptbr_chars_pandas.py
# Special thanks to the user Humberto Diogenes from the Python list (answer from Aug 11, 2008)
# Link: http://python.6.x6.nabble.com/O-jeito-mais-rapido-de-remover-acentos-de-uma-string-td2041508.html

# I found the issue by chance (I swear, haha) but this guy gave the tip before me
# Link: https://github.com/scikit-learn/scikit-learn/issues/12897#issuecomment-518644215

import spacy
from unicodedata import normalize
# Load the spaCy pipeline named 'pt_core_news_sm'.
nlp = spacy.load('pt_core_news_sm')

# Define default stopwords list
# NOTE(review): `spacy.lang.pt` is reached by attribute access on the
# top-level package; presumably it is only importable here because loading
# the pipeline above imports that subpackage -- confirm before reordering
# these two statements.
stoplist = spacy.lang.pt.stop_words.STOP_WORDS

def replace_ptbr_char_by_word(word):
    """Return ``word`` with accents and other non-ASCII characters removed.

    The input is converted with ``str()``, decomposed with Unicode NFKD
    normalization, and then round-tripped through ASCII with
    ``errors='ignore'``, which drops every character that has no ASCII
    representation.

    Args:
        word: Token to clean. Any object is accepted; it is passed through
            ``str()`` first.

    Returns:
        str: The ASCII-only version of ``word``.
    """
    word = str(word)
    # NFKD splits an accented letter into a base letter plus combining
    # marks; encoding with 'ignore' then discards the non-ASCII marks.
    word = normalize('NFKD', word).encode('ASCII', 'ignore').decode('ASCII')
    return word

def remove_pt_br_char_by_text(text):
    """Drop stopwords from ``text`` and strip accents from the other words.

    The text is converted with ``str()`` and split on whitespace. Each word
    that is not in the module-level ``stoplist`` is passed through
    ``replace_ptbr_char_by_word``, and the results are joined with single
    spaces.

    Args:
        text: Text to clean; passed through ``str()`` first.

    Returns:
        str: The cleaned words separated by single spaces.
    """
    text = str(text)
    # Membership is tested on the raw word, before accent removal and
    # without case folding, so a word must match a ``stoplist`` entry exactly
    # to be dropped.
    text = " ".join(
        replace_ptbr_char_by_word(word)
        for word in text.split()
        if word not in stoplist
    )
    return text

# Replace the 'text' column with its cleaned version, value by value.
# NOTE(review): `df` is not defined in this part of the file; presumably a
# pandas DataFrame with a 'text' column -- confirm.
df['text'] = df['text'].apply(remove_pt_br_char_by_text)
	# Special thanks to the user Humberto Diogenes from the Python list (answer from Aug 11, 2008)
	# Link: http://python.6.x6.nabble.com/O-jeito-mais-rapido-de-remover-acentos-de-uma-string-td2041508.html

	# I found the issue by chance (I swear, haha) but this guy gave the tip before me
	# Link: https://github.com/scikit-learn/scikit-learn/issues/12897#issuecomment-518644215

	import spacy
	from unicodedata import normalize
	# Load the spaCy pipeline named 'pt_core_news_sm'.
	nlp = spacy.load('pt_core_news_sm')

	# Define default stopwords list
	# NOTE(review): `spacy.lang.pt` is reached by attribute access on the
	# top-level package; presumably it is only importable here because loading
	# the pipeline above imports that subpackage -- confirm before reordering
	# these two statements.
	stoplist = spacy.lang.pt.stop_words.STOP_WORDS

	def replace_ptbr_char_by_word(word):
		"""Return ``word`` with accents and other non-ASCII characters removed.

		The input is converted with ``str()``, decomposed with Unicode NFKD
		normalization, and then round-tripped through ASCII with
		``errors='ignore'``, which drops every character that has no ASCII
		representation.

		Args:
			word: Token to clean. Any object is accepted; it is passed
				through ``str()`` first.

		Returns:
			str: The ASCII-only version of ``word``.
		"""
		word = str(word)
		# NFKD splits an accented letter into a base letter plus combining
		# marks; encoding with 'ignore' then discards the non-ASCII marks.
		word = normalize('NFKD', word).encode('ASCII', 'ignore').decode('ASCII')
		return word

	def remove_pt_br_char_by_text(text):
		"""Drop stopwords from ``text`` and strip accents from the other words.

		The text is converted with ``str()`` and split on whitespace. Each
		word that is not in the module-level ``stoplist`` is passed through
		``replace_ptbr_char_by_word``, and the results are joined with single
		spaces.

		Args:
			text: Text to clean; passed through ``str()`` first.

		Returns:
			str: The cleaned words separated by single spaces.
		"""
		text = str(text)
		# Membership is tested on the raw word, before accent removal and
		# without case folding, so a word must match a ``stoplist`` entry
		# exactly to be dropped.
		text = " ".join(
			replace_ptbr_char_by_word(word)
			for word in text.split()
			if word not in stoplist
		)
		return text

	# Replace the 'text' column with its cleaned version, value by value.
	# NOTE(review): `df` is not defined in this part of the file; presumably a
	# pandas DataFrame with a 'text' column -- confirm.
	df['text'] = df['text'].apply(remove_pt_br_char_by_text)