Last active
May 10, 2022 13:23
-
-
Save fclesio/fba7714397ec1db11fccee7a8cdd20f5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Special thanks to Humberto Diogenes from the Python mailing list (answer from Aug 11, 2008)
# Link: http://python.6.x6.nabble.com/O-jeito-mais-rapido-de-remover-acentos-de-uma-string-td2041508.html
# I found the issue by chance (I swear, haha), but this person had already given the tip before me
# Link: https://github.com/scikit-learn/scikit-learn/issues/12897#issuecomment-518644215
from unicodedata import normalize

import spacy
# Imported explicitly so the stopword set does not depend on `spacy.load`
# having pulled in the `spacy.lang.pt` submodule as a side effect.
from spacy.lang.pt.stop_words import STOP_WORDS

# Load the small Portuguese pipeline (the model package must be installed).
nlp = spacy.load('pt_core_news_sm')

# Define default stopwords list (spaCy's Portuguese stopword set)
stoplist = STOP_WORDS
def replace_ptbr_char_by_word(word):
    """Return ``word`` with accents stripped and non-ASCII characters dropped.

    The argument is first converted with ``str()``, so non-string values are
    processed through their string representation.

    Args:
        word: The token to clean (any object accepted by ``str()``).

    Returns:
        An ASCII-only ``str``; e.g. ``"ação"`` becomes ``"acao"``.
    """
    # NFKD splits each accented character into its base letter followed by
    # separate combining marks.
    decomposed = normalize('NFKD', str(word))
    # Encoding with 'ignore' discards the combining marks, along with any
    # other character that has no ASCII representation.
    return decomposed.encode('ASCII', 'ignore').decode('ASCII')
def remove_pt_br_char_by_text(text):
    """Drop stopwords from ``text`` and strip accents from the remaining tokens.

    The text is converted with ``str()``, split on whitespace, and each token
    is checked against the module-level ``stoplist`` *before* its accents are
    removed. The check is an exact match on the raw token, so it is
    case-sensitive and tokens with attached punctuation are not filtered.

    Args:
        text: The text to clean (any object accepted by ``str()``).

    Returns:
        The surviving tokens, accent-stripped by
        ``replace_ptbr_char_by_word`` and joined with single spaces.
    """
    tokens = str(text).split()
    return " ".join(
        replace_ptbr_char_by_word(token)
        for token in tokens
        if token not in stoplist
    )
# Example usage: clean every entry of the 'text' column in place.
# NOTE(review): `df` is not defined in this snippet — presumably a pandas
# DataFrame with a 'text' column created elsewhere; confirm before running.
df['text'] = df['text'].apply(remove_pt_br_char_by_text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment