Last active
April 7, 2020 17:59
-
-
Save ftfarias/2a5b6bfcebf6bf7d364034ff99dc1d13 to your computer and use it in GitHub Desktop.
Data cleaning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
# Pragmatic e-mail validator: local part (with optional dotted segments),
# '@', domain labels, and a final 2-3 letter TLD.
# BUGFIX: the domain class was '[A-Za-z0-]' (the '9' was truncated, so only
# the digit 0 was allowed); dashes are moved to the end of the classes so
# they are unambiguously literal, and a raw string avoids escape warnings.
EMAIL_REGEXP = r'^[_A-Za-z0-9+-]+(\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\.[A-Za-z0-9]+)*(\.[A-Za-z]{2,3})$'
#from tqdm import tqdm_notebook as tqdm | |
import gensim | |
import collections | |
import nltk | |
from nltk.corpus import stopwords | |
from collections import Counter, defaultdict | |
STOPWORDS = stopwords.words('portuguese') | |
from functools import lru_cache | |
######################## | |
def remove_double_spaces(text):
    """Collapse any run of whitespace into single spaces and trim the ends."""
    parts = text.split()
    return ' '.join(parts)
######################## | |
# Load the misspelling table: each pipe-separated row is
# "correct_word|variant1|variant2|...".  Builds a variant -> correct_word map.
# FIX: the file handle was opened inline and never closed; a 'with' block
# guarantees it is released.
misspelling_dict = {}
with open('misspelling.psv', 'r', encoding='utf-8') as misspelling_file:
    misspelling_csv = csv.reader(misspelling_file, delimiter='|', quotechar='"')
    for row in misspelling_csv:
        try:
            correct_word = row[0].strip()
            misspelled_words = [i.strip() for i in row[1:] if i.strip() != '']
            for w in misspelled_words:
                if len(w) > 0 and len(correct_word) > 0 and w != correct_word:
                    misspelling_dict[w] = correct_word
                else:
                    # Empty or self-referential entry: report it for cleanup.
                    print(row)
        except Exception as e:
            # Malformed row: log it and keep loading the rest of the table.
            print(e)
            print(row)
# TODO: benchmark an alternative layout, e.g. {frozenset(misspelled_terms): correct_word}
def misspelling_word(word):
    """Return ``(corrected_word, 1)`` when *word* is a known misspelling,
    otherwise ``(word, 0)``."""
    corrected = misspelling_dict.get(word)
    if corrected is None:
        return word, 0
    return corrected, 1
def misspelling_words(words):
    """Correct every word in *words*.

    Returns ``(corrected_list, n_corrections)`` where ``n_corrections`` is
    how many words were replaced.
    """
    pairs = [misspelling_word(w) for w in words]
    corrected = [w for w, _ in pairs]
    n_fixed = sum(flag for _, flag in pairs)
    return corrected, n_fixed
def misspelling_string(phrase):
    """Correct misspellings in a space-separated phrase.

    Returns ``(corrected_phrase, n_corrections)``.  Splits on single spaces,
    so consecutive spaces yield empty tokens that survive the rejoin.
    """
    corrected_tokens, n_fixed = misspelling_words(phrase.split(' '))
    corrected_phrase = ' '.join(corrected_tokens)
    return corrected_phrase, n_fixed
# Report how many misspelling variants were loaded.
total_misspellings = len(misspelling_dict)
print('{:,} misspelling'.format(total_misspellings))
######################## | |
# Load collocations (multi-word expressions), grouped by word count so that
# find_collocations() can try the longest phrases first.  Rows after a
# '---END---' sentinel are ignored.  The file is opened with 'with' so the
# handle is always closed.
collocations_dict = collections.defaultdict(list)
with open('collocations.csv', 'r', encoding='utf_8') as collocations_file:
    collocations_csv = csv.reader(collocations_file, delimiter=',', quotechar='"')
    for row in collocations_csv:
        if row[0].strip() == '---END---':
            break
        try:
            # BUGFIX: this assignment was commented out, leaving
            # 'collocation_phrase' undefined, so every row raised a NameError
            # (caught and printed below) and the dict stayed empty.  The
            # commented line called remove_accents(row[0]), but that helper is
            # defined later in the file and would not exist yet at load time;
            # plain stripped text is used instead -- confirm whether accent
            # stripping is actually wanted here.
            collocation_phrase = row[0].strip()
            collocation_phrase_len = len(collocation_phrase.split(' '))
            collocations_dict[collocation_phrase_len].append(collocation_phrase)
        except Exception as e:
            print(e)
            print(row)

# Phrase sizes in descending order: longest collocations are matched first.
collocations_keys = sorted(collocations_dict.keys(), reverse=True)
for k, v in collocations_dict.items():
    print('{:,} collocations com {} palavras'.format(len(v), k))
def find_collocations(text):
    """Replace each known collocation in *text* by its underscore-joined form
    (e.g. 'bom dia' -> 'bom_dia'), trying the longest phrases first."""
    for phrase_len in collocations_keys:
        for phrase in collocations_dict[phrase_len]:
            if phrase not in text:
                continue
            joined = phrase.replace(' ', '_')
            text = text.replace(phrase, joined)
    return text
######################## | |
# Currency amounts: optional 'R'/'r', '$', optional space, 1-6 digits and an
# optional decimal part.
# BUGFIX: the separator was '(.|,)' -- the unescaped '.' matched ANY
# character (e.g. 'R$100x5' was swallowed); '([.,])' matches only a literal
# '.' or ','.
RE_REAIS = re.compile(r'(R|r)?\$\s?[0-9]{1,6}(([.,])[0-9]{1,2})?')
# URLs: optional scheme followed by a literal 'www.' prefix.
# BUGFIX: 'www.' had an unescaped '.', so strings like 'wwwX...' matched too.
RE_URL = re.compile(r'(http[s]?://)?www\.([a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Bare measurement: a 1-3 digit number with an optional metric unit.
# NOTE: the unit is optional, so this also matches any plain 1-3 digit number.
RE_TAMANHO1 = re.compile(r'[0-9]{1,3}\s*(m|cm|mm)?')
# Dimensions without units: '10x20' or '10x20x30'.
RE_TAMANHO2 = re.compile(r'[0-9]{1,3}\s?x\s?[0-9]{1,3}(x\s?[0-9]{1,3})?')
# Dimensions with optional units: '10cm x 20cm', '1m x 2m x 3m', etc.
RE_TAMANHO3 = re.compile(r'[0-9]{1,3}\s?(m|cm|mm)?\s?x\s?[0-9]{1,3}\s?(m|cm|mm)?(\s?x\s?[0-9]{1,3})?\s?(m|cm|mm)?')
# Runs of punctuation/symbol characters to be blanked out.
RE_SIMBOLOS = re.compile(r'[\"\#\$\%\&\'\(\)\*\+\-\/\<\=\>\@\[\\\]\^\`\{\|\}\~\£¡§µ\,\.\!\?\:\;]+')
#RE_PONTUACAO = re.compile(r'[\,\.\!\?\:\;]')
# A single character standing alone between spaces (or the string edges).
RE_CARACTERES_UNICOS = re.compile(r'(^| ).( |$)')
import unicodedata | |
@lru_cache(maxsize=1024)
def remove_accents(text):
    """Strip diacritics: decompose to NFKD, then drop every non-ASCII byte
    (e.g. 'ação' -> 'acao').  Cached because the same tokens recur often."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('ASCII')
@lru_cache(maxsize=1024)
def limpa_regexp(text):
    """Blank out URLs, currency amounts, dimensions, lone characters and
    symbol runs from *text*, replacing each match with a single space.

    The order matters: the most specific size patterns (3D before 2D before
    bare numbers) must run first so they are not partially consumed.
    """
    cleanup_order = (
        RE_URL,
        RE_REAIS,
        RE_TAMANHO3,
        RE_TAMANHO2,
        RE_TAMANHO1,
        RE_CARACTERES_UNICOS,
        RE_SIMBOLOS,
    )
    for pattern in cleanup_order:
        text = pattern.sub(' ', text)
    return text
# Tokens to discard outright (currently empty; fill in as needed).
BLACKLIST = []

def remove_blacklist(tokens):
    """Drop blacklisted tokens and strip whitespace from the survivors."""
    kept = []
    for token in tokens:
        if token in BLACKLIST:
            continue
        kept.append(token.strip())
    return kept
def remove_stop_words(tokens):
    """Strip each token and keep only those not in the Portuguese stopword
    list loaded at module level."""
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token.strip())
    return kept
def clean_text_tokens(text):
    """Full cleaning pipeline: lowercase, blank out regex patterns, correct
    misspellings, strip plurals, join collocations, tokenize, and drop
    stopwords, blacklisted tokens and empty tokens.

    Returns the cleaned text as a list of tokens.
    """
    text = text.lower()
    # BUGFIX: the original called clean_regexps(), which is not defined
    # anywhere in this file; limpa_regexp() is the regex-cleanup helper
    # defined above and was otherwise unused -- it is clearly the intended
    # callee.
    text = limpa_regexp(text)
    text, _ = misspelling_string(text)
    # NOTE(review): remove_plural and tokenizer are not defined anywhere in
    # this file -- presumably supplied by another module; confirm they are in
    # scope before running.
    text = remove_plural(text)
    text = find_collocations(text)
    tokens = tokenizer(text, ' ')
    tokens = remove_stop_words(tokens)
    tokens = remove_blacklist(tokens)
    # Drop empty tokens left over from stripping.
    tokens = [t for t in tokens if t]
    return tokens
def clean_text(text):
    """Run the full cleaning pipeline and return the result as a single
    space-joined, trimmed string."""
    cleaned_tokens = clean_text_tokens(text)
    return ' '.join(cleaned_tokens).strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment