Data cleaning
import csv
import re
import collections
import unicodedata
from collections import Counter, defaultdict
from functools import lru_cache

import gensim
import nltk
from nltk.corpus import stopwords
# from tqdm import tqdm_notebook as tqdm

# E-mail pattern (kept for reference; not used by the pipeline below)
EMAIL_REGEXP = r'^[_A-Za-z0-9+-]+(\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\.[A-Za-z0-9]+)*(\.[A-Za-z]{2,3})$'

STOPWORDS = stopwords.words('portuguese')  # requires nltk.download('stopwords')
########################
def remove_double_spaces(text):
    return " ".join(text.split())
########################
# Load the misspelling dictionary: a pipe-separated file whose first column
# is the correct word and whose remaining columns are known misspellings.
misspelling_dict = {}
with open('misspelling.psv', 'r', encoding='utf-8') as f:
    misspelling_csv = csv.reader(f, delimiter='|', quotechar='"')
    for row in misspelling_csv:
        try:
            correct_word = row[0].strip()
            misspelled_words = [i.strip() for i in row[1:] if i.strip() != '']
            for w in misspelled_words:
                if len(w) > 0 and len(correct_word) > 0 and w != correct_word:
                    misspelling_dict[w] = correct_word
                else:
                    print(row)
        except Exception as e:
            print(e)
            print(row)
# TODO: benchmark a {set(misspelled terms): correct word} layout instead
def misspelling_word(word):
    # Returns (corrected word, 1) on a hit, (original word, 0) otherwise.
    if word in misspelling_dict:
        return misspelling_dict[word], 1
    else:
        return word, 0

def misspelling_words(words):
    result = []
    count = 0
    for word in words:
        w_correct, c = misspelling_word(word)
        count += c
        result.append(w_correct)
    return result, count

def misspelling_string(phrase):
    words = phrase.split(' ')
    tokens, count = misspelling_words(words)
    return " ".join(tokens), count

print('{:,} misspellings'.format(len(misspelling_dict)))
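# Usage sketch for the spelling helpers. The correction shown depends on the
# contents of misspelling.psv; 'vermeho' -> 'vermelho' is a hypothetical entry:
# misspelling_string('vestido vermeho')  ->  ('vestido vermelho', 1)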
########################
# Load collocations (multi-word expressions), grouped by word count so the
# longest phrases can be merged first.
collocations_dict = collections.defaultdict(list)
with open('collocations.csv', 'r', encoding='utf_8') as f:
    collocations_csv = csv.reader(f, delimiter=',', quotechar='"')
    for row in collocations_csv:
        if row[0].strip() == '---END---':
            break
        try:
            collocation_phrase = row[0].strip()  # or: remove_accents(row[0]), defined below
            collocation_phrase_len = len(collocation_phrase.split(' '))
            collocations_dict[collocation_phrase_len].append(collocation_phrase)
        except Exception as e:
            print(e)
            print(row)

collocations_keys = sorted(collocations_dict.keys(), reverse=True)
for k, v in collocations_dict.items():
    print('{:,} collocations with {} words'.format(len(v), k))
def find_collocations(text):
    # Merge each known collocation into a single underscore-joined token,
    # longest phrases first so sub-phrases don't break them apart.
    for size in collocations_keys:
        for coll in collocations_dict[size]:
            if coll in text:
                text = text.replace(coll, coll.replace(' ', '_'))
    return text
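# Usage sketch; 'bolsa termica' stands in for a hypothetical two-word entry
# in collocations.csv:
# find_collocations('bolsa termica rosa')  ->  'bolsa_termica rosa'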
########################
# Patterns for prices in reais, URLs, dimensions (with optional m/cm/mm
# units), leftover symbols, and single stray characters.
RE_REAIS = re.compile(r'(R|r)?\$\s?[0-9]{1,6}([.,][0-9]{1,2})?')
RE_URL = re.compile(r'(http[s]?://)?www\.([a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
RE_TAMANHO1 = re.compile(r'[0-9]{1,3}\s*(m|cm|mm)?')
RE_TAMANHO2 = re.compile(r'[0-9]{1,3}\s?x\s?[0-9]{1,3}(x\s?[0-9]{1,3})?')
RE_TAMANHO3 = re.compile(r'[0-9]{1,3}\s?(m|cm|mm)?\s?x\s?[0-9]{1,3}\s?(m|cm|mm)?(\s?x\s?[0-9]{1,3})?\s?(m|cm|mm)?')
RE_SIMBOLOS = re.compile(r'[\"\#\$\%\&\'\(\)\*\+\-\/\<\=\>\@\[\\\]\^\`\{\|\}\~\£¡§µ\,\.\!\?\:\;]+')
# RE_PONTUACAO = re.compile(r'[\,\.\!\?\:\;]')
RE_CARACTERES_UNICOS = re.compile(r'(^| ).( |$)')
@lru_cache(maxsize=1024)
def remove_accents(text):
    # Decompose accented characters (NFKD) and drop the combining marks.
    nfkd_form = unicodedata.normalize('NFKD', text)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode('ASCII')
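# Quick check:
print(remove_accents('calça jeans tamanho médio'))  # -> 'calca jeans tamanho medio'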
@lru_cache(maxsize=1024)
def limpa_regexp(text):
    # Order matters: URLs and prices first, then the size patterns from most
    # to least specific, then stray single characters and symbols.
    text = RE_URL.sub(' ', text)
    text = RE_REAIS.sub(' ', text)
    text = RE_TAMANHO3.sub(' ', text)
    text = RE_TAMANHO2.sub(' ', text)
    text = RE_TAMANHO1.sub(' ', text)
    text = RE_CARACTERES_UNICOS.sub(' ', text)
    text = RE_SIMBOLOS.sub(' ', text)
    return text
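# Quick check: the price and the dimensions collapse into whitespace.
print(limpa_regexp('tapete 50x70 cm por r$ 39,90'))  # -> roughly 'tapete por'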
BLACKLIST = []

def remove_blacklist(tokens):
    return [x.strip() for x in tokens if x not in BLACKLIST]

def remove_stop_words(tokens):
    return [x.strip() for x in tokens if x not in STOPWORDS]
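# clean_text_tokens below calls remove_plural, which this gist never defines.
# A minimal stand-in sketch, assuming NLTK's Portuguese RSLP stemmer is
# acceptable (requires nltk.download('rslp')); it stems more than plurals,
# so swap in a dedicated de-pluralizer if you have one.
from nltk.stem import RSLPStemmer

_stemmer = RSLPStemmer()

def remove_plural(text):
    # Stem each whitespace-separated token, leaving empty tokens untouched.
    return ' '.join(_stemmer.stem(w) if w else w for w in text.split(' '))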
def clean_text_tokens(text):
    # Full pipeline: lowercase, regex cleanup, spelling fixes, de-pluralize,
    # merge collocations, then tokenize and filter.
    text = text.lower()
    text = limpa_regexp(text)
    text, _ = misspelling_string(text)
    text = remove_plural(text)
    text = find_collocations(text)
    # text = remove_accents(text)
    # tokenize on single spaces
    tokens = text.split(' ')
    tokens = remove_stop_words(tokens)
    tokens = remove_blacklist(tokens)
    # drop blank tokens
    tokens = [t for t in tokens if t]
    return tokens
def clean_text(text):
    tokens = clean_text_tokens(text)
    text = ' '.join(tokens)
    return text.strip()
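# End-to-end sketch. The exact output depends on misspelling.psv,
# collocations.csv and the remove_plural stand-in above, so treat the
# result as illustrative only:
print(clean_text('Tapete 50x70 cm por R$ 39,90 para a sala'))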