Last active
April 7, 2020 17:59
-
-
Save ftfarias/2a5b6bfcebf6bf7d364034ff99dc1d13 to your computer and use it in GitHub Desktop.
Data cleaning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
import re | |
# Pragmatic e-mail validator: local part (with optional dotted segments),
# '@', domain labels, and a final 2-3 letter TLD.
# BUGFIX: the domain class was '[A-Za-z0-]' (the '9' was truncated, so only
# the digit 0 was allowed); dashes are moved to the end of the classes so
# they are unambiguously literal, and a raw string avoids escape warnings.
EMAIL_REGEXP = r'^[_A-Za-z0-9+-]+(\.[_A-Za-z0-9-]+)*@[A-Za-z0-9-]+(\.[A-Za-z0-9]+)*(\.[A-Za-z]{2,3})$'
#from tqdm import tqdm_notebook as tqdm | |
import gensim | |
import collections | |
import nltk | |
from nltk.corpus import stopwords | |
from collections import Counter, defaultdict | |
STOPWORDS = stopwords.words('portuguese') | |
from functools import lru_cache | |
######################## | |
def remove_double_spaces(text):
    """Collapse any run of whitespace into single spaces and trim the ends."""
    parts = text.split()
    return ' '.join(parts)
######################## | |
# Load the misspelling table: each pipe-separated row is
# "correct_word|variant1|variant2|...".  Builds a variant -> correct_word map.
# FIX: the file handle was opened inline and never closed; a 'with' block
# guarantees it is released.
misspelling_dict = {}
with open('misspelling.psv', 'r', encoding='utf-8') as misspelling_file:
    misspelling_csv = csv.reader(misspelling_file, delimiter='|', quotechar='"')
    for row in misspelling_csv:
        try:
            correct_word = row[0].strip()
            misspelled_words = [i.strip() for i in row[1:] if i.strip() != '']
            for w in misspelled_words:
                if len(w) > 0 and len(correct_word) > 0 and w != correct_word:
                    misspelling_dict[w] = correct_word
                else:
                    # Empty or self-referential entry: report it for cleanup.
                    print(row)
        except Exception as e:
            # Malformed row: log it and keep loading the rest of the table.
            print(e)
            print(row)
# TODO: benchmark an alternative layout, e.g. {frozenset(misspelled_terms): correct_word}
def misspelling_word(word):
    """Return ``(corrected_word, 1)`` when *word* is a known misspelling,
    otherwise ``(word, 0)``."""
    corrected = misspelling_dict.get(word)
    if corrected is None:
        return word, 0
    return corrected, 1
def misspelling_words(words):
    """Correct every word in *words*.

    Returns ``(corrected_list, n_corrections)`` where ``n_corrections`` is
    how many words were replaced.
    """
    pairs = [misspelling_word(w) for w in words]
    corrected = [w for w, _ in pairs]
    n_fixed = sum(flag for _, flag in pairs)
    return corrected, n_fixed
def misspelling_string(phrase):
    """Correct misspellings in a space-separated phrase.

    Returns ``(corrected_phrase, n_corrections)``.  Splits on single spaces,
    so consecutive spaces yield empty tokens that survive the rejoin.
    """
    corrected_tokens, n_fixed = misspelling_words(phrase.split(' '))
    corrected_phrase = ' '.join(corrected_tokens)
    return corrected_phrase, n_fixed
# Report how many misspelling variants were loaded.
total_misspellings = len(misspelling_dict)
print('{:,} misspelling'.format(total_misspellings))
######################## | |
# Load collocations (multi-word expressions), grouped by word count so that
# find_collocations() can try the longest phrases first.  Rows after a
# '---END---' sentinel are ignored.  The file is opened with 'with' so the
# handle is always closed.
collocations_dict = collections.defaultdict(list)
with open('collocations.csv', 'r', encoding='utf_8') as collocations_file:
    collocations_csv = csv.reader(collocations_file, delimiter=',', quotechar='"')
    for row in collocations_csv:
        if row[0].strip() == '---END---':
            break
        try:
            # BUGFIX: this assignment was commented out, leaving
            # 'collocation_phrase' undefined, so every row raised a NameError
            # (caught and printed below) and the dict stayed empty.  The
            # commented line called remove_accents(row[0]), but that helper is
            # defined later in the file and would not exist yet at load time;
            # plain stripped text is used instead -- confirm whether accent
            # stripping is actually wanted here.
            collocation_phrase = row[0].strip()
            collocation_phrase_len = len(collocation_phrase.split(' '))
            collocations_dict[collocation_phrase_len].append(collocation_phrase)
        except Exception as e:
            print(e)
            print(row)

# Phrase sizes in descending order: longest collocations are matched first.
collocations_keys = sorted(collocations_dict.keys(), reverse=True)
for k, v in collocations_dict.items():
    print('{:,} collocations com {} palavras'.format(len(v), k))
def find_collocations(text):
    """Replace each known collocation in *text* by its underscore-joined form
    (e.g. 'bom dia' -> 'bom_dia'), trying the longest phrases first."""
    for phrase_len in collocations_keys:
        for phrase in collocations_dict[phrase_len]:
            if phrase not in text:
                continue
            joined = phrase.replace(' ', '_')
            text = text.replace(phrase, joined)
    return text
######################## | |
# Currency amounts: optional 'R'/'r', '$', optional space, 1-6 digits and an
# optional decimal part.
# BUGFIX: the separator was '(.|,)' -- the unescaped '.' matched ANY
# character (e.g. 'R$100x5' was swallowed); '([.,])' matches only a literal
# '.' or ','.
RE_REAIS = re.compile(r'(R|r)?\$\s?[0-9]{1,6}(([.,])[0-9]{1,2})?')
# URLs: optional scheme followed by a literal 'www.' prefix.
# BUGFIX: 'www.' had an unescaped '.', so strings like 'wwwX...' matched too.
RE_URL = re.compile(r'(http[s]?://)?www\.([a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
# Bare measurement: a 1-3 digit number with an optional metric unit.
# NOTE: the unit is optional, so this also matches any plain 1-3 digit number.
RE_TAMANHO1 = re.compile(r'[0-9]{1,3}\s*(m|cm|mm)?')
# Dimensions without units: '10x20' or '10x20x30'.
RE_TAMANHO2 = re.compile(r'[0-9]{1,3}\s?x\s?[0-9]{1,3}(x\s?[0-9]{1,3})?')
# Dimensions with optional units: '10cm x 20cm', '1m x 2m x 3m', etc.
RE_TAMANHO3 = re.compile(r'[0-9]{1,3}\s?(m|cm|mm)?\s?x\s?[0-9]{1,3}\s?(m|cm|mm)?(\s?x\s?[0-9]{1,3})?\s?(m|cm|mm)?')
# Runs of punctuation/symbol characters to be blanked out.
RE_SIMBOLOS = re.compile(r'[\"\#\$\%\&\'\(\)\*\+\-\/\<\=\>\@\[\\\]\^\`\{\|\}\~\£¡§µ\,\.\!\?\:\;]+')
#RE_PONTUACAO = re.compile(r'[\,\.\!\?\:\;]')
# A single character standing alone between spaces (or the string edges).
RE_CARACTERES_UNICOS = re.compile(r'(^| ).( |$)')
import unicodedata | |
@lru_cache(maxsize=1024)
def remove_accents(text):
    """Strip diacritics: decompose to NFKD, then drop every non-ASCII byte
    (e.g. 'ação' -> 'acao').  Cached because the same tokens recur often."""
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ASCII', 'ignore').decode('ASCII')
@lru_cache(maxsize=1024)
def limpa_regexp(text):
    """Blank out URLs, currency amounts, dimensions, lone characters and
    symbol runs from *text*, replacing each match with a single space.

    The order matters: the most specific size patterns (3D before 2D before
    bare numbers) must run first so they are not partially consumed.
    """
    cleanup_order = (
        RE_URL,
        RE_REAIS,
        RE_TAMANHO3,
        RE_TAMANHO2,
        RE_TAMANHO1,
        RE_CARACTERES_UNICOS,
        RE_SIMBOLOS,
    )
    for pattern in cleanup_order:
        text = pattern.sub(' ', text)
    return text
# Tokens to discard outright (currently empty; fill in as needed).
BLACKLIST = []

def remove_blacklist(tokens):
    """Drop blacklisted tokens and strip whitespace from the survivors."""
    kept = []
    for token in tokens:
        if token in BLACKLIST:
            continue
        kept.append(token.strip())
    return kept
def remove_stop_words(tokens):
    """Strip each token and keep only those not in the Portuguese stopword
    list loaded at module level."""
    kept = []
    for token in tokens:
        if token in STOPWORDS:
            continue
        kept.append(token.strip())
    return kept
def clean_text_tokens(text):
    """Full cleaning pipeline: lowercase, blank out regex patterns, correct
    misspellings, strip plurals, join collocations, tokenize, and drop
    stopwords, blacklisted tokens and empty tokens.

    Returns the cleaned text as a list of tokens.
    """
    text = text.lower()
    # BUGFIX: the original called clean_regexps(), which is not defined
    # anywhere in this file; limpa_regexp() is the regex-cleanup helper
    # defined above and was otherwise unused -- it is clearly the intended
    # callee.
    text = limpa_regexp(text)
    text, _ = misspelling_string(text)
    # NOTE(review): remove_plural and tokenizer are not defined anywhere in
    # this file -- presumably supplied by another module; confirm they are in
    # scope before running.
    text = remove_plural(text)
    text = find_collocations(text)
    tokens = tokenizer(text, ' ')
    tokens = remove_stop_words(tokens)
    tokens = remove_blacklist(tokens)
    # Drop empty tokens left over from stripping.
    tokens = [t for t in tokens if t]
    return tokens
def clean_text(text):
    """Run the full cleaning pipeline and return the result as a single
    space-joined, trimmed string."""
    cleaned_tokens = clean_text_tokens(text)
    return ' '.join(cleaned_tokens).strip()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment