Last active
July 17, 2017 22:22
-
-
Save pedrotnascimento/d83e667d07892b5e7059c9b97399dd86 to your computer and use it in GitHub Desktop.
# o contexto é limitar o escopo de busca para verificar se existem instâncias da coluna 1 que tem o mesmo significado que a coluna2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding:utf-8 -*- | |
# Checks whether any word of any instance in column 1 is contained in any word of any instance in column 2;
# this gives an indicator of a possible correlation between the instances.
# The purpose is to narrow the search scope when checking whether instances of column 1 have the same meaning as column 2.
import re | |
def compile_stopwords_to_regex(arr):
    """Compile a list of stop words into one whole-word alternation regex.

    Each word is escaped, so regex metacharacters in a stop word are matched
    literally. The alternation is wrapped in lookarounds so that a stop word
    only matches when it is not glued to other word characters (for example
    "A" no longer matches inside "CASA").

    arr -- iterable of stop-word strings; empty strings are ignored.

    Returns a compiled pattern. When no usable word is given, the returned
    pattern never matches anything.
    """
    escaped = [re.escape(word) for word in arr if word]
    if not escaped:
        # An empty alternation would match at every position; use a pattern
        # that can never succeed instead.
        return re.compile(r"(?!)")
    orRegexSign = "|"
    # (?<!\w) / (?!\w) rather than \b so that stop words which start or end
    # with a non-word character still get a sensible boundary check.
    return re.compile(r"(?<!\w)(?:" + orRegexSign.join(escaped) + r")(?!\w)")
def retiraLatin(string):
    """Replace accented Latin-1 characters with unaccented uppercase letters.

    Every character found in the replacement table is substituted; ordinal
    and degree signs and the acute accent are deleted. Characters not in the
    table (including unaccented lowercase letters) are returned unchanged.

    string -- text to normalize, one Latin-1 character per element.

    Returns the normalized string.
    """
    cuts = {
        # accented capitals
        '\xc0': 'A', '\xc1': 'A', '\xc2': 'A', '\xc3': 'A',
        '\xc7': 'C',
        '\xc8': 'E', '\xc9': 'E', '\xca': 'E',
        '\xcd': 'I',
        '\xd2': 'O', '\xd3': 'O', '\xd4': 'O', '\xd5': 'O',
        '\xda': 'U', '\xdc': 'U',
        # accented lowercase letters, folded to the same uppercase base
        # letter so that tokens compare equal regardless of accent
        '\xe0': 'A', '\xe1': 'A', '\xe2': 'A', '\xe3': 'A',
        '\xe7': 'C',
        '\xe8': 'E', '\xe9': 'E', '\xea': 'E',
        '\xed': 'I',
        '\xf2': 'O', '\xf3': 'O', '\xf4': 'O', '\xf5': 'O',
        '\xfa': 'U', '\xfc': 'U',
        # degree sign, ordinal indicators and acute accent are dropped
        '\xb0': '', '\xba': '', '\xaa': '', '\xb4': '',
    }
    return ''.join([cuts.get(char, char) for char in string])
def cleanStopWord(data):
    """Remove stop words from a string.

    The stop-word pattern is built by compile_stopwords_to_regex; every match
    of that pattern is deleted together with one space directly following
    it, if there is one.

    data -- the text to clean.

    Returns the cleaned text.
    """
    stopWords = [
        "AO",
        "DO",
        "DA",
        "DAS",
        "DE",
        "COM",
        "COMA",
        "O",
        "A",
        "E",  # the comma matters: without it "E" and "POR" fuse into "EPOR"
        "POR",
        "NO",
        "PELA",
        "DOS",
        "Nº",
    ]
    stopWordRe = compile_stopwords_to_regex(stopWords)
    # Delete "match + optional single space" in one pass instead of going
    # through a "$" placeholder, which also removed any "$" already in data.
    return re.sub("(?:" + stopWordRe.pattern + ") ?", "", data)
def pre_proc(string):
    """Normalize a string and split it into tokens without stop words.

    The string is passed through retiraLatin, slashes, backslashes and CRLF
    sequences are turned into spaces, the result is split on whitespace and
    tokens that are stop words are dropped.

    string -- the raw text of one instance.

    Returns the list of remaining tokens, in their original order.
    """
    stopWords = frozenset((
        "AO",
        "DO",
        "DA",
        "DAS",
        "DE",
        "COM",
        "COMA",
        "O",
        "A",
        "E",
        "POR",
        "NO",
        "PELA",
        "DOS",
        "Nº",
        # retiraLatin deletes the ordinal sign before the tokens are
        # filtered, so "Nº" reaches this point as a bare "N".
        "N",
    ))
    string = retiraLatin(string)
    string = re.sub(r"[/\\]|\r\n", " ", string)
    return [token for token in string.split() if token not in stopWords]
# Load both input columns, normalize accents and split them into one entry
# per CRLF-terminated line.
# NOTE(review): the data is read in binary mode and then handled with plain
# string literals, which looks like Python 2 byte-string usage - confirm the
# target interpreter before porting.
with open("coluna1.csv", "rb") as f1:
    x1 = f1.read()
with open("coluna2.csv", "rb") as f2:
    x2 = f2.read()
# The output file stays open: it is written to by the matching loop below.
fout = open("termos_relatos.csv", "wb")
x1 = retiraLatin(x1)
x2 = retiraLatin(x2)
x1_lis = x1.split("\r\n")
x2_lis = x2.split("\r\n")
def check_pair(str1, str2):
    """Tell whether two strings have at least one token in common.

    Both strings are tokenized with pre_proc; the result is True when some
    token of str1 equals some token of str2, and False otherwise.
    """
    tokens_a = pre_proc(str1)
    tokens_b = pre_proc(str2)
    return any(a == b for a in tokens_a for b in tokens_b)
# For every entry of column 1, write the first entry of column 2 that shares
# a token with it as an "entry1,entry2" line; entries without any match
# produce no output line.
try:
    for i in x1_lis:
        for j in x2_lis:
            if check_pair(i, j):
                fout.write(i + "," + j + "\r\n")
                break
finally:
    # Close the output file even if matching fails, so that what was
    # already written is flushed to disk.
    fout.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment