Skip to content

Instantly share code, notes, and snippets.

@pedrotnascimento
Last active July 17, 2017 22:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pedrotnascimento/d83e667d07892b5e7059c9b97399dd86 to your computer and use it in GitHub Desktop.
Save pedrotnascimento/d83e667d07892b5e7059c9b97399dd86 to your computer and use it in GitHub Desktop.
# o contexto é limitar o escopo de busca para verificar se existem instâncias da coluna 1 que tem o mesmo significado que a coluna2
# -*- coding:utf-8 -*-
# Checks whether any word of an entry in column 1 also appears as a word of an entry in column 2,
# which gives an indicator of a possible correlation between the two entries.
# The goal is to narrow the search scope when checking whether entries of column 1 mean the same as entries of column 2.
import re
def compile_stopwords_to_regex(arr):
    """Compile a list of stop words into one regex matching them as whole words.

    Each word is taken literally (regex metacharacters are escaped) and only
    matches when it is not glued to other word characters, so the stop word
    "A" does not match the letters inside "CASA".

    Args:
        arr: iterable of stop-word strings.

    Returns:
        A compiled pattern matching any of the given words. When ``arr``
        holds no usable word the pattern never matches anything.
    """
    escaped_words = [re.escape(word) for word in arr if word]
    if not escaped_words:
        # An empty alternation would match the empty string at every
        # position; return a pattern that cannot match instead.
        return re.compile(r"(?!)")
    # Look-arounds are used instead of \b so that words starting or ending
    # with a non-word character (e.g. "C++") are still delimited correctly.
    return re.compile(r"(?<!\w)(?:" + "|".join(escaped_words) + r")(?!\w)")
def retiraLatin(string):
    """Replace accented Latin-1 characters with plain ASCII equivalents.

    Accented letters (upper or lower case) become the matching unaccented
    UPPER-case letter; the degree sign, the ordinal indicators and the
    stand-alone acute accent are removed. Every other character is returned
    unchanged, including its case.

    Args:
        string: text whose elements are one-character strings.

    Returns:
        A new string with the replacements applied.
    """
    # Keys are Latin-1 code points written as escapes:
    # upper case  A-acute E-acute I-acute O-acute U-acute E-circ A-circ
    #             O-circ A-tilde O-tilde C-cedilla A-grave U-umlaut O-grave
    #             E-grave
    # lower case  a-acute e-acute i-acute o-acute u-acute a-circ e-circ
    #             o-circ a-tilde o-tilde c-cedilla
    # removed     degree sign, masculine/feminine ordinal, acute accent
    cuts = {
        '\xc1': 'A', '\xc9': 'E', '\xcd': 'I', '\xd3': 'O', '\xda': 'U',
        '\xca': 'E', '\xc2': 'A', '\xd4': 'O', '\xc3': 'A', '\xd5': 'O',
        '\xc7': 'C',
        '\xe1': 'A', '\xe9': 'E', '\xed': 'I', '\xf3': 'O',
        # u-acute used to map to lower-case "u", unlike every other entry.
        '\xfa': 'U',
        '\xe2': 'A', '\xea': 'E', '\xf4': 'O', '\xe3': 'A',
        # o-tilde was missing although its upper-case form was handled.
        '\xf5': 'O',
        '\xe7': 'C',
        '\xb0': '', '\xba': '', '\xaa': '', '\xb4': '',
        '\xc0': 'A', '\xdc': 'U', '\xd2': 'O', '\xc8': 'E',
    }
    return ''.join(cuts.get(char, char) for char in string)
def cleanStopWord(data):
    """Remove stop words from ``data``.

    Every stop-word match is deleted together with at most one space that
    immediately follows it. What counts as a match is decided by
    compile_stopwords_to_regex, which builds the pattern from the list below.

    Args:
        data: the text to clean.

    Returns:
        ``data`` with the stop words removed.
    """
    stopWords = [
        "AO",
        "DO",
        "DA",
        "DAS",
        "DE",
        "COM",
        "COMA",
        "O",
        "A",
        # The comma after "E" was missing, which silently merged it with the
        # next entry into the single word "EPOR".
        "E",
        "POR",
        "NO",
        "PELA",
        "DOS",
        "Nº",
    ]
    stopWordRe = compile_stopwords_to_regex(stopWords)
    # Delete the match and one optional trailing space in a single pass
    # instead of going through a "$" placeholder, which also erased any
    # literal "$" already present in the data.
    removal = re.compile("(?:" + stopWordRe.pattern + ") ?")
    return removal.sub("", data)
def pre_proc(string):
    """Normalize a raw entry and split it into its significant words.

    The text goes through retiraLatin, slashes, backslashes and CRLF pairs
    are turned into spaces, the result is split on whitespace and the stop
    words are dropped.

    Args:
        string: the raw text of one entry.

    Returns:
        A list with the remaining words, in their original order.
    """
    stopWords = {
        "AO",
        "DO",
        "DA",
        "DAS",
        "DE",
        "COM",
        "COMA",
        "O",
        "A",
        "E",
        "POR",
        "NO",
        "PELA",
        "DOS",
        "Nº",
        # retiraLatin removes the ordinal indicator before the filter runs,
        # so the abbreviation above reaches this point as a bare "N".
        "N",
    }
    string = retiraLatin(string)
    string = re.sub(r"[/\\]|\r\n", " ", string)
    return [word for word in string.split() if word not in stopWords]
# Load both input columns completely; the with-blocks make sure the input
# files are closed as soon as their content has been read.
# NOTE(review): the files are read in binary mode while retiraLatin looks up
# one-character strings -- presumably Latin-1 data handled as Python 2 byte
# strings; confirm before running on another interpreter.
with open("coluna1.csv", "rb") as f1:
    x1 = f1.read()
with open("coluna2.csv", "rb") as f2:
    x2 = f2.read()
# The output file stays open: it is written to further down in the script.
fout = open("termos_relatos.csv", "wb")
# Strip accents once on the whole content, then split into one entry per
# CRLF-terminated line.
x1 = retiraLatin(x1)
x2 = retiraLatin(x2)
x1_lis = x1.split("\r\n")
x2_lis = x2.split("\r\n")
def check_pair(str1, str2):
    """Tell whether two raw entries have at least one word in common.

    Both entries are tokenized with pre_proc; the words are compared for
    exact equality.

    Args:
        str1: raw text of the first entry.
        str2: raw text of the second entry.

    Returns:
        True if some word of ``str1`` equals some word of ``str2``,
        False otherwise (including when either entry has no words).
    """
    words_left = pre_proc(str1)
    words_right = pre_proc(str2)
    # "Share at least one element" is the same as "not disjoint".
    return not set(words_left).isdisjoint(words_right)
# For every entry of column 1, write the first entry of column 2 that shares
# a word with it, as one "entry1,entry2" line terminated by CRLF.
try:
    for i in x1_lis:
        for j in x2_lis:
            if check_pair(i, j):
                fout.write(i + "," + j + "\r\n")
                # Only the first match per column-1 entry is reported.
                break
finally:
    # The output file is opened at module level and was never closed, so
    # close it here even if the matching loop fails midway.
    fout.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment