Created
February 21, 2018 22:43
-
-
Save soscler/6190282c563ace9fda5a7cb3128b7063 to your computer and use it in GitHub Desktop.
doc2vec
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from source import cosineFile as cosine | |
import os | |
# Not referenced by any code visible in this script; presumably the "k" of a
# k-nearest-neighbour vote -- TODO confirm before relying on it.
k = 1
# Directory whose entries categorize() enumerates with os.listdir().
repoTrain = "/home/chriss/Desktop/Semestre6/tal/tp3/TP02-textclassif/train/"
def categorize(A):
    """Score document *A* against every entry of the training directory.

    Each similarity is printed (as before) and also collected, so callers
    can pick the best match instead of parsing stdout.

    Args:
        A: the test document, passed through unchanged as the first
            argument of ``cosine``.

    Returns:
        dict mapping each training file name to the value ``cosine``
        returned for it.
    """
    # NOTE(review): at the top of this script `cosine` is bound by
    # `from source import cosineFile as cosine`. If `cosineFile` is a module
    # rather than a callable, the call below raises TypeError -- confirm what
    # that name refers to and whether it expects paths or term vectors.
    tableau = {}
    for filename in os.listdir(repoTrain):
        # os.listdir() yields bare names; join them with the directory so the
        # path is valid whatever the current working directory is.
        score = cosine(A, os.path.join(repoTrain, filename))
        print(str(score))
        tableau[filename] = score
    return tableau
# Script entry: score one hard-coded test document against the training set.
categorize("/home/chriss/Desktop/Semestre6/tal/tp3/TP02-textclassif/test/10.txt")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from source import doc2vec as dv | |
import math as m | |
# Placeholder paths of the two documents to compare.
pathA = "1rst file"
pathB = "second file"
# Term vectors built by dv.vec() for each document (computed at import time).
a = dv.vec(pathA)
b = dv.vec(pathB)
def scalaire(A, B):
    """Return the dot product of two sparse vectors stored as mappings.

    Only keys present in both *A* and *B* contribute. Each shared key is
    printed as it is found (debug trace kept from the original).

    Args:
        A: mapping key -> numeric weight.
        B: mapping key -> numeric weight.

    Returns:
        Sum of ``A[key] * B[key]`` over the shared keys; ``0`` if none.
    """
    ret = 0
    for occ, weight in A.items():
        if occ in B:
            print(occ)
            ret += weight * B[occ]
    return ret
def dist_eclu(A):
    """Return the Euclidean norm of a sparse vector stored as a mapping.

    Args:
        A: mapping key -> numeric weight.

    Returns:
        Square root of the sum of squared weights (``0.0`` when empty).
    """
    return m.sqrt(sum(pow(weight, 2) for weight in A.values()))
def cosine(A, B):
    """Return the cosine similarity of two sparse vectors.

    Args:
        A: mapping key -> numeric weight.
        B: mapping key -> numeric weight.

    Returns:
        ``scalaire(A, B) / (dist_eclu(A) * dist_eclu(B))``, or ``0.0`` when
        either vector has zero norm (e.g. an empty document), where the
        ratio is undefined and used to raise ZeroDivisionError.
    """
    denom = dist_eclu(A) * dist_eclu(B)
    if denom == 0:
        return 0.0
    return scalaire(A, B) / denom
# Print the similarity of the two module-level vectors when the file is run.
print(cosine(a, b))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from source import doc2vec as dv | |
import math as m | |
# Placeholder paths of the two documents to compare.
pathA = "1rst file"
pathB = "second file"
# Term vectors built by dv.vec() for each document (computed at import time).
a = dv.vec(pathA)
b = dv.vec(pathB)
def scalaire(A, B):
    """Return the dot product of two sparse vectors stored as mappings.

    Args:
        A: mapping key -> numeric weight.
        B: mapping key -> numeric weight.

    Returns:
        Sum of ``A[key] * B[key]`` over keys present in both mappings;
        ``0`` when they share no key.
    """
    return sum(weight * B[occ] for occ, weight in A.items() if occ in B)
def dist_eclu(A):
    """Return the Euclidean norm of a sparse vector stored as a mapping.

    Args:
        A: mapping key -> numeric weight.

    Returns:
        Square root of the sum of squared weights (``0.0`` when empty).
    """
    squares = (pow(weight, 2) for weight in A.values())
    return m.sqrt(sum(squares))
def cosine(A, B):
    """Return the cosine similarity of two sparse vectors.

    Args:
        A: mapping key -> numeric weight.
        B: mapping key -> numeric weight.

    Returns:
        The dot product divided by the product of the two norms, or ``0.0``
        when either norm is zero (the ratio is undefined there and used to
        raise ZeroDivisionError).
    """
    norm_product = dist_eclu(A) * dist_eclu(B)
    if norm_product == 0:
        return 0.0
    return scalaire(A, B) / norm_product
# print(cosine(a, b))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: UTF8 -*- | |
import os | |
import sys | |
import operator | |
import codecs | |
# Path of the stop-word list (one word per line) read by vec(); placeholder value.
pathStop = "stop word file"
def vec(file):
    """Build the term-frequency vector of a text file, minus stop words.

    The text is split on single spaces only, and the stop-word list
    (``pathStop``) is split on newlines and re-read on every call.

    Args:
        file: path of the document to vectorise.

    Returns:
        dict mapping each remaining token to its number of occurrences,
        with keys inserted in sorted token order.
    """
    # Both files are opened with the platform default encoding.
    # NOTE(review): the original author flagged encoding as a concern here;
    # consider passing an explicit encoding once the corpus encoding is known.
    with open(pathStop, "r") as stop_file:
        stop_words = set(stop_file.read().split("\n"))
    with open(file, "r") as current_file:
        tokens = current_file.read().split(" ")

    # NOTE(review): newlines and tabs are not separators, so "end\nstart"
    # counts as one token -- confirm whether split() on any whitespace is wanted.
    counts = {}
    # Iterating in sorted order keeps the dict's insertion order identical to
    # the original sort-then-count implementation, in one pass instead of
    # calling list.count() for every token.
    for token in sorted(tokens):
        counts[token] = counts.get(token, 0) + 1

    return {token: n for token, n in counts.items() if token not in stop_words}
# Sort by number of occurrences.
def sortedVec(A):
    """Return the items of *A* ordered by decreasing value.

    Args:
        A: mapping token -> occurrence count.

    Returns:
        list of ``(token, count)`` tuples, highest count first; entries with
        equal counts keep the order they have in *A* (the sort is stable).
    """
    return sorted(A.items(), key=operator.itemgetter(1), reverse=True)
# Generate the vector representation of every file in the training directory.
# Flagged by the original author for review: codec and permission problems;
# the rest was believed to be correct.
def write_vec(file, repo="Train repository"):
    """Write one ``.vec`` file per document found in *repo*.

    Each output file lists ``token<TAB>count`` lines, most frequent token
    first, and is written as UTF-8 into the directory ``file + "vec"``.

    Args:
        file: prefix of the output location; the directory ``file + "vec"``
            is created if it does not exist yet.
        repo: directory containing the source documents. Defaults to the
            placeholder value that used to be hard-coded in the body.
    """
    out_dir = file + "vec"
    try:
        os.mkdir(out_dir, 0o777)
    except FileExistsError:
        print("dossier existant")

    for filename in os.listdir(repo):
        # str.strip('.txt') removes any of the characters '.', 't', 'x' from
        # BOTH ends ("text.txt" -> "e"); drop only a trailing ".txt" instead.
        stem = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        vecName = os.path.join(out_dir, stem + ".vec")
        print(vecName)
        try:
            # Reading happens inside the try so that the decode handler below
            # can actually fire; os.path.join supplies the separator that
            # plain concatenation of repo and filename was missing.
            dv_sorted = sortedVec(vec(os.path.join(repo, filename)))
            with codecs.open(vecName, 'w', 'utf-8') as dvFile:
                for term, count in dv_sorted:
                    dvFile.write(str(term) + "\t" + str(count) + "\n")
        except PermissionError:
            print('permission denied')
        except UnicodeDecodeError:
            # Report the skipped document instead of dropping it silently.
            print("decode error, skipped: " + filename)
# Example usage:
#   a = vec("file path")
#   write_vec("resource repos, where to write .vec files")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment