Skip to content

Instantly share code, notes, and snippets.

@Sanix-Darker
Created June 6, 2020 01:07
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save Sanix-Darker/ed0d5ab0eb7ac41a174e5d97bb7682e1 to your computer and use it in GitHub Desktop.
Save Sanix-Darker/ed0d5ab0eb7ac41a174e5d97bb7682e1 to your computer and use it in GitHub Desktop.
[PYTHON]PLAGON.py
# ____ _ _ ____ ___ _ _
# | _ \| | / \ / ___|/ _ \| \ | |
# | |_) | | / _ \| | _| | | | \| |
# | __/| |___ / ___ \ |_| | |_| | |\ |
# |_| |_____/_/ \_\____|\___/|_| \_|
# --------------------------------------
from os import listdir as os_listdir, path as os_path
# pip install -U scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
def vectorize(Text):
    """Convert a collection of raw documents into dense TF-IDF vectors.

    A fresh ``TfidfVectorizer`` with default settings is fitted on ``Text``
    and the resulting matrix is converted to a dense array.

    Args:
        Text: Iterable of document strings.

    Returns:
        The dense array produced by ``toarray()`` on the fitted TF-IDF
        matrix (one row per input document).
    """
    tfidf = TfidfVectorizer()
    weighted_terms = tfidf.fit_transform(Text)
    return weighted_terms.toarray()
def similarity(doc1, doc2):
    """Return the pairwise cosine-similarity matrix for two document vectors.

    The two vectors are stacked into a two-row input for
    ``cosine_similarity``.  Callers read the similarity between ``doc1`` and
    ``doc2`` from the off-diagonal entry ``[0][1]`` of the result.

    Args:
        doc1: Feature vector of the first document.
        doc2: Feature vector of the second document.

    Returns:
        Whatever ``cosine_similarity`` returns for the two stacked vectors.
    """
    stacked = [doc1, doc2]
    return cosine_similarity(stacked)
def loop_comparaison(file_x, text_vector_x, new_vectors, results):
    """Score one document against a list of others and record each pair.

    Args:
        file_x: Name of the reference document.
        text_vector_x: Feature vector of the reference document.
        new_vectors: Iterable of ``(file_name, vector)`` pairs to compare
            against the reference document.
        results: Set that receives one ``(name_a, name_b, score)`` tuple per
            comparison; it is mutated in place.

    Returns:
        The same ``results`` set that was passed in.
    """
    for other_file, other_vector in new_vectors:
        matrix = similarity(text_vector_x, other_vector)
        # Entry [0][1] is the similarity between the first and second vector.
        pair_score = matrix[0][1]
        # Sort the names so that (x, y) and (y, x) produce the same tuple.
        low_name, high_name = sorted([file_x, other_file])
        results.add((low_name, high_name, pair_score))
    return results
def plagon_core(list_of_files, contents_of_files):
    """Score every pair of documents by TF-IDF cosine similarity.

    Args:
        list_of_files: Names (or paths) of the documents, in the same order
            as ``contents_of_files``.
        contents_of_files: Raw text of each document.

    Returns:
        A set of ``(name_a, name_b, score)`` tuples, one per compared pair,
        with the two names in sorted order.  The set is empty when fewer
        than two documents are supplied.
    """
    # Materialise both inputs so they can be measured and sliced even when
    # the caller passes a generator.
    list_of_files = list(list_of_files)
    contents_of_files = list(contents_of_files)

    results = set()
    # With fewer than two documents there is no pair to score, so skip the
    # vectorizer entirely instead of fitting it on an empty/singleton corpus.
    if min(len(list_of_files), len(contents_of_files)) < 2:
        return results

    vectors = vectorize(contents_of_files)
    s_vectors = list(zip(list_of_files, vectors))
    for position, (file_x, text_vector_x) in enumerate(s_vectors):
        # Compare only against later entries: each unordered pair is scored
        # exactly once, and the current entry is excluded by position rather
        # than by an equality search over tuples that contain numpy rows.
        later_entries = s_vectors[position + 1:]
        results = loop_comparaison(file_x, text_vector_x, later_entries, results)
    return results
if __name__ == "__main__":
    # Directory holding the .txt documents to compare against each other.
    dir_ = "/home/d4rk3r/ACTUALC/vagrant/PYTHON/github/test_plagiat"
    list_of_files = [os_path.join(dir_, doc) for doc in os_listdir(dir_) if doc.endswith('.txt')]

    # Read each document inside a context manager so every file handle is
    # closed as soon as its content has been loaded.
    contents_of_files = []
    for file_path in list_of_files:
        with open(file_path) as handle:
            contents_of_files.append(handle.read())

    # Print one (file_a, file_b, score) tuple per compared pair.
    for data in plagon_core(list_of_files, contents_of_files):
        print(data)
# results:
# ['tt2.txt', 'tt3.txt'] => 0.050632398572142946 (5%)
# ['tt.txt', 'tt3.txt'] => 0.06448929199938869 (6%)
# ['tt.txt', 'tt2.txt'] => 0.7492151128400741 (74%)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment