Last active
November 26, 2018 06:52
-
-
Save HemantNegi/0b181ce30cb6faf1c83e2a28b188b033 to your computer and use it in GitHub Desktop.
find_plagiarism: flag plagiarism if more than a given % of words match.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# Sample input for the program
# Format: the first line is "<student count> <threshold %>"; each student is a
# "<roll no> <line count>" header followed by that many lines of text.
# 7 60
# 201305641 2
# The quick brown fox jumps over the lazy dog
# the quick brown fox jumps over the lazy dog the
# 201305581 2
# The leisurely fox leisurely jumps over leisurely lazy dog
# the leisurely fox leisurely jumps over the leisurely lazy dog
# 201305051 2
# The leisurely fox leisurely jumps over the leisurely lazy hound
# The leisurely fox leisurely jumps over the leisurely lazy hound
# 201205874 1
# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
# 201305111 1
# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
# 201305532 1
# Post no so what deal evil rent by real in
# 201305182 1
# Post so much what meal evil sent sly real in
#
# Required output:
# [[201205874, 201305111], [201305051, 201305581, 201305641]]
#
# Note: 3 test cases are passing.
from collections import Counter
# import ipdb
class Student(object):
    """One student's submission: a roll number plus a word-frequency table."""

    def __init__(self, roll_no):
        """Create a student with an empty word table.

        Args:
            roll_no: Identifier of the student; used for sorting and output.
        """
        # Starts empty; callers add words with ``text.update(...)``.
        self.text = Counter()
        self.roll_no = roll_no
        # Not read by any code visible in this file; kept so existing
        # users of the attribute keep working.
        self.visited = False

    def __repr__(self):
        # The 1-tuple keeps %-formatting safe even if roll_no is a tuple.
        return '<Student %s>' % (self.roll_no,)
def find_plagiarism(students, threshold):
    """Group students whose submissions plagiarise one another.

    Two students belong to the same group when they are linked, directly or
    through a chain of other students, by pairs for which
    ``has_plagiarism(a, b, threshold)`` is true (connected components).

    Args:
        students: List of objects with ``roll_no`` and ``text`` attributes.
            The list is sorted in place by ``roll_no``.
        threshold: Percentage forwarded unchanged to ``has_plagiarism``.

    Returns:
        A list of groups, each a list of roll numbers in ascending order
        with no duplicates. Groups are ordered by their smallest roll
        number. Students that match nobody are omitted.
    """
    students.sort(key=lambda x: x.roll_no)
    count = len(students)

    # Union-find over student indices. The smaller index is always kept as
    # the root, so a root is also the smallest roll number of its group.
    parent = list(range(count))

    def find(index):
        # Walk up to the root, halving the path as we go.
        while parent[index] != index:
            parent[index] = parent[parent[index]]
            index = parent[index]
        return index

    for i in range(count):
        for j in range(i + 1, count):
            root_i, root_j = find(i), find(j)
            # Students already known to be in one group need no comparison.
            if root_i != root_j and has_plagiarism(students[i], students[j],
                                                   threshold):
                parent[max(root_i, root_j)] = min(root_i, root_j)

    groups = {}
    for index, student in enumerate(students):
        # Indices are visited in sorted order, so each group stays sorted.
        groups.setdefault(find(index), []).append(student.roll_no)

    return [groups[root] for root in sorted(groups) if len(groups[root]) > 1]
def has_plagiarism(student1, student2, threshold):
    """Tell whether two submissions share at least ``threshold`` % of words.

    A word counts as matching when both students use it and the smaller of
    the two occurrence counts is at least ``threshold`` percent of the
    larger. The submissions are plagiarised when the matching words make up
    at least ``threshold`` percent of all distinct words used by either.

    Args:
        student1: Object whose ``text`` maps word -> occurrence count.
        student2: Same as ``student1``.
        threshold: Percentage (0-100 scale) used for both comparisons.

    Returns:
        True if the pair reaches the threshold, otherwise False. Two
        submissions with no words at all are never considered plagiarised.
    """
    words1, words2 = student1.text, student2.text
    uniq_count = len(set(words1) | set(words2))
    if not uniq_count:
        # Nothing to compare; also avoids dividing by zero.
        return False

    plag_count = 0
    for word, count1 in words1.items():
        count2 = words2.get(word)
        # Cross-multiplied form of min/max * 100 >= threshold: stays in
        # integers, so exact boundaries are not lost to float rounding.
        if count2 and (min(count1, count2) * 100
                       >= threshold * max(count1, count2)):
            plag_count += 1

    return plag_count * 100 >= threshold * uniq_count
def main():
    """Read submissions from standard input and print the plagiarism groups.

    Input: a first line ``<students_count> <threshold>``, then for every
    student a header line ``<roll_no> <lines_count>`` followed by
    ``lines_count`` lines of text. Words are compared case-insensitively.

    Output: one line per group returned by ``find_plagiarism``, with the
    roll numbers separated by single spaces.
    """
    # Works on both interpreters: Python 2 has raw_input, Python 3 has input.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # split() without an argument tolerates repeated or trailing whitespace,
    # which would otherwise feed an empty string to int().
    students_count, threshold = map(int, read_line().split())

    students = []
    for _student in range(students_count):
        roll_no, lines_count = map(int, read_line().split())
        student = Student(roll_no)
        for _line in range(lines_count):
            student.text.update(read_line().lower().split())
        students.append(student)

    for group in find_plagiarism(students, threshold):
        # A single parenthesised string prints the same on Python 2 and 3.
        print(' '.join(map(str, group)))
if __name__ == '__main__':
    # Run the command-line entry point only when executed as a script.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment