Skip to content

Instantly share code, notes, and snippets.

@HemantNegi
Last active November 26, 2018 06:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save HemantNegi/0b181ce30cb6faf1c83e2a28b188b033 to your computer and use it in GitHub Desktop.
Save HemantNegi/0b181ce30cb6faf1c83e2a28b188b033 to your computer and use it in GitHub Desktop.
find_plagiarism: flags plagiarism if more than a given % of words match.
#
# Sample input for the program
# 7 60
# 201305641 2
# The quick brown fox jumps over the lazy dog
# the quick brown fox jumps over the lazy dog the
# 201305581 2
# The leisurely fox leisurely jumps over leisurely lazy dog
# the leisurely fox leisurely jumps over the leisurely lazy dog
# 201305051 2
# The leisurely fox leisurely jumps over the leisurely lazy hound
# The leisurely fox leisurely jumps over the leisurely lazy hound
# 201205874 1
# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
# 201305111 1
# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
# 201305532 1
# Post no so what deal evil rent by real in
# 201305182 1
# Post so much what meal evil sent sly real in
#
# Required Output:
# [[201205874, 201305111], [201305051, 201305581, 201305641]]
#
# Note: 3 test cases are passing.
from collections import Counter
# import ipdb
class Student(object):
    """A single student's submission, identified by roll number.

    Attributes:
        roll_no: The identifier supplied to the constructor.
        text: A ``Counter`` that starts empty; the caller fills it with
            word -> occurrence-count entries.
        visited: Boolean flag initialised to ``False``.
    """

    def __init__(self, roll_no):
        self.roll_no = roll_no
        # Word frequencies start empty and are accumulated by the caller.
        self.text = Counter()
        self.visited = False

    def __repr__(self):
        template = '<Student %s>'
        return template % self.roll_no
def find_plagiarism(students, threshold, is_match=None):
    """Group students whose submissions are linked by plagiarism.

    Two students belong to the same group when they match each other
    directly or through a chain of other matching students.

    Args:
        students: List of objects exposing ``roll_no``. The list is sorted
            in place by ``roll_no`` (as the original implementation did).
        threshold: Percentage passed through unchanged to the predicate.
        is_match: Optional callable ``(student_a, student_b, threshold)``
            returning a truthy value when the pair matches. Defaults to
            the module-level ``has_plagiarism``.

    Returns:
        A list of groups. Each group is a list of at least two roll
        numbers in ascending order, without duplicates; groups are ordered
        by their lowest roll number. Students that match nobody are
        omitted.
    """
    if is_match is None:
        is_match = has_plagiarism

    students.sort(key=lambda student: student.roll_no)

    # Disjoint-set forest over positions in the sorted list. The previous
    # bookkeeping (a list of groups plus a roll_no -> index map) stored the
    # wrong index, appended members twice and never merged two existing
    # groups; a union-find structure avoids all three problems.
    parent = list(range(len(students)))

    def find(index):
        # Follow parent links to the root, halving the path on the way.
        while parent[index] != index:
            parent[index] = parent[parent[index]]
            index = parent[index]
        return index

    for i, first in enumerate(students):
        for j in range(i + 1, len(students)):
            if is_match(first, students[j], threshold):
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    # The smaller position always becomes the root, so a
                    # group's root is its lowest roll number.
                    parent[max(root_i, root_j)] = min(root_i, root_j)

    groups = {}
    for i, student in enumerate(students):
        groups.setdefault(find(i), []).append(student.roll_no)

    # Roots are positions in the sorted list, so sorting them orders the
    # groups by lowest roll number. Singletons are not plagiarism groups.
    return [groups[root] for root in sorted(groups) if len(groups[root]) > 1]
def has_plagiarism(student1, student2, threshold):
    """Decide whether two students' word counts overlap enough.

    A word counts as matching when both students used it and the smaller
    of the two occurrence counts is at least ``threshold`` percent of the
    larger one. The pair is reported when the matching words make up at
    least ``threshold`` percent of all distinct words used by either
    student.

    Args:
        student1: Object whose ``text`` attribute maps word -> count.
        student2: Object whose ``text`` attribute maps word -> count.
        threshold: Required percentage (e.g. ``60`` for 60%).

    Returns:
        ``True`` if the overlap reaches the threshold, otherwise ``False``.
        Two empty submissions are never reported.
    """
    counts1, counts2 = student1.text, student2.text
    vocabulary = set(counts1) | set(counts2)
    if not vocabulary:
        # Nothing to compare; this also avoids dividing by zero.
        return False

    matching_words = 0
    for word, count1 in counts1.items():
        count2 = counts2.get(word)
        if not count2:
            continue
        # Cross-multiplied form of min/max * 100 >= threshold. Staying in
        # exact arithmetic avoids float rounding at the boundary (for
        # instance 29/50 * 100 evaluates to 57.99999999999999).
        if 100 * min(count1, count2) >= threshold * max(count1, count2):
            matching_words += 1

    return 100 * matching_words >= threshold * len(vocabulary)
def main():
    """Read submissions from standard input and print plagiarism groups.

    Expected input (fields separated by whitespace):
      * first line: ``<number of students> <threshold percent>``
      * for each student: a header line ``<roll number> <line count>``
        followed by that many lines of text.

    Words are lower-cased before counting. Each group returned by
    ``find_plagiarism`` is printed on its own line as space-separated
    roll numbers.

    Uses ``raw_input``, so this entry point targets Python 2.
    """
    # split() without an argument tolerates repeated or trailing
    # whitespace; split(' ') produced empty fields that int() rejects.
    students_count, threshold = map(int, raw_input().split())

    students = []
    for _ in range(students_count):
        roll_no, lines_count = map(int, raw_input().split())
        student = Student(roll_no)
        for _line in range(lines_count):
            # Lower-case the line, then count its whitespace-separated words.
            student.text.update(raw_input().lower().split())
        students.append(student)

    for group in find_plagiarism(students, threshold):
        print(' '.join(map(str, group)))
# Run the command-line entry point only when this file is executed as a
# script, so importing the module has no side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment