# HemantNegi/find_plagiarism.py

## find_plagiarism.py
#
# Sample input for the program
# 7 60
# 201305641 2
# The quick brown fox jumps over the lazy dog
# the quick brown fox jumps over the lazy dog the
# 201305581 2
# The leisurely fox leisurely jumps over leisurely lazy dog
# the leisurely fox leisurely jumps over the leisurely lazy dog
# 201305051 2
# The leisurely fox leisurely jumps over the leisurely lazy hound
# The leisurely fox leisurely jumps over the leisurely lazy hound
# 201205874 1
# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
# 201305111 1
# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
# 201305532 1
# Post no so what deal evil rent by real in
# 201305182 1
# Post so much what meal evil sent sly real in
#
# Required Output:
# [[201205874, 201305111], [201305051, 201305581, 201305641]]
#
# Note: 3 test cases are passing.

from collections import Counter
# import ipdb


class Student(object):
    """A student's submission: a roll number plus per-word occurrence counts."""

    def __init__(self, roll_no):
        # Identifier of the student, stored exactly as passed in.
        self.roll_no = roll_no
        # Starts out False; the code visible here never reads or changes it.
        self.visited = False
        # word -> number of occurrences; filled by callers via Counter.update().
        self.text = Counter()

    def __repr__(self):
        template = '<Student %s>'
        return template % self.roll_no


def find_plagiarism(students, threshold):
    """Group students whose submissions are linked by plagiarism.

    Every pair of students is compared with ``has_plagiarism``. Two students
    end up in the same group when they are connected through a chain of
    matching pairs: if A matches B and B matches C, then A, B and C form one
    group even when A and C do not match directly.

    ``students`` is sorted in place by ``roll_no`` as a side effect.

    Returns a list of groups. Each group is a list of roll numbers in
    ascending order, and groups are ordered by their smallest roll number.
    Students that match nobody are not reported.
    """
    students.sort(key=lambda x: x.roll_no)
    count = len(students)

    # Disjoint-set forest over positions in the sorted list. Every student
    # starts as the leader of a set containing only itself.
    leader = list(range(count))

    def find(pos):
        # Walk up to the set leader, shortening the path along the way.
        while leader[pos] != pos:
            leader[pos] = leader[leader[pos]]
            pos = leader[pos]
        return pos

    for i in range(count):
        for j in range(i + 1, count):
            if has_plagiarism(students[i], students[j], threshold):
                root_i, root_j = find(i), find(j)
                if root_i != root_j:
                    # Merging whole sets (rather than appending one roll
                    # number) keeps members unique and joins two groups that
                    # were discovered separately. The smaller position stays
                    # leader, so a leader is always its group's lowest roll.
                    leader[max(root_i, root_j)] = min(root_i, root_j)

    # Visiting positions in sorted order makes every group come out sorted.
    groups = {}
    for pos in range(count):
        groups.setdefault(find(pos), []).append(students[pos].roll_no)

    # A set of size one means the student matched nobody.
    return [groups[root] for root in sorted(groups) if len(groups[root]) > 1]

def has_plagiarism(student1, student2, threshold):
    """Return True when two submissions are similar enough to be flagged.

    A word counts as copied when both students used it and the smaller of
    the two occurrence counts is at least ``threshold`` percent of the
    larger one. The pair is flagged when the copied words make up at least
    ``threshold`` percent of all distinct words used by either student.

    Two submissions that contain no words at all are never flagged.
    """
    words1, words2 = student1.text, student2.text

    uniq_count = len(set(words1) | set(words2))
    if not uniq_count:
        # Nothing to compare; this also avoids a division by zero.
        return False

    plag_count = 0
    for key, s1 in words1.items():
        s2 = words2.get(key)
        if s2:
            # Cross-multiplied form of "min / max * 100 >= threshold".
            # Staying in integers avoids float rounding at the boundary
            # (0.29 * 100 evaluates to 28.999999999999996).
            if min(s1, s2) * 100 >= threshold * max(s1, s2):
                plag_count += 1

    return plag_count * 100 >= threshold * uniq_count


def main():
    """Read submissions from standard input and print the plagiarism groups.

    Expected input: a header line ``<student count> <threshold>``, then for
    each student a line ``<roll no> <line count>`` followed by that many
    lines of text. Words are lower-cased before they are counted.

    Output: one line per group returned by ``find_plagiarism``, holding the
    group's roll numbers separated by single spaces.
    """
    # raw_input only exists on Python 2; input is the same function on 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # split() without an argument ignores repeated and trailing whitespace.
    # split(' ') would produce empty strings there, which int() rejects.
    students_count, threshold = map(int, read_line().split())

    students = []
    for _ in range(students_count):
        roll_no, lines_count = map(int, read_line().split())

        s = Student(roll_no)
        for _ in range(lines_count):
            s.text.update(read_line().lower().split())
        students.append(s)

    for x in find_plagiarism(students, threshold):
        # Parenthesised single argument prints the same on Python 2 and 3.
        print(' '.join(map(str, x)))


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()
	#
	# Sample input for the program
	# 7 60
	# 201305641 2
	# The quick brown fox jumps over the lazy dog
	# the quick brown fox jumps over the lazy dog the
	# 201305581 2
	# The leisurely fox leisurely jumps over leisurely lazy dog
	# the leisurely fox leisurely jumps over the leisurely lazy dog
	# 201305051 2
	# The leisurely fox leisurely jumps over the leisurely lazy hound
	# The leisurely fox leisurely jumps over the leisurely lazy hound
	# 201205874 1
	# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
	# 201305111 1
	# Lorem ipsum dolor sit amet consectetur adipiscing elit Nullam imperdiet
	# 201305532 1
	# Post no so what deal evil rent by real in
	# 201305182 1
	# Post so much what meal evil sent sly real in
	#
	# Required Output:
	# [[201205874, 201305111], [201305051, 201305581, 201305641]]
	#
	# Note 3 test cases are passing.

	from collections import Counter
	# import ipdb


	class Student(object):
		"""A student's submission: a roll number plus per-word occurrence counts."""

		def __init__(self, roll_no):
			# word -> number of occurrences; filled by callers via Counter.update().
			self.text = Counter()
			# Identifier of the student, stored exactly as passed in.
			self.roll_no = roll_no
			# Starts out False; the code visible here never reads or changes it.
			self.visited = False

		def __repr__(self):
			return '<Student %s>' % self.roll_no


	def find_plagiarism(students, threshold):
		"""Group students whose submissions are linked by plagiarism.

		Every pair of students is compared with ``has_plagiarism``. Two students
		end up in the same group when they are connected through a chain of
		matching pairs: if A matches B and B matches C, then A, B and C form one
		group even when A and C do not match directly.

		``students`` is sorted in place by ``roll_no`` as a side effect.

		Returns a list of groups. Each group is a list of roll numbers in
		ascending order, and groups are ordered by their smallest roll number.
		Students that match nobody are not reported.
		"""
		students.sort(key=lambda x: x.roll_no)
		count = len(students)

		# Disjoint-set forest over positions in the sorted list. Every student
		# starts as the leader of a set containing only itself.
		leader = list(range(count))

		def find(pos):
			# Walk up to the set leader, shortening the path along the way.
			while leader[pos] != pos:
				leader[pos] = leader[leader[pos]]
				pos = leader[pos]
			return pos

		for i in range(count):
			for j in range(i + 1, count):
				if has_plagiarism(students[i], students[j], threshold):
					root_i, root_j = find(i), find(j)
					if root_i != root_j:
						# Merging whole sets keeps members unique and joins
						# groups that were discovered separately. The smaller
						# position stays leader, so a leader is always its
						# group's lowest roll number.
						leader[max(root_i, root_j)] = min(root_i, root_j)

		# Visiting positions in sorted order makes every group come out sorted.
		groups = {}
		for pos in range(count):
			groups.setdefault(find(pos), []).append(students[pos].roll_no)

		# A set of size one means the student matched nobody.
		return [groups[root] for root in sorted(groups) if len(groups[root]) > 1]

	def has_plagiarism(student1, student2, threshold):
		"""Return True when two submissions are similar enough to be flagged.

		A word counts as copied when both students used it and the smaller of
		the two occurrence counts is at least ``threshold`` percent of the
		larger one. The pair is flagged when the copied words make up at least
		``threshold`` percent of all distinct words used by either student.

		Two submissions that contain no words at all are never flagged.
		"""
		words1, words2 = student1.text, student2.text

		uniq_count = len(set(words1) | set(words2))
		if not uniq_count:
			# Nothing to compare; this also avoids a division by zero.
			return False

		plag_count = 0
		for key, s1 in words1.items():
			s2 = words2.get(key)
			if s2:
				# Cross-multiplied form of "min / max * 100 >= threshold".
				# Staying in integers avoids float rounding at the boundary
				# (0.29 * 100 evaluates to 28.999999999999996).
				if min(s1, s2) * 100 >= threshold * max(s1, s2):
					plag_count += 1

		return plag_count * 100 >= threshold * uniq_count



	def main():
		"""Read submissions from standard input and print the plagiarism groups.

		Expected input: a header line ``<student count> <threshold>``, then for
		each student a line ``<roll no> <line count>`` followed by that many
		lines of text. Words are lower-cased before they are counted.

		Output: one line per group returned by ``find_plagiarism``, holding the
		group's roll numbers separated by single spaces.
		"""
		# raw_input only exists on Python 2; input is the same function on 3.
		try:
			read_line = raw_input
		except NameError:
			read_line = input

		# split() without an argument ignores repeated and trailing whitespace.
		# split(' ') would produce empty strings there, which int() rejects.
		students_count, threshold = map(int, read_line().split())

		students = []
		for _ in range(students_count):
			roll_no, lines_count = map(int, read_line().split())

			s = Student(roll_no)
			for _ in range(lines_count):
				s.text.update(read_line().lower().split())
			students.append(s)

		for x in find_plagiarism(students, threshold):
			# Parenthesised single argument prints the same on Python 2 and 3.
			print(' '.join(map(str, x)))



	# Run the command-line entry point only when this file is executed as a
	# script, not when it is imported as a module.
	if __name__ == '__main__':
		main()