# Phoenix-Effect/exam-matcher.py (secret gist)

## exam-matcher.py
from docx import Document
import re
import pprint

# Line-classification patterns.  Raw strings keep the regex escapes (\d, \s, \.)
# from being read as (invalid) Python string escapes, and the separator dot is
# escaped so that only a literal "." after the number/letter counts -- the
# unescaped form also matched lines such as "12 apples" or "an apple".
questions = re.compile(r'^\d{1,2}\.\s')  # question line, e.g. "12. What is ...?"
answers = re.compile(r'^[a-z]\.\s')  # MCQ answer line, e.g. "a. Paris"
info = re.compile(r'^(.*?)\..*')  # group 1: number/letter before the first "."
text = re.compile(r'^.*?\.(.*)')  # group 1: question/answer text after the first "."
pp = pprint.PrettyPrinter(indent=4)  # pretty print for debugging


def parseExam(doc):
    """Collect every question paragraph of a Word document.

    `doc` must expose a `paragraphs` sequence whose items have a `text`
    attribute (as the python-docx `Document` objects passed in below do).
    Each paragraph whose text matches the `questions` pattern yields one
    dict holding the question text, the paragraph index ("line") and the
    question number.
    """
    return [
        {
            "question": questionText(para.text),
            "line": position,
            "number": questionNum(para.text),
        }
        for position, para in enumerate(doc.paragraphs)
        if questions.match(para.text)
    ]


def questionNum(question):
    """Return the number/letter prefix of a question or answer line.

    The prefix is whatever the `info` pattern captures in its first group
    (the text before the first ".").  Raises AttributeError when the line
    does not match `info`.
    """
    return info.match(question).group(1)


def questionText(question):
    """Return the text of a question or answer line without its prefix.

    The `text` pattern captures everything after the first "." on the line;
    surrounding whitespace is stripped.  Raises AttributeError when the line
    does not match `text`.
    """
    body = text.match(question).group(1)
    return body.strip()


def matchExams(exam1, exam2):
    """Pair each question of `exam1` with its number in `exam2`.

    Returns a list of (number in exam1, number in exam2) tuples in the order
    of `exam1`.  The second element is -1 when `findQuestionNum` cannot find
    the question text in `exam2`.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]


def findQuestionNum(exam, question):
    """Return the number of the first entry of `exam` whose text equals `question`.

    `exam` is a list of dicts with "question" and "number" keys.  Returns -1
    when no entry matches (including when `exam` is empty).
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)

# open up the documents (paths are relative to the current working directory)
exam1 = Document('data/Test1sampleA.docx') # path to first exam
exam2 = Document('data/Test1sampleB.docx') # path to second exam

# parse the documents into lists of question dicts
parsed1 = parseExam(exam1)
parsed2 = parseExam(exam2)


# Pair every question of exam 1 with its counterpart in exam 2.
# Each tuple is (question number in exam 1, question number in exam 2);
# the second element is -1 when the question wasn't found in exam 2.
matched = matchExams(parsed1, parsed2)

# debugging aids: dump the parsed exams
# pp.pprint(parsed1)
# pp.pprint(parsed2)

print(matched)
print("done")

## exam-matcherv2.py
from docx import Document
import mammoth
import re
import pprint


# Line-classification patterns.  Raw strings keep the regex escapes (\d, \s, \.)
# from being read as (invalid) Python string escapes, and the separator dot is
# escaped so that only a literal "." after the number/letter counts -- the
# unescaped form also matched lines such as "12 apples" or "an apple".
questions = re.compile(r'^\d{1,2}\.\s')  # question line, e.g. "12. What is ...?"
answers = re.compile(r'^[a-z]\.\s')  # MCQ answer line, e.g. "a. Paris"
info = re.compile(r'^(.*?)\..*')  # group 1: number/letter before the first "."
text = re.compile(r'^.*?\.(.*)')  # group 1: question/answer text after the first "."
parseHtml = re.compile('<li>(((?!<li>).)*)<ol>')  # group 1: <li> content up to its nested <ol>
pp = pprint.PrettyPrinter(indent=4)  # pretty print for debugging

file1 = 'data/sampletest.docx'  # path to first exam
file2 = 'data/Test1sampleA.docx'  # path to second exam

def parseExamDocx(file):
    """Parse an exam with python-docx and return its question entries.

    Opens `file` with `Document` and walks its paragraphs.  Each paragraph
    whose text matches the `questions` pattern yields one dict holding the
    question text, the paragraph index ("line") and the question number.
    The list is empty when no paragraph matches.
    """
    paragraphs = Document(file).paragraphs
    return [
        {
            "question": questionText(para.text),
            "line": position,
            "number": questionNum(para.text),
        }
        for position, para in enumerate(paragraphs)
        if questions.match(para.text)
    ]

def cleanHTML(text):
    """Remove every "<...>" tag from `text` and strip surrounding whitespace."""
    tag = re.compile('<[^<]+?>')
    return tag.sub('', text).strip()

def parseExamHTML(file):
    """Parse an exam by converting the .docx to HTML with mammoth.

    Every "<li>...<ol>" span (a list item followed by a nested ordered list)
    is treated as one question.  Each entry holds the tag-free question text
    and, as "number", the 0-based position of the match in the document.
    """
    with open(file, "rb") as docx_file:
        html = mammoth.convert_to_html(docx_file).value  # the generated HTML
    found = re.findall('<li>(((?!<li>).)*)<ol>', html)
    # findall yields one tuple per match; element 0 is the outer capture group.
    return [
        {"question": cleanHTML(groups[0]), "number": position}
        for position, groups in enumerate(found)
    ]

def parseExam(file):
    """Parse an exam file, trying python-docx first and HTML as a fallback.

    The HTML parser is used only when the paragraph-based parser returns no
    questions; a blank line is printed before falling back.
    """
    exam = parseExamDocx(file)
    if exam:
        return exam
    print()
    return parseExamHTML(file)


def questionNum(question):
    """Return the number/letter prefix of a question or answer line.

    The prefix is whatever the `info` pattern captures in its first group
    (the text before the first ".").  Raises AttributeError when the line
    does not match `info`.
    """
    return info.match(question).group(1)


def questionText(question):
    """Return the text of a question or answer line without its prefix.

    The `text` pattern captures everything after the first "." on the line;
    surrounding whitespace is stripped.  Raises AttributeError when the line
    does not match `text`.
    """
    body = text.match(question).group(1)
    return body.strip()


def matchExams(exam1, exam2):
    """Pair each question of `exam1` with its number in `exam2`.

    Returns a list of (number in exam1, number in exam2) tuples in the order
    of `exam1`.  The second element is -1 when `findQuestionNum` cannot find
    the question text in `exam2`.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]


def findQuestionNum(exam, question):
    """Return the number of the first entry of `exam` whose text equals `question`.

    `exam` is a list of dicts with "question" and "number" keys.  Returns -1
    when no entry matches (including when `exam` is empty).
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)

###
### MAIN PROGRAM
###

# parse the documents
# NOTE: only the first exam is parsed here; the second parse and the matching
# step below are commented out, so this script just dumps exam 1.
parsed1 = parseExam(file1)
# parsed2 = parseExam(exam2)

# tells you which question of exam 1 matches which one in exam 2.
# each tuple is (question number in exam 1, question number in exam 2);
# the second element is -1 when the question wasn't found in exam 2.
# matched = matchExams(parsed1, parsed2)

pp.pprint(parsed1)
# pp.pprint(parsed2)

# print(matched)
print("done")

## exam-matcherv3.py
from docx2python import docx2python
import re
import pprint
import os


# Line-classification patterns.  Raw strings keep the regex escapes (\d, \))
# from being read as (invalid) Python string escapes.  `questions` requires
# the literal ")" that `info`/`text` split on; the previous bare "." accepted
# any line starting with a digit plus one more character (e.g. "2018 Fall").
questions = re.compile(r'^\d{1,2}\)')  # question line, e.g. "12)What is ...?"
answerRe = re.compile(r'^[a-z]\)')  # MCQ answer line, e.g. "a)Paris"
info = re.compile(r'^(.*?)\).*')  # group 1: number/letter before the first ")"
text = re.compile(r'^.*?\)(.*)')  # group 1: question/answer text after the first ")"
pp = pprint.PrettyPrinter(indent=4)  # pretty print for debugging

file1 = 'data/I2Fall18T1A.docx'  # path to first exam
file2 = 'data/I2Fall18T1B.docx'  # path to second exam

def parseExam(file):
    """Parse an exam file with docx2python and return its question entries.

    The document text is cleaned with `cleanexam` and split into lines.  Each
    line matching the `questions` pattern yields one dict holding the question
    text, the question number and the sorted list of its answer texts (as
    returned by `getAnswers` for that line's index).
    """
    lines = cleanexam(docx2python(file).text).splitlines()
    return [
        {
            "question": questionText(row),
            "number": questionNum(row),
            "answers": sorted(getAnswers(position, lines)),
        }
        for position, row in enumerate(lines)
        if questions.match(row)
    ]
    # doc = Document(file)
    # pExam = []
    # for index, paragraph in enumerate(doc.paragraphs):  # go over each paragraph in the document
    #     print(paragraph.style)
    #     if re.match(questions, paragraph.text):  # find all questions from the exam
    #         pExam.append({
    #             "question": questionText(paragraph.text),
    #             "line": index,
    #             "number": questionNum(paragraph.text)
    #         })
    # return pExam

def getAnswers(i, arr):
    """Return the answer texts of the first answer run found after line `i`.

    Scans `arr` forward from the line after index `i`, skips lines that do
    not match `answerRe`, then collects the text (via `questionText`) of each
    consecutive line that does.  Both scans stop at the end of `arr`, so a
    question with no answer line after it yields an empty list instead of
    raising IndexError.
    """
    i += 1
    last = len(arr)

    # Skip everything between the question line and its first answer line.
    while i < last and not answerRe.match(arr[i]):
        i += 1

    answers = []
    while i < last and answerRe.match(arr[i]):
        answers.append(questionText(arr[i]))
        i += 1
    return answers

def cleanexam(text):
    """Strip all tab characters and drop empty lines from `text`.

    The remaining lines are re-joined with `os.linesep`.  A line counts as
    empty only if nothing is left once tabs are removed; lines holding other
    whitespace are kept.
    """
    without_tabs = text.replace('\t', '')
    kept = filter(None, without_tabs.splitlines())
    return os.linesep.join(kept)

def questionNum(question):
    """Return the number/letter prefix of a question or answer line.

    The prefix is whatever the `info` pattern captures in its first group
    (the text before the first ")").  Raises AttributeError when the line
    does not match `info`.
    """
    return info.match(question).group(1)


def questionText(question):
    """Return the text of a question or answer line without its prefix.

    The `text` pattern captures everything after the first ")" on the line;
    surrounding whitespace is stripped.  Raises AttributeError when the line
    does not match `text`.
    """
    body = text.match(question).group(1)
    return body.strip()


def matchExams(exam1, exam2):
    """Pair each question of `exam1` with its number in `exam2`.

    Returns a list of (number in exam1, number in exam2) tuples in the order
    of `exam1`.  The whole question entry is handed to `findQuestionNum`; the
    second element is -1 when no matching entry exists in `exam2`.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry))
        for entry in exam1
    ]


def findQuestionNum(exam, question):
    """Return the number of the first entry of `exam` equal to `question`.

    Two entries are equal when both their "question" text and their "answers"
    list compare equal.  Returns -1 when no entry matches (including when
    `exam` is empty).
    """
    hits = (
        entry['number']
        for entry in exam
        if entry['question'] == question['question']
        and entry['answers'] == question['answers']
    )
    return next(hits, -1)

###
### MAIN PROGRAM
###

# parse the documents into lists of question dicts (text, number, sorted answers)
parsed1 = parseExam(file1)
parsed2 = parseExam(file2)

# Pair every question of exam 1 with its counterpart in exam 2.
# Each tuple is (question number in exam 1, question number in exam 2);
# the second element is -1 when the question wasn't found in exam 2.
matched = matchExams(parsed1, parsed2)


# debugging aids: dump the parsed exams
# pp.pprint(parsed1)
# pp.pprint(parsed2)

pp.pprint(matched)
print("done")
	from docx import Document
	import re
	import pprint

	# Line-classification patterns.  The "*" quantifiers lost from `info` and
	# `text` are restored, raw strings keep the regex escapes from being read as
	# Python string escapes, and the separator dot is escaped so that only a
	# literal "." after the number/letter counts.
	questions = re.compile(r'^\d{1,2}\.\s') # question line, e.g. "12. What is ...?"
	answers = re.compile(r'^[a-z]\.\s') # MCQ answer line, e.g. "a. Paris"
	info = re.compile(r'^(.*?)\..*') # group 1: number/letter before the first "."
	text = re.compile(r'^.*?\.(.*)') # group 1: question/answer text after the first "."
	pp = pprint.PrettyPrinter(indent=4) # pretty print for debugging


	def parseExam(doc):
		"""Collect every question paragraph of a Word document.

		`doc` must expose a `paragraphs` sequence whose items have a `text`
		attribute.  Each paragraph whose text matches the `questions` pattern
		yields one dict holding the question text, the paragraph index
		("line") and the question number.
		"""
		return [
			{
				"question": questionText(para.text),
				"line": position,
				"number": questionNum(para.text),
			}
			for position, para in enumerate(doc.paragraphs)
			if questions.match(para.text)
		]


	def questionNum(question):
		"""Return the number/letter prefix of a question or answer line.

		The prefix is the first group captured by the `info` pattern.  Raises
		AttributeError when the line does not match `info`.
		"""
		return info.match(question).group(1)


	def questionText(question):
		"""Return the text of a question or answer line without its prefix.

		The text is the first group captured by the `text` pattern, with
		surrounding whitespace stripped.  Raises AttributeError when the line
		does not match `text`.
		"""
		body = text.match(question).group(1)
		return body.strip()


	def matchExams(exam1, exam2):
		"""Pair each question of `exam1` with its number in `exam2`.

		Returns a list of (number in exam1, number in exam2) tuples in the
		order of `exam1`.  The second element is -1 when `findQuestionNum`
		cannot find the question text in `exam2`.
		"""
		return [
			(entry['number'], findQuestionNum(exam2, entry['question']))
			for entry in exam1
		]


	def findQuestionNum(exam, question):
		"""Return the number of the first entry of `exam` whose text equals `question`.

		`exam` is a list of dicts with "question" and "number" keys.  Returns
		-1 when no entry matches (including when `exam` is empty).
		"""
		for entry in exam:
			if entry['question'] == question:
				return entry['number']
		return -1

	# open up the documents (paths are relative to the current working directory)
	exam1 = Document('data/Test1sampleA.docx') # path to first exam
	exam2 = Document('data/Test1sampleB.docx') # path to second exam

	# parse the documents into lists of question dicts
	parsed1 = parseExam(exam1)
	parsed2 = parseExam(exam2)


	# Pair every question of exam 1 with its counterpart in exam 2.
	# Each tuple is (question number in exam 1, question number in exam 2);
	# the second element is -1 when the question wasn't found in exam 2.
	matched = matchExams(parsed1, parsed2)

	# debugging aids: dump the parsed exams
	# pp.pprint(parsed1)
	# pp.pprint(parsed2)

	print(matched)
	print("done")
	from docx import Document
	import mammoth
	import re
	import pprint


	# Line-classification patterns.  The "*" quantifiers lost from `info` and
	# `text` are restored, raw strings keep the regex escapes from being read as
	# Python string escapes, and the separator dot is escaped so that only a
	# literal "." after the number/letter counts.
	questions = re.compile(r'^\d{1,2}\.\s') # question line, e.g. "12. What is ...?"
	answers = re.compile(r'^[a-z]\.\s') # MCQ answer line, e.g. "a. Paris"
	info = re.compile(r'^(.*?)\..*') # group 1: number/letter before the first "."
	text = re.compile(r'^.*?\.(.*)') # group 1: question/answer text after the first "."
	parseHtml = re.compile('<li>(((?!<li>).)*)<ol>') # group 1: <li> content up to its nested <ol>
	pp = pprint.PrettyPrinter(indent=4) # pretty print for debugging

	file1 = 'data/sampletest.docx' # path to first exam
	file2 = 'data/Test1sampleA.docx' # path to second exam

	def parseExamDocx(file):
		"""Parse an exam with python-docx and return its question entries.

		Opens `file` with `Document` and walks its paragraphs.  Each paragraph
		whose text matches the `questions` pattern yields one dict holding the
		question text, the paragraph index ("line") and the question number.
		"""
		paragraphs = Document(file).paragraphs
		return [
			{
				"question": questionText(para.text),
				"line": position,
				"number": questionNum(para.text),
			}
			for position, para in enumerate(paragraphs)
			if questions.match(para.text)
		]

	def cleanHTML(text):
		"""Remove every "<...>" tag from `text` and strip surrounding whitespace."""
		without_tags = re.sub('<[^<]+?>', '', text)
		return without_tags.strip()

	def parseExamHTML(file):
		"""Parse an exam by converting the .docx to HTML with mammoth.

		Every "<li>...<ol>" span (a list item followed by a nested ordered
		list) is treated as one question.  Each entry holds the tag-free
		question text and, as "number", the 0-based position of the match.
		"""
		with open(file, "rb") as docx_file:
			html = mammoth.convert_to_html(docx_file).value # the generated HTML
		found = re.findall('<li>(((?!<li>).)*)<ol>', html)
		# findall yields one tuple per match; element 0 is the outer capture group.
		return [
			{"question": cleanHTML(groups[0]), "number": position}
			for position, groups in enumerate(found)
		]

	def parseExam(file):
		"""Parse an exam file, trying python-docx first and HTML as a fallback.

		The HTML parser is used only when the paragraph-based parser returns
		no questions; a blank line is printed before falling back.
		"""
		exam = parseExamDocx(file)
		if exam:
			return exam
		print()
		return parseExamHTML(file)


	def questionNum(question):
		"""Return the number/letter prefix of a question or answer line.

		The prefix is the first group captured by the `info` pattern.  Raises
		AttributeError when the line does not match `info`.
		"""
		return info.match(question).group(1)


	def questionText(question):
		"""Return the text of a question or answer line without its prefix.

		The text is the first group captured by the `text` pattern, with
		surrounding whitespace stripped.  Raises AttributeError when the line
		does not match `text`.
		"""
		body = text.match(question).group(1)
		return body.strip()


	def matchExams(exam1, exam2):
		"""Pair each question of `exam1` with its number in `exam2`.

		Returns a list of (number in exam1, number in exam2) tuples in the
		order of `exam1`.  The second element is -1 when `findQuestionNum`
		cannot find the question text in `exam2`.
		"""
		return [
			(entry['number'], findQuestionNum(exam2, entry['question']))
			for entry in exam1
		]


	def findQuestionNum(exam, question):
		"""Return the number of the first entry of `exam` whose text equals `question`.

		`exam` is a list of dicts with "question" and "number" keys.  Returns
		-1 when no entry matches (including when `exam` is empty).
		"""
		for entry in exam:
			if entry['question'] == question:
				return entry['number']
		return -1

	###
	### MAIN PROGRAM
	###

	# parse the documents
	# NOTE: only the first exam is parsed here; the second parse and the matching
	# step below are commented out, so this script just dumps exam 1.
	parsed1 = parseExam(file1)
	# parsed2 = parseExam(exam2)

	# tells you which question of exam 1 matches which one in exam 2.
	# each tuple is (question number in exam 1, question number in exam 2);
	# the second element is -1 when the question wasn't found in exam 2.
	# matched = matchExams(parsed1, parsed2)

	pp.pprint(parsed1)
	# pp.pprint(parsed2)

	# print(matched)
	print("done")
	from docx2python import docx2python
	import re
	import pprint
	import os


	# Line-classification patterns.  The "*" quantifiers lost from `info` and
	# `text` are restored and raw strings keep the regex escapes from being read
	# as Python string escapes.  `questions` requires the literal ")" that
	# `info`/`text` split on instead of accepting any character after the digits.
	questions = re.compile(r'^\d{1,2}\)') # question line, e.g. "12)What is ...?"
	answerRe = re.compile(r'^[a-z]\)') # MCQ answer line, e.g. "a)Paris"
	info = re.compile(r'^(.*?)\).*') # group 1: number/letter before the first ")"
	text = re.compile(r'^.*?\)(.*)') # group 1: question/answer text after the first ")"
	pp = pprint.PrettyPrinter(indent=4) # pretty print for debugging

	file1 = 'data/I2Fall18T1A.docx' # path to first exam
	file2 = 'data/I2Fall18T1B.docx' # path to second exam

	def parseExam(file):
		"""Parse an exam file with docx2python and return its question entries.

		The document text is cleaned with `cleanexam` and split into lines.
		Each line matching the `questions` pattern yields one dict holding the
		question text, the question number and the sorted list of its answer
		texts (as returned by `getAnswers` for that line's index).
		"""
		lines = cleanexam(docx2python(file).text).splitlines()
		return [
			{
				"question": questionText(row),
				"number": questionNum(row),
				"answers": sorted(getAnswers(position, lines)),
			}
			for position, row in enumerate(lines)
			if questions.match(row)
		]
	# doc = Document(file)
	# pExam = []
	# for index, paragraph in enumerate(doc.paragraphs): # go over each paragraph in the document
	# print(paragraph.style)
	# if re.match(questions, paragraph.text): # find all questions from the exam
	# pExam.append({
	# "question": questionText(paragraph.text),
	# "line": index,
	# "number": questionNum(paragraph.text)
	# })
	# return pExam

	def getAnswers(i, arr):
		"""Return the answer texts of the first answer run found after line `i`.

		Scans `arr` forward from the line after index `i`, skips lines that do
		not match `answerRe`, then collects the text (via `questionText`) of
		each consecutive line that does.  Both scans stop at the end of `arr`,
		so a question with no answer line after it yields an empty list
		instead of raising IndexError.
		"""
		i += 1
		last = len(arr)

		# Skip everything between the question line and its first answer line.
		while i < last and not answerRe.match(arr[i]):
			i += 1

		answers = []
		while i < last and answerRe.match(arr[i]):
			answers.append(questionText(arr[i]))
			i += 1
		return answers

	def cleanexam(text):
		"""Strip all tab characters and drop empty lines from `text`.

		The remaining lines are re-joined with `os.linesep`.  A line counts as
		empty only if nothing is left once tabs are removed; lines holding
		other whitespace are kept.
		"""
		without_tabs = text.replace('\t', '')
		return os.linesep.join(line for line in without_tabs.splitlines() if line)

	def questionNum(question):
		"""Return the number/letter prefix of a question or answer line.

		The prefix is the first group captured by the `info` pattern.  Raises
		AttributeError when the line does not match `info`.
		"""
		return info.match(question).group(1)


	def questionText(question):
		"""Return the text of a question or answer line without its prefix.

		The text is the first group captured by the `text` pattern, with
		surrounding whitespace stripped.  Raises AttributeError when the line
		does not match `text`.
		"""
		body = text.match(question).group(1)
		return body.strip()


	def matchExams(exam1, exam2):
		"""Pair each question of `exam1` with its number in `exam2`.

		Returns a list of (number in exam1, number in exam2) tuples in the
		order of `exam1`.  The whole question entry is handed to
		`findQuestionNum`; the second element is -1 when no matching entry
		exists in `exam2`.
		"""
		return [
			(entry['number'], findQuestionNum(exam2, entry))
			for entry in exam1
		]


	def findQuestionNum(exam, question):
		"""Return the number of the first entry of `exam` equal to `question`.

		Two entries are equal when both their "question" text and their
		"answers" list compare equal.  Returns -1 when no entry matches
		(including when `exam` is empty).
		"""
		for entry in exam:
			same_text = entry['question'] == question['question']
			if same_text and entry['answers'] == question['answers']:
				return entry['number']
		return -1

	###
	### MAIN PROGRAM
	###

	# parse the documents into lists of question dicts (text, number, sorted answers)
	parsed1 = parseExam(file1)
	parsed2 = parseExam(file2)

	# Pair every question of exam 1 with its counterpart in exam 2.
	# Each tuple is (question number in exam 1, question number in exam 2);
	# the second element is -1 when the question wasn't found in exam 2.
	matched = matchExams(parsed1, parsed2)


	# debugging aids: dump the parsed exams
	# pp.pprint(parsed1)
	# pp.pprint(parsed2)

	pp.pprint(matched)
	print("done")