Parses two exams and reports which questions match between them. V2 also handles documents that use Word's built-in numbered lists.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from docx import Document | |
import re | |
import pprint | |
# Module-level patterns shared by the parsing helpers below.
# They are raw strings so that regex escapes such as \d and \. reach the regex
# engine untouched (non-raw '\d' is an invalid string escape), and the dot that
# follows the question number / answer letter is escaped so it only matches a
# literal "." instead of any character.
questions = re.compile(r'^\d{1,2}\.\s')  # line starts a question, e.g. "12. ..."
answers = re.compile(r'^[a-z]\.\s')  # line starts an mcq answer, e.g. "a. ..."
info = re.compile(r'^(.*?)\..*')  # group 1: the number/letter before the first "."
text = re.compile(r'^.*?\.(.*)')  # group 1: everything after the first "."
pp = pprint.PrettyPrinter(indent=4)  # pretty print for debugging
def parseExam(doc):
    """Build the parsed exam for a Word document.

    Walks ``doc.paragraphs`` in order and keeps every paragraph whose text
    matches the module-level ``questions`` pattern. Each kept paragraph becomes
    a dict with its question text ("question"), its paragraph index ("line")
    and its question number ("number").
    """
    return [
        {
            "question": questionText(para.text),
            "line": position,
            "number": questionNum(para.text),
        }
        for position, para in enumerate(doc.paragraphs)
        if questions.match(para.text)
    ]
def questionNum(question):
    """Return the number/letter of a question or answer line.

    This is group 1 of the module-level ``info`` pattern applied to the line.
    """
    return info.match(question).group(1)
def questionText(question):
    """Return the text of a question line with surrounding whitespace removed.

    This is group 1 of the module-level ``text`` pattern applied to the line.
    """
    body = text.match(question).group(1)
    return body.strip()
def matchExams(exam1, exam2):
    """Pair every question of exam1 with the matching question of exam2.

    Returns one ``(number in exam1, number in exam2)`` tuple per question of
    exam1, in exam1 order. The second element is whatever findQuestionNum
    returns for that question's text.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]
def findQuestionNum(exam, question):
    """Return the number of the first exam entry whose text equals ``question``.

    Returns -1 when no entry of ``exam`` has that question text.
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)
# load the two Word documents to compare
exam1 = Document('data/Test1sampleA.docx')  # first exam
exam2 = Document('data/Test1sampleB.docx')  # second exam

# turn each document into a list of question dicts
parsed1, parsed2 = parseExam(exam1), parseExam(exam2)

# each tuple is (question number in exam 1, question number in exam 2);
# a second value of -1 means the question was not found in exam 2.
matched = matchExams(parsed1, parsed2)

# for debugging, the parsed exams can be dumped with pp.pprint(parsed1) /
# pp.pprint(parsed2)
print(matched)
print("done")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from docx import Document | |
import mammoth | |
import re | |
import pprint | |
# Module-level patterns and settings shared by the functions below.
# The patterns are raw strings so that regex escapes such as \d and \. reach
# the regex engine untouched (non-raw '\d' is an invalid string escape), and
# the dot after the question number / answer letter is escaped so it only
# matches a literal "." instead of any character.
questions = re.compile(r'^\d{1,2}\.\s')  # line starts a question, e.g. "12. ..."
answers = re.compile(r'^[a-z]\.\s')  # line starts an mcq answer, e.g. "a. ..."
info = re.compile(r'^(.*?)\..*')  # group 1: the number/letter before the first "."
text = re.compile(r'^.*?\.(.*)')  # group 1: everything after the first "."
# group 1: contents of an <li> up to the <ol> that follows it, without
# crossing another <li>
parseHtml = re.compile('<li>(((?!<li>).)*)<ol>')
pp = pprint.PrettyPrinter(indent=4)  # pretty print for debugging
file1 = 'data/sampletest.docx'  # path to first exam
file2 = 'data/Test1sampleA.docx'  # path to second exam
def parseExamDocx(file):
    """Parse an exam by reading the paragraphs of a .docx file.

    Opens ``file`` with ``Document`` and keeps every paragraph whose text
    matches the module-level ``questions`` pattern. Each kept paragraph becomes
    a dict with its question text ("question"), its paragraph index ("line")
    and its question number ("number"). The list is empty when no paragraph
    matches.
    """
    doc = Document(file)
    return [
        {
            "question": questionText(para.text),
            "line": position,
            "number": questionNum(para.text),
        }
        for position, para in enumerate(doc.paragraphs)
        if questions.match(para.text)
    ]
def cleanHTML(text):
    """Turn an HTML fragment into plain text.

    Removes every ``<...>`` tag, decodes HTML character references
    (``&amp;`` -> ``&``, ``&lt;`` -> ``<`` ...) and strips surrounding
    whitespace. Decoding matters because generated HTML escapes these
    characters; without it the result would still contain the encoded form
    and could not equal the same text read directly from the document.
    """
    import html  # local import: the file's import block does not provide it

    withoutTags = re.sub(r'<[^<]+?>', '', text)
    # decode only after the tags are gone, so an escaped "&lt;b&gt;" stays
    # visible text rather than being removed as a tag
    return html.unescape(withoutTags).strip()
def parseExamHTML(file):
    """Parse an exam by converting the .docx file to HTML with mammoth.

    A question is the content of an ``<li>`` that is directly followed by an
    ``<ol>``, without crossing another ``<li>``. Each question becomes a dict
    with its tag-free text ("question") and its 0-based position among the
    matches ("number").
    """
    with open(file, "rb") as docx_file:
        html = mammoth.convert_to_html(docx_file).value  # the generated HTML
    # findall yields one tuple per hit; element 0 is the outer capture group
    items = re.findall('<li>(((?!<li>).)*)<ol>', html)
    return [
        {"question": cleanHTML(groups[0]), "number": position}
        for position, groups in enumerate(items)
    ]
def parseExam(file):
    """Parse an exam file, trying the docx parser first and HTML second.

    Returns the result of parseExamDocx when it found any question. Otherwise
    prints an empty line and returns the result of parseExamHTML.
    """
    parsed = parseExamDocx(file)
    if parsed:
        return parsed
    print()
    return parseExamHTML(file)
def questionNum(question):
    """Return the number/letter of a question or answer line.

    This is group 1 of the module-level ``info`` pattern applied to the line.
    """
    return info.match(question).group(1)
def questionText(question):
    """Return the text of a question line with surrounding whitespace removed.

    This is group 1 of the module-level ``text`` pattern applied to the line.
    """
    body = text.match(question).group(1)
    return body.strip()
def matchExams(exam1, exam2):
    """Pair every question of exam1 with the matching question of exam2.

    Returns one ``(number in exam1, number in exam2)`` tuple per question of
    exam1, in exam1 order. The second element is whatever findQuestionNum
    returns for that question's text.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]
def findQuestionNum(exam, question):
    """Return the number of the first exam entry whose text equals ``question``.

    Returns -1 when no entry of ``exam`` has that question text.
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)
###
### MAIN PROGRAM
###

# Only the first exam is parsed and dumped for inspection here.
# Comparing two exams is currently switched off; to enable it, parse the
# second file as well, pass both parsed exams to matchExams and print the
# result. Each tuple is then (number in exam 1, number in exam 2), with -1
# meaning the question was not found in exam 2.
parsed1 = parseExam(file1)
pp.pprint(parsed1)
print("done")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from docx2python import docx2python | |
import re | |
import pprint | |
import os | |
# Module-level patterns and settings shared by the functions below.
# The patterns are raw strings so that regex escapes such as \d and \) reach
# the regex engine untouched (non-raw '\d' is an invalid string escape).
# `questions` requires the closing ")" after the number, consistent with
# `answerRe`, `info` and `text`, which all key on ")". Previously an unescaped
# "." accepted any line starting with a digit plus one more character.
questions = re.compile(r'^\d{1,2}\)')  # line starts a question, e.g. "12)..."
answerRe = re.compile(r'^[a-z]\)')  # line starts an mcq answer, e.g. "a)..."
info = re.compile(r'^(.*?)\).*')  # group 1: the number/letter before the first ")"
text = re.compile(r'^.*?\)(.*)')  # group 1: everything after the first ")"
pp = pprint.PrettyPrinter(indent=4)  # pretty print for debugging
file1 = 'data/I2Fall18T1A.docx'  # path to first exam
file2 = 'data/I2Fall18T1B.docx'  # path to second exam
def parseExam(file):
    """Parse an exam file into a list of question dicts.

    The document text is read with docx2python, passed through cleanexam and
    split into lines. Every line matching the module-level ``questions``
    pattern becomes a dict with its question text ("question"), its number
    ("number") and the sorted list of its answer texts ("answers").
    """
    lines = cleanexam(docx2python(file).text).splitlines()
    return [
        {
            "question": questionText(row),
            "number": questionNum(row),
            # sorted so that the order of the answers does not matter when
            # two questions are compared
            "answers": sorted(getAnswers(position, lines)),
        }
        for position, row in enumerate(lines)
        if questions.match(row)
    ]
def getAnswers(i, arr):
    """Return the texts of the answer choices that belong to a question.

    ``i`` is the index of the question line inside ``arr`` (the list of exam
    lines). Lines after the question are skipped until the first answer line
    (a line matching ``answerRe``); the consecutive run of answer lines that
    follows is returned as a list of their texts.

    Returns an empty list when the end of ``arr`` or the next question line is
    reached before any answer line. Previously the first case raised
    IndexError, and the second returned the answers of a later question.
    """
    total = len(arr)
    i += 1
    # skip whatever sits between the question and its first answer
    while i < total and not answerRe.match(arr[i]):
        if questions.match(arr[i]):
            # the next question starts here, so this one has no answers
            return []
        i += 1
    answers = []
    # bounds are checked explicitly, so running off the end just stops the scan
    while i < total and answerRe.match(arr[i]):
        answers.append(questionText(arr[i]))
        i += 1
    return answers
def cleanexam(text):
    """Return ``text`` without tab characters and without empty lines.

    The remaining lines are joined with ``os.linesep``.
    """
    untabbed = text.replace('\t', '')
    kept = filter(None, untabbed.splitlines())  # drops empty strings only
    return os.linesep.join(kept)
def questionNum(question):
    """Return the number/letter of a question or answer line.

    This is group 1 of the module-level ``info`` pattern applied to the line.
    """
    return info.match(question).group(1)
def questionText(question):
    """Return the text of a question/answer line, stripped of whitespace.

    This is group 1 of the module-level ``text`` pattern applied to the line.
    """
    body = text.match(question).group(1)
    return body.strip()
def matchExams(exam1, exam2):
    """Pair every question of exam1 with the matching question of exam2.

    Returns one ``(number in exam1, number in exam2)`` tuple per question of
    exam1, in exam1 order. The second element is whatever findQuestionNum
    returns when given the whole question dict.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry))
        for entry in exam1
    ]
def findQuestionNum(exam, question):
    """Return the number of the first exam entry equal to ``question``.

    Two entries are equal when both their "question" text and their "answers"
    list are equal. Returns -1 when ``exam`` contains no such entry.
    """
    hits = (
        entry['number']
        for entry in exam
        if entry['question'] == question['question']
        and entry['answers'] == question['answers']
    )
    return next(hits, -1)
###
### MAIN PROGRAM
###

# turn both exam files into lists of question dicts
parsed1, parsed2 = parseExam(file1), parseExam(file2)

# each tuple is (question number in exam 1, question number in exam 2);
# a second value of -1 means the question was not found in exam 2.
matched = matchExams(parsed1, parsed2)

# for debugging, the parsed exams can be dumped with pp.pprint(parsed1) /
# pp.pprint(parsed2)
pp.pprint(matched)
print("done")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment