Skip to content

Instantly share code, notes, and snippets.

@Phoenix-Effect
Last active Oct 3, 2019
Embed
What would you like to do?
Parses two exams and reports which questions in the first exam match which questions in the second. V2 also handles documents that use Word's built-in numbered lists.
from docx import Document
import re
import pprint
# Patterns shared by the parsing helpers below. Raw strings keep the regex
# escapes (\d, \s, \.) away from Python's own string-escape processing.

# A question line starts with a one- or two-digit number, a literal dot and
# whitespace, e.g. "12. What is ...". The dot is escaped so that lines such as
# "10 apples ..." are no longer mistaken for questions.
questions = re.compile(r'^\d{1,2}\.\s')
# An MCQ answer line starts with a lowercase letter, a literal dot and
# whitespace, e.g. "a. Paris".
answers = re.compile(r'^[a-z]\.\s')
# Group 1 is everything before the first dot: the question number / answer letter.
info = re.compile(r'^(.*?)\..*')
# Group 1 is everything after the first dot: the question / answer text.
text = re.compile(r'^.*?\.(.*)')
pp = pprint.PrettyPrinter(indent=4)  # pretty printer for debugging output
def parseExam(doc):
    """Collect every question paragraph of a Word document.

    doc is expected to expose a ``paragraphs`` sequence whose items have a
    ``text`` attribute. Each paragraph matching the module-level ``questions``
    pattern becomes a dict with the question text ("question"), the paragraph
    index ("line") and the question number ("number").
    """
    return [
        {
            "question": questionText(para.text),
            "line": position,
            "number": questionNum(para.text),
        }
        for position, para in enumerate(doc.paragraphs)
        if questions.match(para.text)
    ]
def questionNum(question):
    """Return the number (or answer letter) that precedes the first dot.

    Raises AttributeError when the line does not match the ``info`` pattern.
    """
    return info.match(question).group(1)
def questionText(question):
    """Return the text that follows the first dot, stripped of outer whitespace.

    Raises AttributeError when the line does not match the ``text`` pattern.
    """
    found = text.match(question)
    body = found.group(1)
    return body.strip()
def matchExams(exam1, exam2):
    """Pair each question number of exam1 with its number in exam2.

    Returns a list of (number_in_exam1, number_in_exam2) tuples, in exam1
    order; the second element is whatever findQuestionNum reports for that
    question's text.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]
def findQuestionNum(exam, question):
    """Return the number of the first exam entry whose text equals question.

    Returns -1 when no entry in exam has that exact question text.
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)
# Load both exam versions; edit the paths to point at your own documents.
exam1, exam2 = (
    Document('data/Test1sampleA.docx'),  # first exam
    Document('data/Test1sampleB.docx'),  # second exam
)

# Extract the questions from each document.
parsed1, parsed2 = parseExam(exam1), parseExam(exam2)

# Each tuple is (question number in exam 1, question number in exam 2);
# a -1 in the second slot means the question was not found in exam 2.
matched = matchExams(parsed1, parsed2)

# For debugging, dump the parsed exams with pp.pprint(parsed1) / pp.pprint(parsed2).
print(matched)
print("done")
from docx import Document
import mammoth
import re
import pprint
# Patterns shared by the parsing helpers below. Raw strings keep the regex
# escapes (\d, \s, \.) away from Python's own string-escape processing.

# A question line starts with a one- or two-digit number, a literal dot and
# whitespace, e.g. "12. What is ...". The dot is escaped so that lines such as
# "10 apples ..." are no longer mistaken for questions.
questions = re.compile(r'^\d{1,2}\.\s')
# An MCQ answer line starts with a lowercase letter, a literal dot and
# whitespace, e.g. "a. Paris".
answers = re.compile(r'^[a-z]\.\s')
# Group 1 is everything before the first dot: the question number / answer letter.
info = re.compile(r'^(.*?)\..*')
# Group 1 is everything after the first dot: the question / answer text.
text = re.compile(r'^.*?\.(.*)')
# Group 1 is the text between an opening <li> and the next <ol>, as long as no
# other <li> starts in between.
parseHtml = re.compile(r'<li>(((?!<li>).)*)<ol>')
pp = pprint.PrettyPrinter(indent=4)  # pretty printer for debugging output
file1 = 'data/sampletest.docx'  # path to first exam
file2 = 'data/Test1sampleA.docx'  # path to second exam
def parseExamDocx(file):
    """Parse the exam at path `file` with python-docx.

    Every paragraph whose text matches the module-level ``questions`` pattern
    becomes a dict with the question text ("question"), the paragraph index
    ("line") and the question number ("number"). Returns the list of dicts,
    which is empty when no paragraph matches.
    """
    found = []
    for position, para in enumerate(Document(file).paragraphs):
        line = para.text
        if not questions.match(line):
            continue  # not a question paragraph
        found.append({
            "question": questionText(line),
            "line": position,
            "number": questionNum(line),
        })
    return found
def cleanHTML(text):
    """Return the plain text of an HTML fragment.

    Tags are removed first, then character references such as ``&amp;`` or
    ``&lt;`` are decoded so the result can be compared with plain text taken
    from a document, and finally surrounding whitespace is stripped.
    """
    import html  # local import: the file's top-level imports do not include it

    withoutTags = re.sub('<[^<]+?>', '', text)
    # Decode only after the tags are gone so an escaped "&lt;b&gt;" is kept as
    # literal text instead of being stripped as a tag.
    return html.unescape(withoutTags).strip()
def parseExamHTML(file):
    """Parse the exam at path `file` by converting it to HTML with mammoth.

    A question is the text found between an opening ``<li>`` and the next
    ``<ol>`` (see the module-level ``parseHtml`` pattern). Returns a list of
    dicts with the cleaned question text ("question") and its position among
    the matches ("number"). Positions are counted from 1, so the first
    question is number 1 rather than 0.
    """
    with open(file, "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    html = result.value  # the generated HTML
    # findall yields one tuple per match; element 0 is the captured question markup.
    return [
        {"question": cleanHTML(groups[0]), "number": position}
        for position, groups in enumerate(parseHtml.findall(html), start=1)
    ]
def parseExam(file):
    """Parse the exam at path `file`, trying the docx parser first.

    When parseExamDocx finds no questions, a blank line is printed and the
    result of parseExamHTML is returned instead.
    """
    parsed = parseExamDocx(file)
    if parsed:
        return parsed
    # Nothing found by the docx parser: fall back to the HTML-based parser.
    print()
    return parseExamHTML(file)
def questionNum(question):
    """Return the number (or answer letter) that precedes the first dot.

    Raises AttributeError when the line does not match the ``info`` pattern.
    """
    found = info.match(question)
    return found.group(1)
def questionText(question):
    """Return the text that follows the first dot, stripped of outer whitespace.

    Raises AttributeError when the line does not match the ``text`` pattern.
    """
    return text.match(question).group(1).strip()
def matchExams(exam1, exam2):
    """Pair each question number of exam1 with its number in exam2.

    Returns a list of (number_in_exam1, number_in_exam2) tuples, in exam1
    order; the second element is whatever findQuestionNum reports for that
    question's text.
    """
    pairs = []
    for entry in exam1:
        counterpart = findQuestionNum(exam2, entry['question'])
        pairs.append((entry['number'], counterpart))
    return pairs
def findQuestionNum(exam, question):
    """Return the number of the first exam entry whose text equals question.

    Returns -1 when no entry in exam has that exact question text.
    """
    return next(
        (candidate['number'] for candidate in exam
         if candidate['question'] == question),
        -1,
    )
# ---------------------------------------------------------------------------
# Main program
# ---------------------------------------------------------------------------

# Only the first exam is parsed and dumped at the moment. To compare two exams,
# parse file2 the same way and pass both results to matchExams; each resulting
# tuple is (question number in exam 1, question number in exam 2), with -1
# meaning the question was not found in exam 2.
parsed1 = parseExam(file1)

pp.pprint(parsed1)
print("done")
from docx2python import docx2python
import re
import pprint
import os
# Patterns shared by the parsing helpers below. Raw strings keep the regex
# escapes (\d, \)) away from Python's own string-escape processing.

# A question line starts with a one- or two-digit number and a closing
# parenthesis, e.g. "12)What is ...". The parenthesis is matched literally so
# that lines merely starting with a digit ("2019 was ...") are not treated as
# questions; the other patterns below rely on the ")" being present.
questions = re.compile(r'^\d{1,2}\)')
# An MCQ answer line starts with a lowercase letter and a closing parenthesis.
answerRe = re.compile(r'^[a-z]\)')
# Group 1 is everything before the first ")": the question number / answer letter.
info = re.compile(r'^(.*?)\).*')
# Group 1 is everything after the first ")": the question / answer text.
text = re.compile(r'^.*?\)(.*)')
pp = pprint.PrettyPrinter(indent=4)  # pretty printer for debugging output
file1 = 'data/I2Fall18T1A.docx'  # path to first exam
file2 = 'data/I2Fall18T1B.docx'  # path to second exam
def parseExam(file):
    """Parse the exam at path `file` with docx2python.

    The document text is cleaned with cleanexam and split into lines. Every
    line matching the module-level ``questions`` pattern becomes a dict with
    the question text ("question"), its number ("number") and the sorted list
    of answer texts that getAnswers finds after it ("answers").
    """
    lines = cleanexam(docx2python(file).text).splitlines()
    return [
        {
            "question": questionText(row),
            "number": questionNum(row),
            "answers": sorted(getAnswers(position, lines)),
        }
        for position, row in enumerate(lines)
        if questions.match(row)
    ]
# doc = Document(file)
# pExam = []
# for index, paragraph in enumerate(doc.paragraphs): # go over each paragraph in the document
# print(paragraph.style)
# if re.match(questions, paragraph.text): # find all questions from the exam
# pExam.append({
# "question": questionText(paragraph.text),
# "line": index,
# "number": questionNum(paragraph.text)
# })
# return pExam
def getAnswers(i, arr):
    """Return the answer texts that follow the question at index i.

    Starting at the line after arr[i], lines that do not match ``answerRe``
    are skipped; then the text of every consecutive answer line is collected
    (via questionText) until a non-answer line or the end of arr is reached.
    Returns an empty list when no answer line follows the question. Both
    scans are bounds-checked, so reaching the end of arr never raises
    IndexError.
    """
    total = len(arr)
    i += 1
    # Skip anything between the question and its first answer option.
    while i < total and not answerRe.match(arr[i]):
        i += 1
    answers = []
    # Collect the contiguous run of answer lines.
    while i < total and answerRe.match(arr[i]):
        answers.append(questionText(arr[i]))
        i += 1
    return answers
def cleanexam(text):
    """Return text with all tab characters removed and empty lines dropped.

    The remaining lines are joined with os.linesep.
    """
    untabbed = text.replace('\t', '')
    # filter(None, ...) keeps only the non-empty lines.
    return os.linesep.join(filter(None, untabbed.splitlines()))
def questionNum(question):
    """Return the number (or answer letter) that precedes the first ")".

    Raises AttributeError when the line does not match the ``info`` pattern.
    """
    return info.match(question).group(1)
def questionText(question):
    """Return the text that follows the first ")", stripped of outer whitespace.

    Raises AttributeError when the line does not match the ``text`` pattern.
    """
    found = text.match(question)
    return found.group(1).strip()
def matchExams(exam1, exam2):
    """Pair each question number of exam1 with its number in exam2.

    Returns a list of (number_in_exam1, number_in_exam2) tuples, in exam1
    order; the second element is whatever findQuestionNum reports for that
    question entry.
    """
    return [(entry['number'], findQuestionNum(exam2, entry)) for entry in exam1]
def findQuestionNum(exam, question):
    """Return the number of the first exam entry equal to question.

    Two entries are equal when both their "question" text and their
    "answers" value are equal. Returns -1 when exam holds no such entry.
    """
    wantedText = question['question']
    for candidate in exam:
        if candidate['question'] != wantedText:
            continue
        if candidate['answers'] == question['answers']:
            return candidate['number']
    return -1
# ---------------------------------------------------------------------------
# Main program
# ---------------------------------------------------------------------------

# Extract the questions (with their answers) from both exam files.
parsed1, parsed2 = parseExam(file1), parseExam(file2)

# Each tuple is (question number in exam 1, question number in exam 2);
# a -1 in the second slot means the question was not found in exam 2.
matched = matchExams(parsed1, parsed2)

# For debugging, dump the parsed exams with pp.pprint(parsed1) / pp.pprint(parsed2).
pp.pprint(matched)
print("done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment