Skip to content

Instantly share code, notes, and snippets.

@Phoenix-Effect
Last active October 3, 2019 02:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Phoenix-Effect/e2386f00525921385f1a9af1d4d6115d to your computer and use it in GitHub Desktop.
Save Phoenix-Effect/e2386f00525921385f1a9af1d4d6115d to your computer and use it in GitHub Desktop.
Parses two exams and tells you which questions match between them. V2 handles exam versions that use Word's built-in numbered lists.
from docx import Document
import re
import pprint
# Module-level patterns shared by the parsing helpers below.
# They are raw strings so that "\d", "\s" and "\." reach the regex engine
# untouched, and the dot after the number/letter is escaped so that only a
# literal "." qualifies (an unescaped "." accepted lines like "12 apples ...").
questions = re.compile(r'^\d{1,2}\.\s')  # line starts a question, e.g. "12. What is ..."
answers = re.compile(r'^[a-z]\.\s')  # line is an MCQ answer, e.g. "a. Paris"
info = re.compile(r'^(.*?)\..*')  # group 1: everything before the first "." (number/letter)
text = re.compile(r'^.*?\.(.*)')  # group 1: everything after the first "."
pp = pprint.PrettyPrinter(indent=4)  # pretty printer used for debugging output
# takes a word document and returns a parsed exam object
def parseExam(doc):
    """Collect the question paragraphs of an already opened document.

    `doc` must expose a `paragraphs` sequence whose items carry a `text`
    attribute. A paragraph counts as a question when the module-level
    `questions` pattern matches the start of its text.

    Returns a list of dicts, in document order, with the keys "question"
    (question text), "line" (paragraph index) and "number".
    """
    return [
        {
            "question": questionText(para.text),
            "line": position,
            "number": questionNum(para.text),
        }
        for position, para in enumerate(doc.paragraphs)
        if questions.match(para.text)
    ]
# given a question/answer returns its number
def questionNum(question):
    """Return group 1 of the module-level `info` pattern applied to the line.

    Raises AttributeError when the pattern does not match the line.
    """
    return info.match(question).group(1)
# given a question, returns its text
def questionText(question):
    """Return group 1 of the module-level `text` pattern, whitespace-stripped.

    Raises AttributeError when the pattern does not match the line.
    """
    found = text.match(question)
    body = found.group(1)
    return body.strip()
# takes in 2 exams and tells you matching questions of both exams.
def matchExams(exam1, exam2):
    """Pair every question of exam1 with its counterpart in exam2.

    Both arguments are lists of question dicts with "number" and "question"
    keys. Returns a list of (number_in_exam1, result) tuples in exam1 order,
    where result is whatever findQuestionNum reports for the question text.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]
# given an exam and a question, tells you what number it is in the exam
# if it can't find the question in the exam then returns a -1
def findQuestionNum(exam, question):
    """Look up a question text in a parsed exam.

    Returns the "number" of the first entry whose "question" equals the
    given text, or -1 when no entry matches.
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)
# Script entry: runs at import time. Opens both exam documents, parses them
# and prints the question pairing.
exam1 = Document('data/Test1sampleA.docx') # first exam (opened document, not just a path)
exam2 = Document('data/Test1sampleB.docx') # second exam (opened document, not just a path)
# Parse each document into a list of question dicts.
parsed1 = parseExam(exam1)
parsed2 = parseExam(exam2)
# Pair the questions of exam 1 with those of exam 2.
# Each tuple is (question number in exam 1, question number in exam 2).
# A second value of -1 means the question was not found in exam 2.
matched = matchExams(parsed1, parsed2)
# Debug output of the parsed exams (disabled):
# pp.pprint(parsed1)
# pp.pprint(parsed2)
print(matched)
print("done")
from docx import Document
import mammoth
import re
import pprint
# Module-level patterns and input paths for this version of the script.
# The patterns are raw strings so that "\d", "\s" and "\." reach the regex
# engine untouched, and the dot after the number/letter is escaped so that
# only a literal "." qualifies (an unescaped "." accepted "12 apples ...").
questions = re.compile(r'^\d{1,2}\.\s')  # line starts a question, e.g. "12. What is ..."
answers = re.compile(r'^[a-z]\.\s')  # line is an MCQ answer, e.g. "a. Paris"
info = re.compile(r'^(.*?)\..*')  # group 1: everything before the first "." (number/letter)
text = re.compile(r'^.*?\.(.*)')  # group 1: everything after the first "."
# group 1: text between an "<li>" and the next "<ol>" with no other "<li>" in between
parseHtml = re.compile(r'<li>(((?!<li>).)*)<ol>')
pp = pprint.PrettyPrinter(indent=4)  # pretty printer used for debugging output
file1 = 'data/sampletest.docx'  # path to first exam
file2 = 'data/Test1sampleA.docx'  # path to second exam
# takes a word document and returns a parsed exam object
def parseExamDocx(file):
    """Parse an exam by scanning the paragraphs of a .docx file.

    Opens `file` with `Document` and keeps every paragraph whose text
    matches the module-level `questions` pattern.

    Returns a list of dicts, in document order, with the keys "question"
    (question text), "line" (paragraph index) and "number". The list is
    empty when no paragraph matches.
    """
    pExam = []
    for index, paragraph in enumerate(Document(file).paragraphs):
        line = paragraph.text
        if not questions.match(line):
            continue  # not a question paragraph
        pExam.append(dict(
            question=questionText(line),
            line=index,
            number=questionNum(line),
        ))
    return pExam
def cleanHTML(text):
    """Remove markup tags from an HTML fragment and trim the result.

    Every "<...>" span (at least one character between the brackets, none
    of them "<") is deleted; leading and trailing whitespace is stripped.
    """
    tag = re.compile('<[^<]+?>')
    return tag.sub('', text).strip()
# parse using html
def parseExamHTML(file):
    """Parse an exam by converting the .docx file to HTML first.

    The file is converted with `mammoth.convert_to_html`; every piece of
    markup found between an "<li>" and the following "<ol>" (with no other
    "<li>" in between) is taken as one question and cleaned with cleanHTML.

    Returns a list of dicts with the keys "question" and "number".
    NOTE(review): "number" here is the 0-based position of the match (an
    int), unlike the number string taken from the text by parseExamDocx -
    confirm this difference is intended.
    """
    with open(file, "rb") as docx_file:
        converted = mammoth.convert_to_html(docx_file)
    markup = converted.value  # the generated HTML
    found = re.findall('<li>(((?!<li>).)*)<ol>', markup)
    # findall yields one tuple per match; element 0 is the outer group.
    return [
        {"question": cleanHTML(groups[0]), "number": position}
        for position, groups in enumerate(found)
    ]
# try parsing using different methods
def parseExam(file):
    """Parse an exam file, trying the docx parser before the HTML parser.

    Returns the result of parseExamDocx when it is non-empty. Otherwise a
    blank line is printed and the result of parseExamHTML is returned.
    """
    from_docx = parseExamDocx(file)
    if from_docx:
        return from_docx
    print()
    return parseExamHTML(file)
# given a question/answer returns its number
def questionNum(question):
    """Return group 1 of the module-level `info` pattern applied to the line.

    Raises AttributeError when the pattern does not match the line.
    """
    return info.match(question).group(1)
# given a question, returns its text
def questionText(question):
    """Return group 1 of the module-level `text` pattern, whitespace-stripped.

    Raises AttributeError when the pattern does not match the line.
    """
    found = text.match(question)
    body = found.group(1)
    return body.strip()
# takes in 2 exams and tells you matching questions of both exams.
def matchExams(exam1, exam2):
    """Pair every question of exam1 with its counterpart in exam2.

    Both arguments are lists of question dicts with "number" and "question"
    keys. Returns a list of (number_in_exam1, result) tuples in exam1 order,
    where result is whatever findQuestionNum reports for the question text.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry['question']))
        for entry in exam1
    ]
# given an exam and a question, tells you what number it is in the exam
# if it can't find the question in the exam then returns a -1
def findQuestionNum(exam, question):
    """Look up a question text in a parsed exam.

    Returns the "number" of the first entry whose "question" equals the
    given text, or -1 when no entry matches.
    """
    hits = (entry['number'] for entry in exam if entry['question'] == question)
    return next(hits, -1)
###
### MAIN PROGRAM
###
# Runs at import time. In this version only the first exam is parsed and
# pretty-printed; the second parse and the matching step are commented out.
parsed1 = parseExam(file1)
# NOTE(review): the disabled line below passes `exam2`, while the path
# defined for this version is `file2` - confirm before re-enabling.
# parsed2 = parseExam(exam2)
# Pairing of exam 1 questions with exam 2 questions (disabled).
# Each tuple would be (question number in exam 1, question number in exam 2);
# a value of -1 means the question was not found in exam 2.
# matched = matchExams(parsed1, parsed2)
pp.pprint(parsed1)
# pp.pprint(parsed2)
# print(matched)
print("done")
from docx2python import docx2python
import re
import pprint
import os
# Module-level patterns and input paths for this version of the script.
# Lines are expected in "N) text" / "a) text" form, which is what `info`
# and `text` split on. `questions` now requires the literal ")" as well:
# the previous "." accepted any character, so a line such as "2018 was ..."
# was taken for a question. Raw strings keep "\d" and "\)" intact.
questions = re.compile(r'^\d{1,2}\)')  # line starts a question, e.g. "12) What is ..."
answerRe = re.compile(r'^[a-z]\)')  # line is an MCQ answer, e.g. "a) Paris"
info = re.compile(r'^(.*?)\).*')  # group 1: everything before the first ")" (number/letter)
text = re.compile(r'^.*?\)(.*)')  # group 1: everything after the first ")"
pp = pprint.PrettyPrinter(indent=4)  # pretty printer used for debugging output
file1 = 'data/I2Fall18T1A.docx'  # path to first exam
file2 = 'data/I2Fall18T1B.docx'  # path to second exam
# try parsing using different methods
def parseExam(file):
    """Parse an exam file into a list of question dicts.

    The file is read with `docx2python`, its text is passed through
    cleanexam and split into lines. Every line matching the module-level
    `questions` pattern becomes one dict with the keys "question"
    (question text), "number" and "answers" (the sorted list returned by
    getAnswers for that line).
    """
    lines = cleanexam(docx2python(file).text).splitlines()
    return [
        {
            "question": questionText(line),
            "number": questionNum(line),
            # Sorted so that answer order does not matter when comparing.
            "answers": sorted(getAnswers(position, lines)),
        }
        for position, line in enumerate(lines)
        if questions.match(line)
    ]
# doc = Document(file)
# pExam = []
# for index, paragraph in enumerate(doc.paragraphs): # go over each paragraph in the document
# print(paragraph.style)
# if re.match(questions, paragraph.text): # find all questions from the exam
# pExam.append({
# "question": questionText(paragraph.text),
# "line": index,
# "number": questionNum(paragraph.text)
# })
# return pExam
def getAnswers(i, arr):
    """Return the answer texts of the first answer run found after line i.

    Starting at the line after index `i`, lines are skipped until one
    matches the module-level `answerRe` pattern. The text of that line and
    of every directly following answer line (via questionText) is collected.

    Returns a list of answer texts. It is empty when no answer line follows
    index `i`; that case used to raise IndexError because the skip loop ran
    past the end of `arr`.

    NOTE(review): the skip loop does not stop at the next question line, so
    a question without answers picks up the answers of a later question -
    confirm whether that is acceptable.
    """
    total = len(arr)
    i += 1
    # Skip to the first answer line, but never past the end of the list.
    while i < total and not answerRe.match(arr[i]):
        i += 1
    answers = []
    # Collect the contiguous run of answer lines.
    while i < total and answerRe.match(arr[i]):
        answers.append(questionText(arr[i]))
        i += 1
    return answers
def cleanexam(text):
    """Drop all tab characters and empty lines from the exam text.

    Tabs are removed first, so a line holding only tabs is dropped too.
    The surviving lines are joined with `os.linesep`.
    """
    kept = filter(None, text.replace('\t', '').splitlines())
    return os.linesep.join(kept)
# given a question/answer returns its number
def questionNum(question):
    """Return group 1 of the module-level `info` pattern applied to the line.

    Raises AttributeError when the pattern does not match the line.
    """
    return info.match(question).group(1)
# given a question, returns its text
def questionText(question):
    """Return group 1 of the module-level `text` pattern, whitespace-stripped.

    Raises AttributeError when the pattern does not match the line.
    """
    found = text.match(question)
    body = found.group(1)
    return body.strip()
# takes in 2 exams and tells you matching questions of both exams.
def matchExams(exam1, exam2):
    """Pair every question of exam1 with its counterpart in exam2.

    Both arguments are lists of question dicts. Each whole dict from exam1
    is handed to findQuestionNum. Returns a list of
    (number_in_exam1, result) tuples in exam1 order, where result is
    whatever findQuestionNum reports.
    """
    return [
        (entry['number'], findQuestionNum(exam2, entry))
        for entry in exam1
    ]
# given an exam and a question, tells you what number it is in the exam
# if it can't find the question in the exam then returns a -1
def findQuestionNum(exam, question):
    """Look up a question dict in a parsed exam.

    An entry matches when both its "question" and its "answers" equal those
    of `question`. Returns the "number" of the first matching entry, or -1
    when no entry matches.
    """
    def same(candidate):
        # Answers are only compared once the question text already agrees.
        return (candidate['question'] == question['question']
                and candidate['answers'] == question['answers'])

    return next((entry['number'] for entry in exam if same(entry)), -1)
###
### MAIN PROGRAM
###
# Runs at import time. Parses both exam files into lists of question dicts.
parsed1 = parseExam(file1)
parsed2 = parseExam(file2)
# Pair the questions of exam 1 with those of exam 2.
# Each tuple is (question number in exam 1, question number in exam 2).
# A second value of -1 means the question was not found in exam 2.
matched = matchExams(parsed1, parsed2)
# Debug output of the parsed exams (disabled):
# pp.pprint(parsed1)
# pp.pprint(parsed2)
pp.pprint(matched)
print("done")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment