Skip to content

Instantly share code, notes, and snippets.

@iomz
Last active April 25, 2016 01:45
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iomz/4048e3011031d1cfed56 to your computer and use it in GitHub Desktop.
Save iomz/4048e3011031d1cfed56 to your computer and use it in GitHub Desktop.
Extracts grades from the comments on an SFC-SFS assignment page
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from re import match, search
from sys import exit, stdout
import csv
class HWParser(HTMLParser):
    """Collect grades and comments from an SFC-SFS assignment page.

    The parser is a flag-based state machine: ``handle_starttag`` and
    ``handle_endtag`` switch the ``in_*`` flags on and off, and
    ``handle_data`` uses those flags to decide what a piece of text means.

    Results are exposed through two attributes:

    * ``grades`` -- dict mapping a student id to ``[grade, comment]``.
    * ``hw_title`` -- first space-separated token of the text seen while
      both a ``<div class="ja">`` and an ``<h4 class="one">`` are open.
    """

    # cns id: 's' or 't', five digits, two lowercase letters (prefix match).
    CNS_ID_PATTERN = r"[st]\d{5}[a-z]{2}"
    # student id: text starting with eight digits (prefix match).
    STUDENT_ID_PATTERN = r"\d{8}"
    # grade: one of A-D or L with an optional sign, at the very end of the
    # accumulated comment text.
    GRADE_PATTERN = r"[A-DL][+\-]?$"

    def __init__(self):
        HTMLParser.__init__(self)
        self.in_table = False
        self.in_meta_tr = False  # Row telling whose submission it is
        self.in_answer_tr = False  # Answer and comment with grading
        self.in_meta_td = False
        self.in_answer_td = False
        self.in_comment_ul = False
        self.in_comment_li = False
        self.last_read_student_id = ''
        self.last_read_cns_id = ''
        self.last_read_grade = ''
        self.comment_str = ''
        self.grades = {}
        self.in_div_class_ja = False
        self.in_h4_class_one = False
        self.hw_title = ''

    def _to_native_str(self, data):
        """Return *data* as the interpreter's native ``str`` type.

        Text that is not already a ``str`` (i.e. Python 2 ``unicode``) is
        encoded as UTF-8.  A ``str`` is returned untouched, so byte strings
        on Python 2 are not pushed through an implicit ASCII decode and text
        on Python 3 is not turned into ``bytes`` that could no longer be
        concatenated with the ``' -> '`` separator.
        """
        if isinstance(data, str):
            return data
        return data.encode('utf-8')

    def handle_starttag(self, tag, attrs):
        """Update the state flags for an opening *tag* with *attrs*."""
        # Determine the submission table
        if tag == 'table' and ('cellspacing', '2') in attrs:
            self.in_table = True

        # Rows, cells and comment lists only matter inside that table
        if not self.in_table:
            pass
        elif tag == 'tr':
            # tr with bgcolor is the meta tr
            if ('bgcolor', '#e0e0e0') in attrs:
                self.in_meta_tr = True
            else:
                self.in_answer_tr = True
        elif tag == 'td':
            # The first td without attr contains cns id
            if self.in_meta_tr and len(attrs) == 0:
                self.in_meta_td = True
            # The td with attr rowspan="1" contains student id
            if self.in_meta_tr and ('rowspan', '1') in attrs:
                self.in_meta_td = True
            # Only one td in answer tr
            if self.in_answer_tr:
                self.in_answer_td = True
        elif tag == 'ul':
            self.in_comment_ul = True
            # Initialize the comment string and grade for multiline comments
            self.comment_str = ''
            self.last_read_grade = ''
        elif tag == 'li' and self.in_comment_ul:
            self.in_comment_li = True

        # The homework title is looked for regardless of the table state
        if tag == 'div' and ('class', 'ja') in attrs:
            self.in_div_class_ja = True
        elif tag == 'h4' and ('class', 'one') in attrs:
            self.in_h4_class_one = True

    def handle_data(self, data):
        """Interpret a text node according to the current state flags."""
        # cns id
        if self.in_meta_td and match(self.CNS_ID_PATTERN, data):
            self.last_read_cns_id = data
        # student id
        if self.in_meta_td and match(self.STUDENT_ID_PATTERN, data):
            self.last_read_student_id = data
            print(self.last_read_student_id)  # to check duplicates
        elif self.in_comment_li:
            text = self._to_native_str(data)
            if not self.comment_str:
                # First comment line of this list
                self.comment_str = text
            elif search(self.GRADE_PATTERN, self.comment_str):
                # The stored text already ends with a grade: put the new
                # line in front so the grade stays at the end.
                self.comment_str = text + ' -> ' + self.comment_str
            else:
                # No grade stored yet: simply append the new line
                self.comment_str = self.comment_str + text
            grade_match = search(self.GRADE_PATTERN, self.comment_str)
            if grade_match:
                self.last_read_grade = grade_match.group()
        # hw title
        if self.in_div_class_ja and self.in_h4_class_one:
            self.hw_title = self._to_native_str(data.split(u" ")[0])

    def handle_endtag(self, tag):
        """Update the state flags for a closing *tag*.

        Closing a comment list stores ``[grade, comment]`` under the most
        recently read student id.
        """
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr':
            if self.in_meta_tr:
                self.in_meta_tr = False
            else:
                self.in_answer_tr = False
        elif tag == 'td':
            if self.in_meta_td:
                self.in_meta_td = False
            else:
                self.in_answer_td = False
        elif tag == 'ul':
            # Only a list that was opened as a comment list may produce a
            # record.  A </ul> from elsewhere on the page would otherwise
            # store the previous student's grade and comment under whatever
            # student id was read last (or under an empty id).
            if self.in_comment_ul:
                self.grades[self.last_read_student_id] = [
                    self.last_read_grade,
                    self.comment_str,
                ]
            self.in_comment_ul = False
        elif tag == 'li':
            self.in_comment_li = False
        elif tag == 'div':
            self.in_div_class_ja = False
        elif tag == 'h4':
            self.in_h4_class_one = False
def main():
    """Convert ``SFC-SFS.html`` into a per-homework grade sheet.

    Reads ``SFC-SFS.html`` (decoded as ``euc-jisx0213``) and parses it with
    ``HWParser``.  Every student id listed in ``meibo.txt`` that the parser
    did not see is added with an empty grade and comment.  The result is
    written to ``hw<N>.csv``, where ``<N>`` is the first run of digits in the
    homework title.  A student with an empty grade is written out as ``'D'``.

    Exits with an error message (non-zero status) when the page cannot be
    parsed or when the homework title contains no number.
    """
    parser = HWParser()

    # Read the raw bytes and decode them explicitly; the handle is closed
    # as soon as the content is in memory.
    with open('SFC-SFS.html', 'rb') as html_file:
        html = html_file.read().decode('euc-jisx0213')
    try:
        parser.feed(html)
    except AssertionError as err:
        # Report the failure instead of quitting silently with status 0.
        exit('Failed to parse SFC-SFS.html: %s' % err)

    grades = parser.grades
    hw_title = parser.hw_title

    # Get the hw sequence number from title
    number_match = search(r"\d+", hw_title)
    if number_match is None:
        exit('No homework number found in title: %r' % hw_title)
    hw_number = number_match.group()

    # meibo.txt contains a full list of student ids; students missing from
    # the page are interpolated from that list with an empty record.
    with open('meibo.txt') as roster:
        for line in roster:
            student_id = line.strip()
            if student_id:
                grades.setdefault(student_id, ['', ''])

    # Write out the result to csv for a new sheet.  Binary mode is what the
    # Python 2 csv module, which this script targets, expects.
    with open('hw' + hw_number + '.csv', 'wb') as f:
        fields = ['学籍番号', hw_title, 'comment']
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        for student_id, record in sorted(grades.items()):
            if not student_id:
                # A record stored before any student id was read
                continue
            grade = record[0] if record[0] else 'D'
            writer.writerow({
                '学籍番号': student_id,
                hw_title: grade,
                'comment': record[1],
            })
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment