# iomz/sfs-hw-grade-extractor.py

## sfs-hw-grade-extractor.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from HTMLParser import HTMLParser
from re import match, search
from sys import exit, stdout
import csv

class HWParser(HTMLParser):
    """Collect grades and grader comments from an SFC-SFS submission page.

    Parsing state is kept in boolean flags that are switched on in
    ``handle_starttag`` and off in ``handle_endtag``.  Rows, cells and
    comment lists are only inspected inside the submission table
    (``<table cellspacing="2">``); the homework title is read from an
    ``<h4 class="one">`` inside a ``<div class="ja">``.

    After ``feed()``:

    * ``grades`` maps a student id to ``[grade, comment]``.
    * ``hw_title`` holds the UTF-8 encoded homework title.

    Collected text is encoded to UTF-8 as it is read, so the parser has to
    be fed decoded (unicode) text.
    """

    # A grade is one of A-D or L, optionally followed by + or -, at the end
    # of the accumulated comment text.
    GRADE_PATTERN = r"[A-DL][+\-]?$"

    def __init__(self):
        # Explicit base-class call (rather than super()) as in the original.
        HTMLParser.__init__(self)
        self.in_table = False
        self.in_meta_tr = False     # Row that says whose submission it is
        self.in_answer_tr = False   # Row with the answer, comments and grade
        self.in_meta_td = False
        self.in_answer_td = False
        self.in_comment_ul = False
        self.in_comment_li = False
        self.last_read_student_id = ''
        self.last_read_cns_id = ''
        self.last_read_grade = ''
        self.comment_str = ''
        self.grades = {}
        self.in_div_class_ja = False
        self.in_h4_class_one = False
        self.hw_title = ''

    def handle_starttag(self, tag, attrs):
        """Switch on the state flags that correspond to an opening tag."""
        # The submission table is recognised by its cellspacing attribute.
        if tag == 'table' and ('cellspacing', '2') in attrs:
            self.in_table = True

        # Rows, cells and comment lists only matter inside that table.
        if self.in_table:
            if tag == 'tr':
                # A row with the grey background is the meta row.
                if ('bgcolor', '#e0e0e0') in attrs:
                    self.in_meta_tr = True
                else:
                    self.in_answer_tr = True
            elif tag == 'td':
                # In the meta row, the attribute-less cell carries the cns id
                # and the rowspan="1" cell carries the student id.
                if self.in_meta_tr and (not attrs or ('rowspan', '1') in attrs):
                    self.in_meta_td = True
                # The answer row has a single cell.
                if self.in_answer_tr:
                    self.in_answer_td = True
            elif tag == 'ul':
                self.in_comment_ul = True
                # Start from scratch so multi-line comments of the previous
                # list do not leak into this one.
                self.comment_str = ''
                self.last_read_grade = ''
            elif tag == 'li' and self.in_comment_ul:
                self.in_comment_li = True

        # The title markers are checked regardless of the table state.
        if tag == 'div' and ('class', 'ja') in attrs:
            self.in_div_class_ja = True
        elif tag == 'h4' and ('class', 'one') in attrs:
            self.in_h4_class_one = True

    def handle_data(self, data):
        """Capture ids, comment text with its grade, and the homework title."""
        # cns id
        if self.in_meta_td and match(r"[st]\d{5}[a-z]{2}", data):
            self.last_read_cns_id = data
        # student id
        if self.in_meta_td and match(r"\d{8}", data):
            self.last_read_student_id = data
            # Echoed on stdout so duplicate submissions can be spotted.
            print(self.last_read_student_id)
        elif self.in_comment_li:
            self._append_comment(data.encode('utf-8'))
        # hw title: keep the part before the first full-width space
        if self.in_div_class_ja and self.in_h4_class_one:
            self.hw_title = data.split(u"　")[0].encode('utf-8')

    def _append_comment(self, text):
        """Merge one piece of comment text into ``comment_str`` and update the grade."""
        if not self.comment_str:
            # First piece of text in this comment list.
            self.comment_str = text
        elif search(self.GRADE_PATTERN, self.comment_str):
            # The stored text already ends with a grade: put the new text in
            # front so that grade stays at the end of the string.
            self.comment_str = text + ' -> ' + self.comment_str
        else:
            # No grade seen yet: keep appending.
            self.comment_str = self.comment_str + text
        found = search(self.GRADE_PATTERN, self.comment_str)
        if found:
            self.last_read_grade = found.group()

    def handle_endtag(self, tag):
        """Switch off state flags and store the result when a comment list ends."""
        if tag == 'table':
            self.in_table = False
        elif tag == 'tr':
            if self.in_meta_tr:
                self.in_meta_tr = False
            else:
                self.in_answer_tr = False
        elif tag == 'td':
            if self.in_meta_td:
                self.in_meta_td = False
            else:
                self.in_answer_td = False
        elif tag == 'ul':
            # Only a list that was opened as a comment list carries a result;
            # a </ul> belonging to unrelated markup must not create or
            # rewrite an entry.
            if self.in_comment_ul:
                self.grades[self.last_read_student_id] = [self.last_read_grade, self.comment_str]
                self.in_comment_ul = False
        elif tag == 'li':
            self.in_comment_li = False
        elif tag == 'div':
            self.in_div_class_ja = False
        elif tag == 'h4':
            self.in_h4_class_one = False

def main():
    """Extract the grades from ``SFC-SFS.html`` and write ``hw<N>.csv``.

    Reads the EUC-JISX0213 encoded page ``SFC-SFS.html``, parses it with
    ``HWParser``, adds every student id listed in ``meibo.txt`` that has no
    entry yet, and writes one CSV row per student id to ``hw<N>.csv``, where
    ``<N>`` is the first number found in the homework title.  Students
    without a grade are written with grade ``D``.

    Exits with an error message if the page cannot be parsed or if the
    homework title contains no number.
    """
    parser = HWParser()
    with open('SFC-SFS.html') as html_file:
        page = html_file.read().decode('euc-jisx0213')
    try:
        parser.feed(page)
    except AssertionError as err:
        # Report the failure instead of quitting silently with status 0.
        exit('Failed to parse SFC-SFS.html: %s' % err)

    grades = parser.grades
    hw_title = parser.hw_title
    # Get the hw sequence number from title
    number_match = search(r"\d+", hw_title)
    if number_match is None:
        exit('No homework number found in title: %r' % hw_title)
    hw_number = number_match.group()

    # meibo.txt contains the full list of student ids; students without a
    # submission are filled in with an empty grade and comment.
    with open('meibo.txt') as roster:
        for line in roster:
            student_id = line.strip()
            if student_id:
                grades.setdefault(student_id, ['', ''])

    # Write out the result to csv for a new sheet
    id_field = '学籍番号'  # "student id number"
    with open('hw' + hw_number + '.csv', 'wb') as f:
        writer = csv.DictWriter(f, fieldnames=[id_field, hw_title, 'comment'])
        writer.writeheader()
        for student_id, (grade, comment) in sorted(grades.iteritems()):
            if not student_id:
                # Entries recorded before any student id was seen.
                continue
            writer.writerow({
                id_field: student_id,
                hw_title: grade or 'D',  # no grade means D
                'comment': comment,
            })

# Run the extraction only when the file is executed as a script.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-
	from HTMLParser import HTMLParser
	from re import match, search
	from sys import exit, stdout
	import csv

	class HWParser(HTMLParser):
	    """Collect grades and grader comments from an SFC-SFS submission page.

	    Parsing state is kept in boolean flags that are switched on in
	    ``handle_starttag`` and off in ``handle_endtag``.  Rows, cells and
	    comment lists are only inspected inside the submission table
	    (``<table cellspacing="2">``); the homework title is read from an
	    ``<h4 class="one">`` inside a ``<div class="ja">``.

	    After ``feed()``:

	    * ``grades`` maps a student id to ``[grade, comment]``.
	    * ``hw_title`` holds the UTF-8 encoded homework title.

	    Collected text is encoded to UTF-8 as it is read, so the parser has to
	    be fed decoded (unicode) text.
	    """

	    # A grade is one of A-D or L, optionally followed by + or -, at the end
	    # of the accumulated comment text.
	    GRADE_PATTERN = r"[A-DL][+\-]?$"

	    def __init__(self):
	        # Explicit base-class call (rather than super()) as in the original.
	        HTMLParser.__init__(self)
	        self.in_table = False
	        self.in_meta_tr = False     # Row that says whose submission it is
	        self.in_answer_tr = False   # Row with the answer, comments and grade
	        self.in_meta_td = False
	        self.in_answer_td = False
	        self.in_comment_ul = False
	        self.in_comment_li = False
	        self.last_read_student_id = ''
	        self.last_read_cns_id = ''
	        self.last_read_grade = ''
	        self.comment_str = ''
	        self.grades = {}
	        self.in_div_class_ja = False
	        self.in_h4_class_one = False
	        self.hw_title = ''

	    def handle_starttag(self, tag, attrs):
	        """Switch on the state flags that correspond to an opening tag."""
	        # The submission table is recognised by its cellspacing attribute.
	        if tag == 'table' and ('cellspacing', '2') in attrs:
	            self.in_table = True

	        # Rows, cells and comment lists only matter inside that table.
	        if self.in_table:
	            if tag == 'tr':
	                # A row with the grey background is the meta row.
	                if ('bgcolor', '#e0e0e0') in attrs:
	                    self.in_meta_tr = True
	                else:
	                    self.in_answer_tr = True
	            elif tag == 'td':
	                # In the meta row, the attribute-less cell carries the cns id
	                # and the rowspan="1" cell carries the student id.
	                if self.in_meta_tr and (not attrs or ('rowspan', '1') in attrs):
	                    self.in_meta_td = True
	                # The answer row has a single cell.
	                if self.in_answer_tr:
	                    self.in_answer_td = True
	            elif tag == 'ul':
	                self.in_comment_ul = True
	                # Start from scratch so multi-line comments of the previous
	                # list do not leak into this one.
	                self.comment_str = ''
	                self.last_read_grade = ''
	            elif tag == 'li' and self.in_comment_ul:
	                self.in_comment_li = True

	        # The title markers are checked regardless of the table state.
	        if tag == 'div' and ('class', 'ja') in attrs:
	            self.in_div_class_ja = True
	        elif tag == 'h4' and ('class', 'one') in attrs:
	            self.in_h4_class_one = True

	    def handle_data(self, data):
	        """Capture ids, comment text with its grade, and the homework title."""
	        # cns id
	        if self.in_meta_td and match(r"[st]\d{5}[a-z]{2}", data):
	            self.last_read_cns_id = data
	        # student id
	        if self.in_meta_td and match(r"\d{8}", data):
	            self.last_read_student_id = data
	            # Echoed on stdout so duplicate submissions can be spotted.
	            print(self.last_read_student_id)
	        elif self.in_comment_li:
	            self._append_comment(data.encode('utf-8'))
	        # hw title: keep the part before the first full-width space
	        if self.in_div_class_ja and self.in_h4_class_one:
	            self.hw_title = data.split(u"　")[0].encode('utf-8')

	    def _append_comment(self, text):
	        """Merge one piece of comment text into ``comment_str`` and update the grade."""
	        if not self.comment_str:
	            # First piece of text in this comment list.
	            self.comment_str = text
	        elif search(self.GRADE_PATTERN, self.comment_str):
	            # The stored text already ends with a grade: put the new text in
	            # front so that grade stays at the end of the string.
	            self.comment_str = text + ' -> ' + self.comment_str
	        else:
	            # No grade seen yet: keep appending.
	            self.comment_str = self.comment_str + text
	        found = search(self.GRADE_PATTERN, self.comment_str)
	        if found:
	            self.last_read_grade = found.group()

	    def handle_endtag(self, tag):
	        """Switch off state flags and store the result when a comment list ends."""
	        if tag == 'table':
	            self.in_table = False
	        elif tag == 'tr':
	            if self.in_meta_tr:
	                self.in_meta_tr = False
	            else:
	                self.in_answer_tr = False
	        elif tag == 'td':
	            if self.in_meta_td:
	                self.in_meta_td = False
	            else:
	                self.in_answer_td = False
	        elif tag == 'ul':
	            # Only a list that was opened as a comment list carries a result;
	            # a </ul> belonging to unrelated markup must not create or
	            # rewrite an entry.
	            if self.in_comment_ul:
	                self.grades[self.last_read_student_id] = [self.last_read_grade, self.comment_str]
	                self.in_comment_ul = False
	        elif tag == 'li':
	            self.in_comment_li = False
	        elif tag == 'div':
	            self.in_div_class_ja = False
	        elif tag == 'h4':
	            self.in_h4_class_one = False

	def main():
	    """Extract the grades from ``SFC-SFS.html`` and write ``hw<N>.csv``.

	    Reads the EUC-JISX0213 encoded page ``SFC-SFS.html``, parses it with
	    ``HWParser``, adds every student id listed in ``meibo.txt`` that has no
	    entry yet, and writes one CSV row per student id to ``hw<N>.csv``, where
	    ``<N>`` is the first number found in the homework title.  Students
	    without a grade are written with grade ``D``.

	    Exits with an error message if the page cannot be parsed or if the
	    homework title contains no number.
	    """
	    parser = HWParser()
	    with open('SFC-SFS.html') as html_file:
	        page = html_file.read().decode('euc-jisx0213')
	    try:
	        parser.feed(page)
	    except AssertionError as err:
	        # Report the failure instead of quitting silently with status 0.
	        exit('Failed to parse SFC-SFS.html: %s' % err)

	    grades = parser.grades
	    hw_title = parser.hw_title
	    # Get the hw sequence number from title
	    number_match = search(r"\d+", hw_title)
	    if number_match is None:
	        exit('No homework number found in title: %r' % hw_title)
	    hw_number = number_match.group()

	    # meibo.txt contains the full list of student ids; students without a
	    # submission are filled in with an empty grade and comment.
	    with open('meibo.txt') as roster:
	        for line in roster:
	            student_id = line.strip()
	            if student_id:
	                grades.setdefault(student_id, ['', ''])

	    # Write out the result to csv for a new sheet
	    id_field = '学籍番号'  # "student id number"
	    with open('hw' + hw_number + '.csv', 'wb') as f:
	        writer = csv.DictWriter(f, fieldnames=[id_field, hw_title, 'comment'])
	        writer.writeheader()
	        for student_id, (grade, comment) in sorted(grades.iteritems()):
	            if not student_id:
	                # Entries recorded before any student id was seen.
	                continue
	            writer.writerow({
	                id_field: student_id,
	                hw_title: grade or 'D',  # no grade means D
	                'comment': comment,
	            })

	# Run the extraction only when the file is executed as a script.
	if __name__ == "__main__":
	    main()