Skip to content

Instantly share code, notes, and snippets.

@yamachu
Created July 30, 2015 15:40
Show Gist options
  • Save yamachu/e74e9717e21ee6df6f94 to your computer and use it in GitHub Desktop.
Save yamachu/e74e9717e21ee6df6f94 to your computer and use it in GitHub Desktop.
Parse the weight table from the Tohoku Univ. ECEI local page
# -*- coding: utf-8 -*-
import lxml.html
import sys
def getText(element):
    """Return the text of *element* without padding or full-width spaces.

    Every full-width space is removed wherever it occurs, then leading and
    trailing whitespace is stripped. Other interior whitespace is kept.

    Args:
        element: Any object exposing ``text_content()`` (for example an
            lxml HTML element) that returns the node's text.

    Returns:
        The cleaned text as the interpreter's native ``str`` type: a UTF-8
        encoded byte string on Python 2 (as before), a text string on
        Python 3.
    """
    # U+3000 IDEOGRAPHIC SPACE (the "Zenkaku" space) is spelled as an escape:
    # the literal character looks exactly like an ASCII space and is easily
    # lost when the file is copied or re-encoded.
    text = element.text_content().replace(u'\u3000', u'').strip()
    # The clean-up is done on text rather than on encoded bytes, because
    # bytes.replace() with str arguments raises TypeError on Python 3.
    if str is bytes:
        # Python 2: keep handing back a UTF-8 byte string.
        text = text.encode('utf-8')
    return text
# Input: path to the saved local-page HTML (weighttable1x.html), given as the
# first command-line argument.
# The file is read in binary mode so lxml receives the raw bytes and handles
# the character encoding itself instead of relying on the locale default.
# The with-block closes the handle even when reading fails.
with open(sys.argv[1], 'rb') as f:
    lines = f.read()
root = lxml.html.fromstring(lines)
tables = root.cssselect('table')

# Every table except the last one is treated as a weight table (the original
# note here read "only numbers"); the last table is handled separately below.
tableHeader = []   # cell texts of the most recently seen header row
moduleWeight = {}  # name -> cell texts of the odd row, first cell excluded
meanWeight = {}    # name -> cell texts of the even row that follows it
labName = []       # names in the order they were encountered
name = ''
for table in tables[:-1]:
    for i, tr in enumerate(table.cssselect('tr')):
        cells = tr.cssselect('td')
        if i == 0 or not tableHeader:
            # First row of each table (or any row while no header has been
            # captured yet) is stored as the header.
            tableHeader = [getText(cell) for cell in cells]
        elif i % 2 == 1:
            # Odd rows start a new entry: the first cell is the name, the
            # remaining cells are its values.
            name = getText(cells[0])
            labName.append(name)
            moduleWeight[name] = [getText(cell) for cell in cells[1:]]
        else:
            # Even rows belong to the name read from the preceding odd row.
            meanWeight[name] = [getText(cell) for cell in cells]
# Disabled debug dump (Python 2 print syntax), kept as an inert string.
'''
for name in labName:
print moduleWeight[name]
print meanWeight[name]
'''
# Subject list: taken from the last table on the page.
# TODO (from the original note): spellings such as "II" vs "Ⅱ" and "I" vs "Ⅰ"
# still have to be reconciled.
subjectList = tables[-1]
subjectHeader = []         # cell texts of the first row
subjectSet = ''            # first-cell text of the row being processed
subjectNameAndNumber = {}  # first-cell text -> whitespace-separated tokens of the second cell
subjectSetList = []        # first-cell texts in row order
for rowIndex, row in enumerate(subjectList.cssselect('tr')):
    cells = row.cssselect('td')
    if rowIndex == 0:
        # The first row is the header; nothing else to do for it.
        subjectHeader = [getText(cell) for cell in cells]
        continue
    subjectSet = getText(cells[0])
    subjectSetList.append(subjectSet)
    entries = getText(cells[1]).split()
    subjectNameAndNumber[subjectSet] = [entry.strip() for entry in entries]
# Disabled debug dump (Python 2 print syntax), kept as an inert string.
'''
for subjectSetName in subjectSetList:
print subjectNameAndNumber[subjectSetName]
'''
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment