# hulucc/main.py

## main.py
import urllib.request
import pprint
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect table rows marked ``class="alt"`` into a list of dicts.

    Every ``<tr class="alt">`` starts a new record that is appended to
    ``self.table``.  Inside such a row the first four attribute-less
    ``<td>`` cells are stored under the keys ``order``, ``name``,
    ``province`` and ``score`` (in that order).  Cells that carry
    attributes, further plain cells, and cells of rows without the
    ``alt`` class are ignored.  A cell's value is its first non-blank
    text chunk with surrounding whitespace removed.
    """

    # Keys assigned to successive attribute-less <td> cells of a row.
    COLUMNS = ("order", "name", "province", "score")

    def __init__(self):
        super().__init__()
        self.column = 0    # index into COLUMNS for the next plain <td>
        self.value = None  # never read here; kept so the attribute still exists
        self.key = None    # key waiting for text, None when no cell is tracked
        self.data = None   # record of the row being filled, None outside a row
        self.table = []    # every record collected so far

    def handle_starttag(self, tag, attrs):
        """Track row and cell boundaries.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
            attrs: List of ``(name, value)`` attribute pairs of the tag.
        """
        if tag == "tr":
            # Start every row from a clean state so a short or malformed
            # row cannot shift the column alignment of the rows after it.
            self.column = 0
            self.key = None
            if dict(attrs).get("class") == "alt":
                self.data = {}
                self.table.append(self.data)
            else:
                # Cells of other rows must not overwrite the previous record.
                self.data = None
            return
        if tag != "td":
            return
        # A new cell implicitly ends the previous one (</td> is optional).
        self.key = None
        if attrs or self.data is None or self.column >= len(self.COLUMNS):
            return
        self.key = self.COLUMNS[self.column]
        self.column += 1

    def handle_endtag(self, tag):
        """Stop tracking when a cell or row closes.

        Args:
            tag: Lower-cased tag name supplied by ``HTMLParser``.
        """
        if tag == "td":
            # An empty cell must not capture the text of a later cell.
            self.key = None
        elif tag == "tr":
            self.key = None
            self.data = None

    def handle_data(self, data):
        """Store the first non-blank text of the currently tracked cell.

        Args:
            data: Text chunk found between tags.
        """
        if not self.key or self.data is None:
            return
        text = data.strip()
        if not text:
            # Pure layout whitespace: keep waiting for the real content.
            return
        self.data[self.key] = text
        self.key = None


def main(
    province='广东',
    url='http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html',
):
    """Download the ranking page and pretty-print the rows of one province.

    Args:
        province: Value a row's ``province`` field must equal to be printed.
        url: Address of the page to fetch; its body is decoded as UTF-8.
    """
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf8')
    parser = MyHTMLParser()
    parser.feed(html)
    # Flush any text the parser is still buffering at end of input.
    parser.close()
    result = [row for row in parser.table if row.get('province') == province]
    pprint.pprint(result)


if __name__ == "__main__":
    main()
	import urllib.request
	import pprint
	from html.parser import HTMLParser

	class MyHTMLParser(HTMLParser):
		"""Collect table rows marked ``class="alt"`` into a list of dicts.

		Every ``<tr class="alt">`` starts a new record that is appended to
		``self.table``.  Inside such a row the first four attribute-less
		``<td>`` cells are stored under the keys ``order``, ``name``,
		``province`` and ``score`` (in that order).  Cells that carry
		attributes, further plain cells, and cells of rows without the
		``alt`` class are ignored.  A cell's value is its first non-blank
		text chunk with surrounding whitespace removed.
		"""

		# Keys assigned to successive attribute-less <td> cells of a row.
		COLUMNS = ("order", "name", "province", "score")

		def __init__(self):
			super().__init__()
			self.column = 0    # index into COLUMNS for the next plain <td>
			self.value = None  # never read here; kept so the attribute still exists
			self.key = None    # key waiting for text, None when no cell is tracked
			self.data = None   # record of the row being filled, None outside a row
			self.table = []    # every record collected so far

		def handle_starttag(self, tag, attrs):
			"""Track row and cell boundaries.

			Args:
				tag: Lower-cased tag name supplied by ``HTMLParser``.
				attrs: List of ``(name, value)`` attribute pairs of the tag.
			"""
			if tag == "tr":
				# Start every row from a clean state so a short or malformed
				# row cannot shift the column alignment of the rows after it.
				self.column = 0
				self.key = None
				if dict(attrs).get("class") == "alt":
					self.data = {}
					self.table.append(self.data)
				else:
					# Cells of other rows must not overwrite the previous record.
					self.data = None
				return
			if tag != "td":
				return
			# A new cell implicitly ends the previous one (</td> is optional).
			self.key = None
			if attrs or self.data is None or self.column >= len(self.COLUMNS):
				return
			self.key = self.COLUMNS[self.column]
			self.column += 1

		def handle_endtag(self, tag):
			"""Stop tracking when a cell or row closes.

			Args:
				tag: Lower-cased tag name supplied by ``HTMLParser``.
			"""
			if tag == "td":
				# An empty cell must not capture the text of a later cell.
				self.key = None
			elif tag == "tr":
				self.key = None
				self.data = None

		def handle_data(self, data):
			"""Store the first non-blank text of the currently tracked cell.

			Args:
				data: Text chunk found between tags.
			"""
			if not self.key or self.data is None:
				return
			text = data.strip()
			if not text:
				# Pure layout whitespace: keep waiting for the real content.
				return
			self.data[self.key] = text
			self.key = None


	def main(
		province='广东',
		url='http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html',
	):
		"""Download the ranking page and pretty-print the rows of one province.

		Args:
			province: Value a row's ``province`` field must equal to be printed.
			url: Address of the page to fetch; its body is decoded as UTF-8.
		"""
		# The context manager closes the HTTP response even if reading fails.
		with urllib.request.urlopen(url) as response:
			html = response.read().decode('utf8')
		parser = MyHTMLParser()
		parser.feed(html)
		# Flush any text the parser is still buffering at end of input.
		parser.close()
		result = [row for row in parser.table if row.get('province') == province]
		pprint.pprint(result)


	if __name__ == "__main__":
		main()