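# Level 1: fetch the grade table from http://axe-level-1.herokuapp.com/ and
# stream it to stdout as a JSON array, one piece at a time as the HTML is parsed.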
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import urllib2
from HTMLParser import HTMLParser


def write(value=''):
    sys.stdout.write(str(value))


class MyHTMLParser(HTMLParser):
    # row counts finished <tr>s (0 while still inside the header row);
    # col tracks the current <td> index within a row.
    row, col = 0, 0

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            write('[')
        if self.row:
            if tag == 'tr':
                if self.row > 1:
                    write(', ')
                write('{')
            if tag == 'td':
                # JSON key prefix for each column of the grade table
                fm = ['"name": "', '"國語": ', '"數學": ', '"自然": ',
                      '"社會": ', '"健康教育": ']
                write(fm[self.col])

    def handle_endtag(self, tag):
        if tag == 'table':
            write(']')
        if self.row:
            if tag == 'td':
                if self.col == 0:
                    write('", "grades": {')  # close the name, open the grades object
                elif self.col == 5:
                    write('}')  # last grade closes the grades object
                else:
                    write(', ')
                self.col = self.col + 1
            if tag == 'tr':
                write('}')
                self.col = 0
        # skip header
        if tag == 'tr':
            self.row = self.row + 1

    def handle_data(self, data):
        if self.row:
            write(data.strip())


response = urllib2.urlopen('http://axe-level-1.herokuapp.com/')
html = response.read()
parser = MyHTMLParser()
parser.feed(html)
# [{"name": "王小明", "grades": {"國語": 90, "數學": 89, ...}}, ... ]
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
import sys
from HTMLParser import HTMLParser

AXE_URL = 'http://axe-level-1.herokuapp.com/lv2'


def get_page_raw_data(page_num=1):
    url = '%s/?page=%s' % (AXE_URL, page_num)
    response = urllib2.urlopen(url)
    html = response.read()
    return html


def write(value=''):
    sys.stdout.write(str(value))


class MyHTMLParser2(HTMLParser):
    row, col = 0, 0
    # JSON key prefix for each of the three columns
    fm = ['"town": "', '"village": "', '"name" : "']

    def handle_starttag(self, tag, attrs):
        if self.row:
            if tag == 'tr':
                if self.row > 1:
                    write(', ')
                write('{')
            if tag == 'td':
                write(self.fm[self.col])

    def handle_endtag(self, tag):
        if tag == 'table':
            # reset so the header row of the next page is skipped too
            self.row = 0
        if self.row:
            if tag == 'td':
                if self.col == 2:
                    write('"')
                else:
                    write('", ')
                self.col = self.col + 1
            if tag == 'tr':
                write('}')
                self.col = 0
        # skip header
        if tag == 'tr':
            self.row = self.row + 1

    def handle_data(self, data):
        if self.row:
            write(data.strip())


parser = MyHTMLParser2()
write('[')
for i in range(1, 13):
    html = get_page_raw_data(i)
    parser.feed(html)
    if i < 12:
        write(', ')
write(']')
# [{"town": "東區", "village": "東勢里", "name" : "林錦全"}, ...]
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
import cookielib
import sys
from HTMLParser import HTMLParser

AXE_URL = 'http://axe-level-1.herokuapp.com/lv3'

# install a cookie-aware opener so every urllib2 request shares the session
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)


def get_page_raw_data(axe_url='%s/?page=' % AXE_URL, page_num='next'):
    url = axe_url + page_num
    response = urllib2.urlopen(url)
    html = response.read()
    return html


def write(value=''):
    sys.stdout.write(str(value))


class MyHTMLParser2(HTMLParser):
    row, col = 0, 0
    fm = ['"town": "', '"village": "', '"name" : "']

    def handle_starttag(self, tag, attrs):
        if self.row:
            if tag == 'tr':
                if self.row > 1:
                    write(', ')
                write('{')
            if tag == 'td':
                write(self.fm[self.col])

    def handle_endtag(self, tag):
        if tag == 'table':
            self.row = 0
        if self.row:
            if tag == 'td':
                if self.col == 2:
                    write('"')
                else:
                    write('", ')
                self.col = self.col + 1
            if tag == 'tr':
                write('}')
                self.col = 0
        # skip header
        if tag == 'tr':
            self.row = self.row + 1

    def handle_data(self, data):
        if self.row:
            write(data.strip())


parser = MyHTMLParser2()
write('[')
# first request hits the landing page to obtain the session cookie
html = get_page_raw_data(axe_url=AXE_URL, page_num='')
parser.feed(html)
write(', ')
# then follow "?page=next" for the remaining 75 pages
for i in range(1, 76):
    html = get_page_raw_data()
    parser.feed(html)
    if i < 75:
        write(', ')
write(']')
# [{"town": "東區", "village": "東勢里", "name" : "林錦全"}, ...]
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib2
import cookielib
import sys
import random
import time
from HTMLParser import HTMLParser

AXE_URL = 'http://axe-level-4.herokuapp.com/lv4'
UA = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0'
REF = lambda num: '%s/?page=%s' % (AXE_URL, num)

cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
urllib2.install_opener(opener)


def get_page_raw_data(ref=None, axeurl='%s/?page=1' % AXE_URL):
    url = axeurl
    # pause 0-4 seconds so the requests look less like a bot
    time.sleep(random.random() * 2 ** 2)
    request = urllib2.Request(url)
    request.add_header('User-Agent', UA)
    if ref:
        request.add_header('Referer', ref)
    response = urllib2.urlopen(request)
    html = response.read()
    return html


def write(value=''):
    sys.stdout.write(str(value))


class MyHTMLParser4(HTMLParser):
    row, col = 0, 0
    fm = ['"town": "', '"village": "', '"name" : "']

    def handle_starttag(self, tag, attrs):
        if self.row:
            if tag == 'tr':
                if self.row > 1:
                    write(', ')
                write('{')
            if tag == 'td':
                write(self.fm[self.col])

    def handle_endtag(self, tag):
        if tag == 'table':
            self.row = 0
        if self.row:
            if tag == 'td':
                if self.col == 2:
                    write('"')
                else:
                    write('", ')
                self.col = self.col + 1
            if tag == 'tr':
                write('}')
                self.col = 0
        # skip header
        if tag == 'tr':
            self.row = self.row + 1

    def handle_data(self, data):
        if self.row:
            write(data.strip())


parser = MyHTMLParser4()
write('[')
# page 1: no Referer yet
ref = AXE_URL + '/'
html = get_page_raw_data(axeurl=ref)
parser.feed(html)
write(', ')
# page 2 is referred from the landing page
html = get_page_raw_data(ref, axeurl=REF(2))
parser.feed(html)
write(', ')
# pages 3-23, each referred from the previous page
for i in range(3, 24):
    html = get_page_raw_data(ref=REF(i - 1), axeurl=REF(i))
    parser.feed(html)
    write(', ')
# last page (24)
html = get_page_raw_data(ref=REF(23), axeurl=REF(24))
parser.feed(html)
write(']')