Skip to content

Instantly share code, notes, and snippets.

@mlouielu
Created November 28, 2017 17:48
Show Gist options
  • Save mlouielu/90edd2201440b2334ca17417c4f71d8f to your computer and use it in GitHub Desktop.
Save mlouielu/90edd2201440b2334ca17417c4f71d8f to your computer and use it in GitHub Desktop.
抓取勞動違規統計
# -*- coding: utf-8 -*-
import re
import requests
from collections import defaultdict
from lxml import etree
# Page URL template; the %d placeholder is filled with the numeric id passed
# to getit().
URL = 'https://jobhelper.g0v.ronny.tw/package/show/%d'
# Law-citation pattern. Alternatives are ordered from most to least specific
# (article+paragraph+subparagraph, article+paragraph, article only) so the
# longest form wins when several could match at the same position.
REGEX = r'第\d+條第\d+項第\d+款|第\d+條第\d+項|第\d+條'
def getit(id, timeout=30):
    """Tally the law citations found on one page of the site.

    Downloads ``URL % id``, selects table rows with the XPath below, and for
    each row takes the leading text of its third cell (everything before the
    first ``(``, with spaces removed). Every ``REGEX`` match in that text is
    counted.

    Args:
        id: Integer substituted into ``URL``. The name shadows the builtin
            but is kept so existing keyword callers keep working.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of ``(citation, count)`` tuples ordered by count, highest
        first. Rows with fewer than three cells, or whose third cell has no
        leading text, are skipped.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (previously an error page was parsed silently
            and produced an empty result).
    """
    response = requests.get(URL % id, timeout=timeout)
    response.raise_for_status()
    root = etree.HTML(response.text)
    rows = root.xpath('//table[@class="table"][1]/tbody/tr')
    counts = defaultdict(int)
    for row in rows:
        # Guard against short rows and cells whose content starts with a
        # child element (lxml reports .text as None in that case).
        if len(row) < 3 or row[2].text is None:
            continue
        cell_text = row[2].text.strip().split('(')[0].replace(' ', '').strip()
        for citation in re.findall(REGEX, cell_text):
            counts[citation] += 1
    return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
if __name__ == '__main__':
    # Fetch the tallies for id 1, print the grand total of all counts,
    # then one tab-separated "citation<TAB>count" line per entry.
    ranked = getit(1)
    print(sum(count for _, count in ranked))
    for citation, count in ranked:
        print("%s\t%s" % (citation, count))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment